// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2019 HabanaLabs, Ltd.
 * All Rights Reserved.
 *
 */

Tomer Tayare00dac32019-04-10 15:18:46 +03009#define pr_fmt(fmt) "habanalabs: " fmt
10
Oded Gabbayc4d66342019-02-16 00:39:11 +020011#include "habanalabs.h"
12
13#include <linux/pci.h>
14#include <linux/module.h>
15
16#define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
17
18#define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators"
19
20MODULE_AUTHOR(HL_DRIVER_AUTHOR);
21MODULE_DESCRIPTION(HL_DRIVER_DESC);
22MODULE_LICENSE("GPL v2");
23
24static int hl_major;
25static struct class *hl_class;
Oded Gabbay5e6e0232019-02-27 12:15:16 +020026static DEFINE_IDR(hl_devs_idr);
27static DEFINE_MUTEX(hl_devs_idr_lock);
Oded Gabbayc4d66342019-02-16 00:39:11 +020028
Oded Gabbayeff6f4a2019-02-16 00:39:21 +020029static int timeout_locked = 5;
30static int reset_on_lockup = 1;
31
32module_param(timeout_locked, int, 0444);
33MODULE_PARM_DESC(timeout_locked,
34 "Device lockup timeout in seconds (0 = disabled, default 5s)");
35
36module_param(reset_on_lockup, int, 0444);
37MODULE_PARM_DESC(reset_on_lockup,
38 "Do device reset on lockup (0 = no, 1 = yes, default yes)");
39
Oded Gabbayc4d66342019-02-16 00:39:11 +020040#define PCI_VENDOR_ID_HABANALABS 0x1da3
41
42#define PCI_IDS_GOYA 0x0001
43
44static const struct pci_device_id ids[] = {
45 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
46 { 0, }
47};
48MODULE_DEVICE_TABLE(pci, ids);
49
50/*
51 * get_asic_type - translate device id to asic type
52 *
53 * @device: id of the PCI device
54 *
55 * Translate device id to asic type.
56 * In case of unidentified device, return -1
57 */
58static enum hl_asic_type get_asic_type(u16 device)
59{
60 enum hl_asic_type asic_type;
61
62 switch (device) {
63 case PCI_IDS_GOYA:
64 asic_type = ASIC_GOYA;
65 break;
66 default:
67 asic_type = ASIC_INVALID;
68 break;
69 }
70
71 return asic_type;
72}
73
74/*
75 * hl_device_open - open function for habanalabs device
76 *
77 * @inode: pointer to inode structure
78 * @filp: pointer to file structure
79 *
80 * Called when process opens an habanalabs device.
81 */
82int hl_device_open(struct inode *inode, struct file *filp)
83{
84 struct hl_device *hdev;
85 struct hl_fpriv *hpriv;
Oded Gabbay0861e412019-02-16 00:39:14 +020086 int rc;
Oded Gabbayc4d66342019-02-16 00:39:11 +020087
88 mutex_lock(&hl_devs_idr_lock);
89 hdev = idr_find(&hl_devs_idr, iminor(inode));
90 mutex_unlock(&hl_devs_idr_lock);
91
92 if (!hdev) {
93 pr_err("Couldn't find device %d:%d\n",
94 imajor(inode), iminor(inode));
95 return -ENXIO;
96 }
97
98 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
Oded Gabbayeb7caf82019-07-30 11:56:09 +030099 if (!hpriv)
100 return -ENOMEM;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200101
102 hpriv->hdev = hdev;
103 filp->private_data = hpriv;
104 hpriv->filp = filp;
Oded Gabbayeff6f4a2019-02-16 00:39:21 +0200105 mutex_init(&hpriv->restore_phase_mutex);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200106 kref_init(&hpriv->refcount);
107 nonseekable_open(inode, filp);
108
Oded Gabbaybe5d9262019-02-16 00:39:15 +0200109 hl_cb_mgr_init(&hpriv->cb_mgr);
Oded Gabbay0861e412019-02-16 00:39:14 +0200110 hl_ctx_mgr_init(&hpriv->ctx_mgr);
111
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300112 hpriv->taskpid = find_get_pid(current->pid);
113
114 mutex_lock(&hdev->fpriv_list_lock);
115
116 if (hl_device_disabled_or_in_reset(hdev)) {
117 dev_err_ratelimited(hdev->dev,
118 "Can't open %s because it is disabled or in reset\n",
119 dev_name(hdev->dev));
120 rc = -EPERM;
Oded Gabbay0861e412019-02-16 00:39:14 +0200121 goto out_err;
122 }
123
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300124 if (hdev->in_debug) {
125 dev_err_ratelimited(hdev->dev,
126 "Can't open %s because it is being debugged by another user\n",
127 dev_name(hdev->dev));
128 rc = -EPERM;
129 goto out_err;
130 }
Oded Gabbayc4d66342019-02-16 00:39:11 +0200131
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300132 if (hdev->compute_ctx) {
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300133 dev_dbg_ratelimited(hdev->dev,
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300134 "Can't open %s because another user is working on it\n",
135 dev_name(hdev->dev));
136 rc = -EBUSY;
137 goto out_err;
138 }
139
140 rc = hl_ctx_create(hdev, hpriv);
141 if (rc) {
142 dev_err(hdev->dev, "Failed to create context %d\n", rc);
143 goto out_err;
144 }
145
146 /* Device is IDLE at this point so it is legal to change PLLs.
147 * There is no need to check anything because if the PLL is
148 * already HIGH, the set function will return without doing
149 * anything
Oded Gabbayd91389b2019-02-16 00:39:19 +0200150 */
151 hl_device_set_frequency(hdev, PLL_HIGH);
152
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300153 list_add(&hpriv->dev_node, &hdev->fpriv_list);
154 mutex_unlock(&hdev->fpriv_list_lock);
155
Oded Gabbayc2164772019-02-16 00:39:24 +0200156 hl_debugfs_add_file(hpriv);
157
Oded Gabbayc4d66342019-02-16 00:39:11 +0200158 return 0;
Oded Gabbay0861e412019-02-16 00:39:14 +0200159
160out_err:
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300161 mutex_unlock(&hdev->fpriv_list_lock);
Oded Gabbay0861e412019-02-16 00:39:14 +0200162
Oded Gabbayeb7caf82019-07-30 11:56:09 +0300163 hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
164 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
165 filp->private_data = NULL;
166 mutex_destroy(&hpriv->restore_phase_mutex);
167 put_pid(hpriv->taskpid);
168
169 kfree(hpriv);
Oded Gabbay0861e412019-02-16 00:39:14 +0200170 return rc;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200171}
172
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300173int hl_device_open_ctrl(struct inode *inode, struct file *filp)
174{
175 struct hl_device *hdev;
176 struct hl_fpriv *hpriv;
177 int rc;
178
179 mutex_lock(&hl_devs_idr_lock);
180 hdev = idr_find(&hl_devs_idr, iminor(inode));
181 mutex_unlock(&hl_devs_idr_lock);
182
183 if (!hdev) {
184 pr_err("Couldn't find device %d:%d\n",
185 imajor(inode), iminor(inode));
186 return -ENXIO;
187 }
188
189 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
190 if (!hpriv)
191 return -ENOMEM;
192
193 mutex_lock(&hdev->fpriv_list_lock);
194
195 if (hl_device_disabled_or_in_reset(hdev)) {
196 dev_err_ratelimited(hdev->dev_ctrl,
197 "Can't open %s because it is disabled or in reset\n",
198 dev_name(hdev->dev_ctrl));
199 rc = -EPERM;
200 goto out_err;
201 }
202
203 list_add(&hpriv->dev_node, &hdev->fpriv_list);
204 mutex_unlock(&hdev->fpriv_list_lock);
205
206 hpriv->hdev = hdev;
207 filp->private_data = hpriv;
208 hpriv->filp = filp;
209 hpriv->is_control = true;
210 nonseekable_open(inode, filp);
211
212 hpriv->taskpid = find_get_pid(current->pid);
213
214 return 0;
215
216out_err:
217 mutex_unlock(&hdev->fpriv_list_lock);
218 kfree(hpriv);
219 return rc;
220}
221
Oded Gabbay8c173dc2019-05-08 09:55:23 +0300222static void set_driver_behavior_per_device(struct hl_device *hdev)
223{
224 hdev->mmu_enable = 1;
225 hdev->cpu_enable = 1;
226 hdev->fw_loading = 1;
227 hdev->cpu_queues_enable = 1;
228 hdev->heartbeat = 1;
229
230 hdev->reset_pcilink = 0;
231}
232
Oded Gabbayc4d66342019-02-16 00:39:11 +0200233/*
234 * create_hdev - create habanalabs device instance
235 *
236 * @dev: will hold the pointer to the new habanalabs device structure
237 * @pdev: pointer to the pci device
238 * @asic_type: in case of simulator device, which device is it
239 * @minor: in case of simulator device, the minor of the device
240 *
241 * Allocate memory for habanalabs device and initialize basic fields
242 * Identify the ASIC type
243 * Allocate ID (minor) for the device (only for real devices)
244 */
245int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
246 enum hl_asic_type asic_type, int minor)
247{
248 struct hl_device *hdev;
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300249 int rc, main_id, ctrl_id = 0;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200250
251 *dev = NULL;
252
253 hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
254 if (!hdev)
255 return -ENOMEM;
256
Oded Gabbay8c173dc2019-05-08 09:55:23 +0300257 /* First, we must find out which ASIC are we handling. This is needed
258 * to configure the behavior of the driver (kernel parameters)
259 */
Oded Gabbay29593842019-04-04 14:33:34 +0300260 if (pdev) {
Oded Gabbayc4d66342019-02-16 00:39:11 +0200261 hdev->asic_type = get_asic_type(pdev->device);
262 if (hdev->asic_type == ASIC_INVALID) {
263 dev_err(&pdev->dev, "Unsupported ASIC\n");
264 rc = -ENODEV;
265 goto free_hdev;
266 }
267 } else {
268 hdev->asic_type = asic_type;
269 }
270
Oded Gabbay8c173dc2019-05-08 09:55:23 +0300271 hdev->major = hl_major;
272 hdev->reset_on_lockup = reset_on_lockup;
273 hdev->pldm = 0;
274
275 set_driver_behavior_per_device(hdev);
276
277 if (timeout_locked)
278 hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
279 else
280 hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
281
282 hdev->disabled = true;
283 hdev->pdev = pdev; /* can be NULL in case of simulator device */
284
Oded Gabbayd9973872019-03-07 18:03:23 +0200285 /* Set default DMA mask to 32 bits */
286 hdev->dma_mask = 32;
287
Oded Gabbayc4d66342019-02-16 00:39:11 +0200288 mutex_lock(&hl_devs_idr_lock);
289
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300290 /* Always save 2 numbers, 1 for main device and 1 for control.
291 * They must be consecutive
292 */
293 main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
Oded Gabbayc4d66342019-02-16 00:39:11 +0200294 GFP_KERNEL);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200295
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300296 if (main_id >= 0)
297 ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
298 main_id + 2, GFP_KERNEL);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200299
300 mutex_unlock(&hl_devs_idr_lock);
301
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300302 if ((main_id < 0) || (ctrl_id < 0)) {
303 if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
Oded Gabbayc4d66342019-02-16 00:39:11 +0200304 pr_err("too many devices in the system\n");
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300305
306 if (main_id >= 0) {
307 mutex_lock(&hl_devs_idr_lock);
308 idr_remove(&hl_devs_idr, main_id);
309 mutex_unlock(&hl_devs_idr_lock);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200310 }
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300311
312 rc = -EBUSY;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200313 goto free_hdev;
314 }
315
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300316 hdev->id = main_id;
317 hdev->id_control = ctrl_id;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200318
319 *dev = hdev;
320
321 return 0;
322
323free_hdev:
324 kfree(hdev);
325 return rc;
326}
327
328/*
329 * destroy_hdev - destroy habanalabs device instance
330 *
331 * @dev: pointer to the habanalabs device structure
332 *
333 */
334void destroy_hdev(struct hl_device *hdev)
335{
336 /* Remove device from the device list */
337 mutex_lock(&hl_devs_idr_lock);
338 idr_remove(&hl_devs_idr, hdev->id);
Oded Gabbay4d6a7752019-07-30 09:10:50 +0300339 idr_remove(&hl_devs_idr, hdev->id_control);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200340 mutex_unlock(&hl_devs_idr_lock);
341
342 kfree(hdev);
343}
344
/*
 * hl_pmops_suspend - suspend callback of the driver's PM operations
 *
 * @dev: pointer to the device structure
 *
 * Return the result of hl_device_suspend(), or 0 if the device has no
 * habanalabs device structure attached to it
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to suspend PCI device\n");

	if (hdev)
		return hl_device_suspend(hdev);

	/* A missing device structure is reported but is not a failure */
	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
358
/*
 * hl_pmops_resume - resume callback of the driver's PM operations
 *
 * @dev: pointer to the device structure
 *
 * Return the result of hl_device_resume(), or 0 if the device has no
 * habanalabs device structure attached to it
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev = dev_get_drvdata(dev);

	pr_debug("Going to resume PCI device\n");

	if (hdev)
		return hl_device_resume(hdev);

	/* A missing device structure is reported but is not a failure */
	pr_err("device pointer is NULL in resume\n");
	return 0;
}
372
373/*
374 * hl_pci_probe - probe PCI habanalabs devices
375 *
376 * @pdev: pointer to pci device
377 * @id: pointer to pci device id structure
378 *
379 * Standard PCI probe function for habanalabs device.
380 * Create a new habanalabs device and initialize it according to the
381 * device's type
382 */
383static int hl_pci_probe(struct pci_dev *pdev,
384 const struct pci_device_id *id)
385{
386 struct hl_device *hdev;
387 int rc;
388
389 dev_info(&pdev->dev, HL_NAME
390 " device found [%04x:%04x] (rev %x)\n",
391 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
392
Oded Gabbay29593842019-04-04 14:33:34 +0300393 rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
Oded Gabbayc4d66342019-02-16 00:39:11 +0200394 if (rc)
395 return rc;
396
397 pci_set_drvdata(pdev, hdev);
398
399 rc = hl_device_init(hdev, hl_class);
400 if (rc) {
401 dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
402 rc = -ENODEV;
403 goto disable_device;
404 }
405
406 return 0;
407
408disable_device:
409 pci_set_drvdata(pdev, NULL);
410 destroy_hdev(hdev);
411
412 return rc;
413}
414
415/*
416 * hl_pci_remove - remove PCI habanalabs devices
417 *
418 * @pdev: pointer to pci device
419 *
420 * Standard PCI remove function for habanalabs device
421 */
422static void hl_pci_remove(struct pci_dev *pdev)
423{
424 struct hl_device *hdev;
425
426 hdev = pci_get_drvdata(pdev);
427 if (!hdev)
428 return;
429
430 hl_device_fini(hdev);
431 pci_set_drvdata(pdev, NULL);
432
433 destroy_hdev(hdev);
434}
435
436static const struct dev_pm_ops hl_pm_ops = {
437 .suspend = hl_pmops_suspend,
438 .resume = hl_pmops_resume,
439};
440
441static struct pci_driver hl_pci_driver = {
442 .name = HL_NAME,
443 .id_table = ids,
444 .probe = hl_pci_probe,
445 .remove = hl_pci_remove,
446 .driver.pm = &hl_pm_ops,
447};
448
449/*
450 * hl_init - Initialize the habanalabs kernel driver
451 */
452static int __init hl_init(void)
453{
454 int rc;
455 dev_t dev;
456
457 pr_info("loading driver\n");
458
459 rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
460 if (rc < 0) {
461 pr_err("unable to get major\n");
462 return rc;
463 }
464
465 hl_major = MAJOR(dev);
466
467 hl_class = class_create(THIS_MODULE, HL_NAME);
468 if (IS_ERR(hl_class)) {
469 pr_err("failed to allocate class\n");
470 rc = PTR_ERR(hl_class);
471 goto remove_major;
472 }
473
Oded Gabbayc2164772019-02-16 00:39:24 +0200474 hl_debugfs_init();
475
Oded Gabbayc4d66342019-02-16 00:39:11 +0200476 rc = pci_register_driver(&hl_pci_driver);
477 if (rc) {
478 pr_err("failed to register pci device\n");
Oded Gabbayc2164772019-02-16 00:39:24 +0200479 goto remove_debugfs;
Oded Gabbayc4d66342019-02-16 00:39:11 +0200480 }
481
482 pr_debug("driver loaded\n");
483
484 return 0;
485
Oded Gabbayc2164772019-02-16 00:39:24 +0200486remove_debugfs:
487 hl_debugfs_fini();
Oded Gabbayc4d66342019-02-16 00:39:11 +0200488 class_destroy(hl_class);
489remove_major:
490 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
491 return rc;
492}
493
494/*
495 * hl_exit - Release all resources of the habanalabs kernel driver
496 */
497static void __exit hl_exit(void)
498{
499 pci_unregister_driver(&hl_pci_driver);
500
Oded Gabbayc2164772019-02-16 00:39:24 +0200501 /*
502 * Removing debugfs must be after all devices or simulator devices
503 * have been removed because otherwise we get a bug in the
504 * debugfs module for referencing NULL objects
505 */
506 hl_debugfs_fini();
507
Oded Gabbayc4d66342019-02-16 00:39:11 +0200508 class_destroy(hl_class);
509 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
510
511 idr_destroy(&hl_devs_idr);
512
513 pr_debug("driver removed\n");
514}
515
/* Entry and exit points of the module */
module_init(hl_init);
module_exit(hl_exit);