// SPDX-License-Identifier: GPL-2.0
/*
 * PCI Peer 2 Peer DMA support.
 *
 * Copyright (c) 2016-2018, Logan Gunthorpe
 * Copyright (c) 2016-2017, Microsemi Corporation
 * Copyright (c) 2017, Christoph Hellwig
 * Copyright (c) 2018, Eideticom Inc.
 */

#include <linux/pci-p2pdma.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/genalloc.h>
#include <linux/memremap.h>
#include <linux/percpu-refcount.h>
#include <linux/random.h>
#include <linux/seq_buf.h>

struct pci_p2pdma {
	struct percpu_ref devmap_ref;
	struct completion devmap_ref_done;
	struct gen_pool *pool;
	bool p2pmem_published;
};

static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
{
	struct pci_p2pdma *p2p =
		container_of(ref, struct pci_p2pdma, devmap_ref);

	complete_all(&p2p->devmap_ref_done);
}

static void pci_p2pdma_percpu_kill(void *data)
{
	struct percpu_ref *ref = data;

	/*
	 * pci_p2pdma_add_resource() may be called multiple times
	 * by a driver and may register the percpu_kill devm action multiple
	 * times. We only want the first action to actually kill the
	 * percpu_ref.
	 */
	if (percpu_ref_is_dying(ref))
		return;

	percpu_ref_kill(ref);
}

static void pci_p2pdma_release(void *data)
{
	struct pci_dev *pdev = data;

	if (!pdev->p2pdma)
		return;

	wait_for_completion(&pdev->p2pdma->devmap_ref_done);
	percpu_ref_exit(&pdev->p2pdma->devmap_ref);

	gen_pool_destroy(pdev->p2pdma->pool);
	pdev->p2pdma = NULL;
}

static int pci_p2pdma_setup(struct pci_dev *pdev)
{
	int error = -ENOMEM;
	struct pci_p2pdma *p2p;

	p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL);
	if (!p2p)
		return -ENOMEM;

	p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev));
	if (!p2p->pool)
		goto out;

	init_completion(&p2p->devmap_ref_done);
	error = percpu_ref_init(&p2p->devmap_ref,
			pci_p2pdma_percpu_release, 0, GFP_KERNEL);
	if (error)
		goto out_pool_destroy;

	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev);
	if (error)
		goto out_pool_destroy;

	pdev->p2pdma = p2p;

	return 0;

out_pool_destroy:
	gen_pool_destroy(p2p->pool);
out:
	devm_kfree(&pdev->dev, p2p);
	return error;
}

/**
 * pci_p2pdma_add_resource - add memory for use as p2p memory
 * @pdev: the device to add the memory to
 * @bar: PCI BAR to add
 * @size: size of the memory to add, may be zero to use the whole BAR
 * @offset: offset into the PCI BAR
 *
 * The memory will be given ZONE_DEVICE struct pages so that it may
 * be used with any DMA request.
 */
int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
			    u64 offset)
{
	struct dev_pagemap *pgmap;
	void *addr;
	int error;

	if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
		return -EINVAL;

	if (offset >= pci_resource_len(pdev, bar))
		return -EINVAL;

	if (!size)
		size = pci_resource_len(pdev, bar) - offset;

	if (size + offset > pci_resource_len(pdev, bar))
		return -EINVAL;

	if (!pdev->p2pdma) {
		error = pci_p2pdma_setup(pdev);
		if (error)
			return error;
	}

	pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL);
	if (!pgmap)
		return -ENOMEM;

	pgmap->res.start = pci_resource_start(pdev, bar) + offset;
	pgmap->res.end = pgmap->res.start + size - 1;
	pgmap->res.flags = pci_resource_flags(pdev, bar);
	pgmap->ref = &pdev->p2pdma->devmap_ref;
	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;

	addr = devm_memremap_pages(&pdev->dev, pgmap);
	if (IS_ERR(addr)) {
		error = PTR_ERR(addr);
		goto pgmap_free;
	}

	error = gen_pool_add_virt(pdev->p2pdma->pool, (unsigned long)addr,
			pci_bus_address(pdev, bar) + offset,
			resource_size(&pgmap->res), dev_to_node(&pdev->dev));
	if (error)
		goto pgmap_free;

	error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_percpu_kill,
					 &pdev->p2pdma->devmap_ref);
	if (error)
		goto pgmap_free;

	pci_info(pdev, "added peer-to-peer DMA memory %pR\n",
		 &pgmap->res);

	return 0;

pgmap_free:
	devm_kfree(&pdev->dev, pgmap);
	return error;
}
EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
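
/*
 * Usage sketch (illustrative only, not part of the kernel): a provider
 * driver would typically donate BAR memory from its probe() callback. The
 * driver name and BAR number below are made-up values; size == 0 donates
 * the whole BAR.
 *
 *	static int foo_probe(struct pci_dev *pdev,
 *			     const struct pci_device_id *id)
 *	{
 *		int error;
 *
 *		error = pci_p2pdma_add_resource(pdev, 4, 0, 0);
 *		if (error)
 *			return error;
 *		...
 *	}
 */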

/*
 * Note this function returns the parent PCI device with a
 * reference taken. It is the caller's responsibility to drop
 * the reference.
 */
static struct pci_dev *find_parent_pci_dev(struct device *dev)
{
	struct device *parent;

	dev = get_device(dev);

	while (dev) {
		if (dev_is_pci(dev))
			return to_pci_dev(dev);

		parent = get_device(dev->parent);
		put_device(dev);
		dev = parent;
	}

	return NULL;
}

/*
 * Check if a PCI bridge has its ACS redirection bits set to redirect P2P
 * TLPs upstream via ACS. Returns 1 if the packets will be redirected
 * upstream, 0 otherwise.
 */
static int pci_bridge_has_acs_redir(struct pci_dev *pdev)
{
	int pos;
	u16 ctrl;

	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ACS);
	if (!pos)
		return 0;

	pci_read_config_word(pdev, pos + PCI_ACS_CTRL, &ctrl);

	if (ctrl & (PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC))
		return 1;

	return 0;
}

static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev)
{
	if (!buf)
		return;

	seq_buf_printf(buf, "%s;", pci_name(pdev));
}

/*
 * Find the distance through the nearest common upstream bridge between
 * two PCI devices.
 *
 * If the two devices are the same device then 0 will be returned.
 *
 * If there are two virtual functions of the same device behind the same
 * bridge port then 2 will be returned (one step down to the PCIe switch,
 * then one step back to the same device).
 *
 * In the case where two devices are connected to the same PCIe switch, the
 * value 4 will be returned. This corresponds to the following PCI tree:
 *
 *     -+  Root Port
 *      \+ Switch Upstream Port
 *       +-+ Switch Downstream Port
 *       + \- Device A
 *       \-+ Switch Downstream Port
 *         \- Device B
 *
 * The distance is 4 because we traverse from Device A through the downstream
 * port of the switch, to the common upstream port, back up to the second
 * downstream port and then to Device B.
 *
 * Any two devices that don't have a common upstream bridge will return -1.
 * In this way devices on separate PCIe root ports will be rejected, which
 * is what we want for peer-to-peer because each PCIe root port defines a
 * separate hierarchy domain and there's no way to determine whether the root
 * complex supports forwarding between them.
 *
 * In the case where two devices are connected to different PCIe switches,
 * this function will still return a positive distance as long as both
 * switches eventually have a common upstream bridge. Note this covers
 * the case of using multiple PCIe switches to achieve a desired level of
 * fan-out from a root port. The exact distance will be a function of the
 * number of switches between Device A and Device B.
 *
 * If a bridge which has any ACS redirection bits set is in the path
 * then this function will return -2. This is so we reject any
 * cases where the TLPs are forwarded up into the root complex.
 * In this case, a list of all infringing bridge addresses will be
 * populated in acs_list (assuming it's non-null) for printk purposes.
 */
static int upstream_bridge_distance(struct pci_dev *a,
				    struct pci_dev *b,
				    struct seq_buf *acs_list)
{
	int dist_a = 0;
	int dist_b = 0;
	struct pci_dev *bb = NULL;
	int acs_cnt = 0;

	/*
	 * Note, we don't need to take references to devices returned by
	 * pci_upstream_bridge() because we hold a reference to a child
	 * device which will already hold a reference to the upstream bridge.
	 */

	while (a) {
		dist_b = 0;

		if (pci_bridge_has_acs_redir(a)) {
			seq_buf_print_bus_devfn(acs_list, a);
			acs_cnt++;
		}

		bb = b;

		while (bb) {
			if (a == bb)
				goto check_b_path_acs;

			bb = pci_upstream_bridge(bb);
			dist_b++;
		}

		a = pci_upstream_bridge(a);
		dist_a++;
	}

	return -1;

check_b_path_acs:
	bb = b;

	while (bb) {
		if (a == bb)
			break;

		if (pci_bridge_has_acs_redir(bb)) {
			seq_buf_print_bus_devfn(acs_list, bb);
			acs_cnt++;
		}

		bb = pci_upstream_bridge(bb);
	}

	if (acs_cnt)
		return -2;

	return dist_a + dist_b;
}

static int upstream_bridge_distance_warn(struct pci_dev *provider,
					 struct pci_dev *client)
{
	struct seq_buf acs_list;
	int ret;

	seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
	if (!acs_list.buffer)
		return -ENOMEM;

	ret = upstream_bridge_distance(provider, client, &acs_list);
	if (ret == -2) {
		pci_warn(client, "cannot be used for peer-to-peer DMA as ACS redirect is set between the client and provider (%s)\n",
			 pci_name(provider));
		/* Drop final semicolon */
		acs_list.buffer[acs_list.len-1] = 0;
		pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n",
			 acs_list.buffer);

	} else if (ret < 0) {
		pci_warn(client, "cannot be used for peer-to-peer DMA as the client and provider (%s) do not share an upstream bridge\n",
			 pci_name(provider));
	}

	kfree(acs_list.buffer);

	return ret;
}

/**
 * pci_p2pdma_distance_many - Determine the cumulative distance between
 *	a p2pdma provider and the clients in use.
 * @provider: p2pdma provider to check against the client list
 * @clients: array of devices to check
 * @num_clients: number of clients in the array
 * @verbose: if true, print warnings for devices when we return -1
 *
 * Returns -1 if any of the clients are not compatible with the provider
 * (i.e. not behind the same root port), otherwise returns a positive
 * number where a lower number is the preferable choice. (If there's one
 * client that's the same as the provider it will return 0, which is the
 * best choice).
 *
 * For now, "compatible" means the provider and the clients are all behind
 * the same PCI root port. This cuts out cases that may work but is safest
 * for the user. Future work can expand this to white-list root complexes
 * that can safely forward between their ports.
 */
int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients,
			     int num_clients, bool verbose)
{
	bool not_supported = false;
	struct pci_dev *pci_client;
	int distance = 0;
	int i, ret;

	if (num_clients == 0)
		return -1;

	for (i = 0; i < num_clients; i++) {
		pci_client = find_parent_pci_dev(clients[i]);
		if (!pci_client) {
			if (verbose)
				dev_warn(clients[i],
					 "cannot be used for peer-to-peer DMA as it is not a PCI device\n");
			return -1;
		}

		if (verbose)
			ret = upstream_bridge_distance_warn(provider,
							    pci_client);
		else
			ret = upstream_bridge_distance(provider, pci_client,
						       NULL);

		pci_dev_put(pci_client);

		if (ret < 0)
			not_supported = true;

		if (not_supported && !verbose)
			break;

		distance += ret;
	}

	if (not_supported)
		return -1;

	return distance;
}
EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many);
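
/*
 * Example (an illustrative sketch, not part of the kernel): a driver with
 * a known provider and an array of prospective client devices could vet
 * the whole group before committing to peer-to-peer transactions.
 * "provider", "clients" and "num_clients" are hypothetical names.
 *
 *	int dist;
 *
 *	dist = pci_p2pdma_distance_many(provider, clients, num_clients, true);
 *	if (dist < 0)
 *		return -EINVAL;		(fall back to host memory instead)
 */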

/**
 * pci_has_p2pmem - check if a given PCI device has published any p2pmem
 * @pdev: PCI device to check
 */
bool pci_has_p2pmem(struct pci_dev *pdev)
{
	return pdev->p2pdma && pdev->p2pdma->p2pmem_published;
}
EXPORT_SYMBOL_GPL(pci_has_p2pmem);

/**
 * pci_p2pmem_find_many - find a peer-to-peer DMA memory device compatible
 *	with the specified list of clients and shortest distance (as
 *	determined by pci_p2pdma_distance_many())
 * @clients: array of devices to check
 * @num_clients: number of client devices in the list
 *
 * If multiple devices are behind the same switch, the one "closest" to the
 * client devices in use will be chosen first. (So if one of the providers is
 * the same as one of the clients, that provider will be used ahead of any
 * other providers that are unrelated). If multiple providers are an equal
 * distance away, one will be chosen at random.
 *
 * Returns a pointer to the PCI device with a reference taken (use pci_dev_put
 * to return the reference) or NULL if no compatible device is found. The
 * found provider will also be assigned to the client list.
 */
struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients)
{
	struct pci_dev *pdev = NULL;
	int distance;
	int closest_distance = INT_MAX;
	struct pci_dev **closest_pdevs;
	int dev_cnt = 0;
	const int max_devs = PAGE_SIZE / sizeof(*closest_pdevs);
	int i;

	closest_pdevs = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!closest_pdevs)
		return NULL;

	while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev))) {
		if (!pci_has_p2pmem(pdev))
			continue;

		distance = pci_p2pdma_distance_many(pdev, clients,
						    num_clients, false);
		if (distance < 0 || distance > closest_distance)
			continue;

		if (distance == closest_distance && dev_cnt >= max_devs)
			continue;

		if (distance < closest_distance) {
			for (i = 0; i < dev_cnt; i++)
				pci_dev_put(closest_pdevs[i]);

			dev_cnt = 0;
			closest_distance = distance;
		}

		closest_pdevs[dev_cnt++] = pci_dev_get(pdev);
	}

	if (dev_cnt)
		pdev = pci_dev_get(closest_pdevs[prandom_u32_max(dev_cnt)]);

	for (i = 0; i < dev_cnt; i++)
		pci_dev_put(closest_pdevs[i]);

	kfree(closest_pdevs);
	return pdev;
}
EXPORT_SYMBOL_GPL(pci_p2pmem_find_many);
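
/*
 * Example (an illustrative sketch, not part of the kernel): select the
 * closest published provider for a set of clients and drop the device
 * reference when the memory is no longer needed. "clients", "num_clients"
 * and "p2p_dev" are hypothetical names.
 *
 *	struct pci_dev *p2p_dev;
 *
 *	p2p_dev = pci_p2pmem_find_many(clients, num_clients);
 *	if (!p2p_dev)
 *		return -ENODEV;		(no compatible provider found)
 *
 *	... allocate and use memory via pci_alloc_p2pmem(p2p_dev, ...) ...
 *
 *	pci_dev_put(p2p_dev);
 */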

/**
 * pci_alloc_p2pmem - allocate peer-to-peer DMA memory
 * @pdev: the device to allocate memory from
 * @size: number of bytes to allocate
 *
 * Returns the allocated memory or NULL on error.
 */
void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size)
{
	void *ret;

	if (unlikely(!pdev->p2pdma))
		return NULL;

	if (unlikely(!percpu_ref_tryget_live(&pdev->p2pdma->devmap_ref)))
		return NULL;

	ret = (void *)gen_pool_alloc(pdev->p2pdma->pool, size);

	if (unlikely(!ret))
		percpu_ref_put(&pdev->p2pdma->devmap_ref);

	return ret;
}
EXPORT_SYMBOL_GPL(pci_alloc_p2pmem);

/**
 * pci_free_p2pmem - free peer-to-peer DMA memory
 * @pdev: the device the memory was allocated from
 * @addr: address of the memory that was allocated
 * @size: number of bytes that were allocated
 */
void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size)
{
	gen_pool_free(pdev->p2pdma->pool, (uintptr_t)addr, size);
	percpu_ref_put(&pdev->p2pdma->devmap_ref);
}
EXPORT_SYMBOL_GPL(pci_free_p2pmem);
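
/*
 * Example (an illustrative sketch, not part of the kernel): allocate a
 * buffer from a provider, translate it to a bus address for programming a
 * peer device's DMA engine, then free it. foo_program_dma() is a
 * hypothetical stand-in for device-specific code.
 *
 *	void *buf;
 *	pci_bus_addr_t bus_addr;
 *
 *	buf = pci_alloc_p2pmem(p2p_dev, SZ_4K);
 *	if (!buf)
 *		return -ENOMEM;
 *
 *	bus_addr = pci_p2pmem_virt_to_bus(p2p_dev, buf);
 *	foo_program_dma(peer_dev, bus_addr, SZ_4K);
 *
 *	pci_free_p2pmem(p2p_dev, buf, SZ_4K);
 */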

/**
 * pci_p2pmem_virt_to_bus - return the PCI bus address for a given virtual
 *	address obtained with pci_alloc_p2pmem()
 * @pdev: the device the memory was allocated from
 * @addr: address of the memory that was allocated
 */
pci_bus_addr_t pci_p2pmem_virt_to_bus(struct pci_dev *pdev, void *addr)
{
	if (!addr)
		return 0;
	if (!pdev->p2pdma)
		return 0;

	/*
	 * Note: when we added the memory to the pool we used the PCI
	 * bus address as the physical address. So gen_pool_virt_to_phys()
	 * actually returns the bus address despite the misleading name.
	 */
	return gen_pool_virt_to_phys(pdev->p2pdma->pool, (unsigned long)addr);
}
EXPORT_SYMBOL_GPL(pci_p2pmem_virt_to_bus);

/**
 * pci_p2pmem_alloc_sgl - allocate peer-to-peer DMA memory in a scatterlist
 * @pdev: the device to allocate memory from
 * @nents: the number of SG entries in the list
 * @length: number of bytes to allocate
 *
 * Returns the allocated scatterlist or NULL on error.
 */
struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev,
					 unsigned int *nents, u32 length)
{
	struct scatterlist *sg;
	void *addr;

	sg = kzalloc(sizeof(*sg), GFP_KERNEL);
	if (!sg)
		return NULL;

	sg_init_table(sg, 1);

	addr = pci_alloc_p2pmem(pdev, length);
	if (!addr)
		goto out_free_sg;

	sg_set_buf(sg, addr, length);
	*nents = 1;
	return sg;

out_free_sg:
	kfree(sg);
	return NULL;
}
EXPORT_SYMBOL_GPL(pci_p2pmem_alloc_sgl);
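
/*
 * Example (an illustrative sketch, not part of the kernel): wrap a p2p
 * allocation in a single-entry scatterlist and release it with the
 * matching helper. "p2p_dev" is a hypothetical provider found earlier.
 *
 *	unsigned int nents;
 *	struct scatterlist *sgl;
 *
 *	sgl = pci_p2pmem_alloc_sgl(p2p_dev, &nents, SZ_64K);
 *	if (!sgl)
 *		return -ENOMEM;
 *
 *	... map the sgl for DMA and perform the transfer ...
 *
 *	pci_p2pmem_free_sgl(p2p_dev, sgl);
 */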

/**
 * pci_p2pmem_free_sgl - free a scatterlist allocated by pci_p2pmem_alloc_sgl()
 * @pdev: the device the memory was allocated from
 * @sgl: the allocated scatterlist
 */
void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl)
{
	struct scatterlist *sg;
	int count;

	for_each_sg(sgl, sg, INT_MAX, count) {
		if (!sg)
			break;

		pci_free_p2pmem(pdev, sg_virt(sg), sg->length);
	}
	kfree(sgl);
}
EXPORT_SYMBOL_GPL(pci_p2pmem_free_sgl);

/**
 * pci_p2pmem_publish - publish the peer-to-peer DMA memory for use by
 *	other devices with pci_p2pmem_find()
 * @pdev: the device with peer-to-peer DMA memory to publish
 * @publish: set to true to publish the memory, false to unpublish it
 *
 * Published memory can be used by other PCI device drivers for
 * peer-to-peer DMA operations. Non-published memory is reserved for
 * exclusive use of the device driver that registers the peer-to-peer
 * memory.
 */
void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
{
	if (pdev->p2pdma)
		pdev->p2pdma->p2pmem_published = publish;
}
EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
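
/*
 * Publishing sketch (illustrative only, not part of the kernel): a provider
 * driver that wants other drivers to find its memory via pci_p2pmem_find()
 * would publish after registering the resource, and unpublish on teardown.
 * The BAR number here is a made-up value for illustration.
 *
 *	error = pci_p2pdma_add_resource(pdev, 4, 0, 0);
 *	if (error)
 *		return error;
 *
 *	pci_p2pmem_publish(pdev, true);
 */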