/*
 * Intel MIC Platform Software Stack (MPSS)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 * * Neither the name of Intel Corporation nor the names of its
 *   contributors may be used to endorse or promote products derived
 *   from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Intel SCIF driver.
 *
 */
#ifndef __SCIF_H__
#define __SCIF_H__

#include <linux/types.h>
#include <linux/poll.h>
#include <linux/scif_ioctl.h>
59
/* Flag for scif_accept(): block until a connection request is presented. */
#define SCIF_ACCEPT_SYNC 1
/* Flag for scif_send(): block until the entire message is sent. */
#define SCIF_SEND_BLOCK 1
/* Flag for scif_recv(): block until the entire message is received. */
#define SCIF_RECV_BLOCK 1
63
/* Protection flags for the prot_flags argument of scif_register(). */
enum {
	SCIF_PROT_READ = (1 << 0),	/* allow read operations from the window */
	SCIF_PROT_WRITE = (1 << 1)	/* allow write operations to the window */
};
68
/* Mapping flags for the map_flags argument of scif_register(). */
enum {
	/* use the caller-supplied offset exactly as the window offset */
	SCIF_MAP_FIXED = 0x10,
	/* NOTE(review): not described in this part of the header - confirm use */
	SCIF_MAP_KERNEL = 0x20,
};
73
/*
 * Fence and signal flag bits. Bits 2 and 3 are not assigned here.
 * NOTE(review): presumably consumed by the scif_fence_*() APIs, which are
 * declared outside this part of the header - confirm the exact semantics.
 */
enum {
	SCIF_FENCE_INIT_SELF = (1 << 0),
	SCIF_FENCE_INIT_PEER = (1 << 1),
	SCIF_SIGNAL_LOCAL = (1 << 4),
	SCIF_SIGNAL_REMOTE = (1 << 5)
};
80
/* Transfer mode flags for the rma_flags argument of the SCIF RMA calls. */
enum {
	/* perform the transfer using the CPU, otherwise use the DMA engine */
	SCIF_RMA_USECPU = (1 << 0),
	/* NOTE(review): not described in this part of the header - confirm use */
	SCIF_RMA_USECACHE = (1 << 1),
	/* perform the transfer synchronously, returning after it has completed */
	SCIF_RMA_SYNC = (1 << 2),
	/*
	 * make the last (partial) cacheline of the source range visible on the
	 * destination only after all other transferred data is visible
	 */
	SCIF_RMA_ORDERED = (1 << 3)
};
87
/* End of SCIF Admin Reserved Ports */
#define SCIF_ADMIN_PORT_END 1024

/* End of SCIF Reserved Ports */
#define SCIF_PORT_RSVD 1088
93
/*
 * SCIF endpoint descriptor: a pointer to struct scif_endpt, which is not
 * defined in this header. Every SCIF call below takes one as its epd argument.
 */
typedef struct scif_endpt *scif_epd_t;
95
Ashutosh Dixitb7f94442015-09-29 18:10:44 -070096/**
97 * struct scif_pollepd - SCIF endpoint to be monitored via scif_poll
98 * @epd: SCIF endpoint
99 * @events: requested events
100 * @revents: returned events
101 */
102struct scif_pollepd {
103 scif_epd_t epd;
104 short events;
105 short revents;
106};
107
/* Failure sentinels returned in user mode by the calls named below. */
#define SCIF_OPEN_FAILED ((scif_epd_t)-1)	/* scif_open() */
#define SCIF_REGISTER_FAILED ((off_t)-1)	/* scif_register() */
/* NOTE(review): the call returning this is outside this part of the header */
#define SCIF_MMAP_FAILED ((void *)-1)
111
112/**
113 * scif_open() - Create an endpoint
114 *
115 * Return:
116 * Upon successful completion, scif_open() returns an endpoint descriptor to
117 * be used in subsequent SCIF functions calls to refer to that endpoint;
118 * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is
119 * returned and errno is set to indicate the error; in kernel mode a NULL
120 * scif_epd_t is returned.
121 *
122 * Errors:
123 * ENOMEM - Insufficient kernel memory was available
124 */
125scif_epd_t scif_open(void);
126
127/**
128 * scif_bind() - Bind an endpoint to a port
129 * @epd: endpoint descriptor
130 * @pn: port number
131 *
132 * scif_bind() binds endpoint epd to port pn, where pn is a port number on the
133 * local node. If pn is zero, a port number greater than or equal to
134 * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to
135 * exactly one local port. Ports less than 1024 when requested can only be bound
136 * by system (or root) processes or by processes executed by privileged users.
137 *
138 * Return:
139 * Upon successful completion, scif_bind() returns the port number to which epd
140 * is bound; otherwise in user mode -1 is returned and errno is set to
141 * indicate the error; in kernel mode the negative of one of the following
142 * errors is returned.
143 *
144 * Errors:
145 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
146 * EINVAL - the endpoint or the port is already bound
147 * EISCONN - The endpoint is already connected
148 * ENOSPC - No port number available for assignment
149 * EACCES - The port requested is protected and the user is not the superuser
150 */
151int scif_bind(scif_epd_t epd, u16 pn);
152
153/**
154 * scif_listen() - Listen for connections on an endpoint
155 * @epd: endpoint descriptor
156 * @backlog: maximum pending connection requests
157 *
158 * scif_listen() marks the endpoint epd as a listening endpoint - that is, as
159 * an endpoint that will be used to accept incoming connection requests. Once
160 * so marked, the endpoint is said to be in the listening state and may not be
161 * used as the endpoint of a connection.
162 *
163 * The endpoint, epd, must have been bound to a port.
164 *
165 * The backlog argument defines the maximum length to which the queue of
166 * pending connections for epd may grow. If a connection request arrives when
167 * the queue is full, the client may receive an error with an indication that
168 * the connection was refused.
169 *
170 * Return:
171 * Upon successful completion, scif_listen() returns 0; otherwise in user mode
172 * -1 is returned and errno is set to indicate the error; in kernel mode the
173 * negative of one of the following errors is returned.
174 *
175 * Errors:
176 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
177 * EINVAL - the endpoint is not bound to a port
178 * EISCONN - The endpoint is already connected or listening
179 */
180int scif_listen(scif_epd_t epd, int backlog);
181
182/**
183 * scif_connect() - Initiate a connection on a port
184 * @epd: endpoint descriptor
185 * @dst: global id of port to which to connect
186 *
187 * The scif_connect() function requests the connection of endpoint epd to remote
188 * port dst. If the connection is successful, a peer endpoint, bound to dst, is
189 * created on node dst.node. On successful return, the connection is complete.
190 *
191 * If the endpoint epd has not already been bound to a port, scif_connect()
192 * will bind it to an unused local port.
193 *
194 * A connection is terminated when an endpoint of the connection is closed,
195 * either explicitly by scif_close(), or when a process that owns one of the
196 * endpoints of the connection is terminated.
197 *
198 * In user space, scif_connect() supports an asynchronous connection mode
199 * if the application has set the O_NONBLOCK flag on the endpoint via the
200 * fcntl() system call. Setting this flag will result in the calling process
201 * not to wait during scif_connect().
202 *
203 * Return:
204 * Upon successful completion, scif_connect() returns the port ID to which the
205 * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is
206 * set to indicate the error; in kernel mode the negative of one of the
207 * following errors is returned.
208 *
209 * Errors:
210 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
211 * ECONNREFUSED - The destination was not listening for connections or refused
212 * the connection request
213 * EINVAL - dst.port is not a valid port ID
214 * EISCONN - The endpoint is already connected
215 * ENOMEM - No buffer space is available
216 * ENODEV - The destination node does not exist, or the node is lost or existed,
217 * but is not currently in the network since it may have crashed
218 * ENOSPC - No port number available for assignment
219 * EOPNOTSUPP - The endpoint is listening and cannot be connected
220 */
221int scif_connect(scif_epd_t epd, struct scif_port_id *dst);
222
223/**
224 * scif_accept() - Accept a connection on an endpoint
225 * @epd: endpoint descriptor
226 * @peer: global id of port to which connected
227 * @newepd: new connected endpoint descriptor
228 * @flags: flags
229 *
230 * The scif_accept() call extracts the first connection request from the queue
231 * of pending connections for the port on which epd is listening. scif_accept()
232 * creates a new endpoint, bound to the same port as epd, and allocates a new
233 * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new
234 * endpoint is connected to the endpoint through which the connection was
235 * requested. epd is unaffected by this call, and remains in the listening
236 * state.
237 *
238 * On successful return, peer holds the global port identifier (node id and
239 * local port number) of the port which requested the connection.
240 *
241 * A connection is terminated when an endpoint of the connection is closed,
242 * either explicitly by scif_close(), or when a process that owns one of the
243 * endpoints of the connection is terminated.
244 *
245 * The number of connections that can (subsequently) be accepted on epd is only
246 * limited by system resources (memory).
247 *
248 * The flags argument is formed by OR'ing together zero or more of the
249 * following values.
250 * SCIF_ACCEPT_SYNC - block until a connection request is presented. If
251 * SCIF_ACCEPT_SYNC is not in flags, and no pending
252 * connections are present on the queue, scif_accept()
253 * fails with an EAGAIN error
254 *
255 * In user mode, the select() and poll() functions can be used to determine
256 * when there is a connection request. In kernel mode, the scif_poll()
257 * function may be used for this purpose. A readable event will be delivered
258 * when a connection is requested.
259 *
260 * Return:
261 * Upon successful completion, scif_accept() returns 0; otherwise in user mode
262 * -1 is returned and errno is set to indicate the error; in kernel mode the
263 * negative of one of the following errors is returned.
264 *
265 * Errors:
266 * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be
267 * accepted or SCIF_ACCEPT_SYNC is not set and remote node failed to complete
268 * its connection request
269 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
270 * EINTR - Interrupted function
271 * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is
272 * NULL, or newepd is NULL
273 * ENODEV - The requesting node is lost or existed, but is not currently in the
274 * network since it may have crashed
275 * ENOMEM - Not enough space
276 * ENOENT - Secondary part of epd registration failed
277 */
278int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t
279 *newepd, int flags);
280
281/**
282 * scif_close() - Close an endpoint
283 * @epd: endpoint descriptor
284 *
285 * scif_close() closes an endpoint and performs necessary teardown of
286 * facilities associated with that endpoint.
287 *
288 * If epd is a listening endpoint then it will no longer accept connection
289 * requests on the port to which it is bound. Any pending connection requests
290 * are rejected.
291 *
292 * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs
293 * which are in-process through epd or its peer endpoint will complete before
294 * scif_close() returns. Registered windows of the local and peer endpoints are
295 * released as if scif_unregister() was called against each window.
296 *
297 * Closing a SCIF endpoint does not affect local registered memory mapped by
298 * a SCIF endpoint on a remote node. The local memory remains mapped by the peer
299 * SCIF endpoint explicitly removed by calling munmap(..) by the peer.
300 *
301 * If the peer endpoint's receive queue is not empty at the time that epd is
302 * closed, then the peer endpoint can be passed as the endpoint parameter to
303 * scif_recv() until the receive queue is empty.
304 *
305 * epd is freed and may no longer be accessed.
306 *
307 * Return:
308 * Upon successful completion, scif_close() returns 0; otherwise in user mode
309 * -1 is returned and errno is set to indicate the error; in kernel mode the
310 * negative of one of the following errors is returned.
311 *
312 * Errors:
313 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
314 */
315int scif_close(scif_epd_t epd);
316
317/**
318 * scif_send() - Send a message
319 * @epd: endpoint descriptor
320 * @msg: message buffer address
321 * @len: message length
322 * @flags: blocking mode flags
323 *
324 * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data
325 * are copied from memory starting at address msg. On successful execution the
326 * return value of scif_send() is the number of bytes that were sent, and is
327 * zero if no bytes were sent because len was zero. scif_send() may be called
328 * only when the endpoint is in a connected state.
329 *
330 * If a scif_send() call is non-blocking, then it sends only those bytes which
331 * can be sent without waiting, up to a maximum of len bytes.
332 *
333 * If a scif_send() call is blocking, then it normally returns after sending
334 * all len bytes. If a blocking call is interrupted or the connection is
335 * reset, the call is considered successful if some bytes were sent or len is
336 * zero, otherwise the call is considered unsuccessful.
337 *
338 * In user mode, the select() and poll() functions can be used to determine
339 * when the send queue is not full. In kernel mode, the scif_poll() function
340 * may be used for this purpose.
341 *
342 * It is recommended that scif_send()/scif_recv() only be used for short
343 * control-type message communication between SCIF endpoints. The SCIF RMA
344 * APIs are expected to provide better performance for transfer sizes of
345 * 1024 bytes or longer for the current MIC hardware and software
346 * implementation.
347 *
348 * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK
349 * is passed as the flags argument.
350 *
351 * Return:
352 * Upon successful completion, scif_send() returns the number of bytes sent;
353 * otherwise in user mode -1 is returned and errno is set to indicate the
354 * error; in kernel mode the negative of one of the following errors is
355 * returned.
356 *
357 * Errors:
358 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
359 * ECONNRESET - Connection reset by peer
360 * EFAULT - An invalid address was specified for a parameter
361 * EINVAL - flags is invalid, or len is negative
362 * ENODEV - The remote node is lost or existed, but is not currently in the
363 * network since it may have crashed
364 * ENOMEM - Not enough space
365 * ENOTCONN - The endpoint is not connected
366 */
367int scif_send(scif_epd_t epd, void *msg, int len, int flags);
368
369/**
370 * scif_recv() - Receive a message
371 * @epd: endpoint descriptor
372 * @msg: message buffer address
373 * @len: message buffer length
374 * @flags: blocking mode flags
375 *
376 * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of
377 * data are copied to memory starting at address msg. On successful execution
378 * the return value of scif_recv() is the number of bytes that were received,
379 * and is zero if no bytes were received because len was zero. scif_recv() may
380 * be called only when the endpoint is in a connected state.
381 *
382 * If a scif_recv() call is non-blocking, then it receives only those bytes
383 * which can be received without waiting, up to a maximum of len bytes.
384 *
385 * If a scif_recv() call is blocking, then it normally returns after receiving
386 * all len bytes. If the blocking call was interrupted due to a disconnection,
387 * subsequent calls to scif_recv() will copy all bytes received upto the point
388 * of disconnection.
389 *
390 * In user mode, the select() and poll() functions can be used to determine
391 * when data is available to be received. In kernel mode, the scif_poll()
392 * function may be used for this purpose.
393 *
394 * It is recommended that scif_send()/scif_recv() only be used for short
395 * control-type message communication between SCIF endpoints. The SCIF RMA
396 * APIs are expected to provide better performance for transfer sizes of
397 * 1024 bytes or longer for the current MIC hardware and software
398 * implementation.
399 *
400 * scif_recv() will block until the entire message is received if
401 * SCIF_RECV_BLOCK is passed as the flags argument.
402 *
403 * Return:
404 * Upon successful completion, scif_recv() returns the number of bytes
405 * received; otherwise in user mode -1 is returned and errno is set to
406 * indicate the error; in kernel mode the negative of one of the following
407 * errors is returned.
408 *
409 * Errors:
410 * EAGAIN - The destination node is returning from a low power state
411 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
412 * ECONNRESET - Connection reset by peer
413 * EFAULT - An invalid address was specified for a parameter
414 * EINVAL - flags is invalid, or len is negative
415 * ENODEV - The remote node is lost or existed, but is not currently in the
416 * network since it may have crashed
417 * ENOMEM - Not enough space
418 * ENOTCONN - The endpoint is not connected
419 */
420int scif_recv(scif_epd_t epd, void *msg, int len, int flags);
421
422/**
423 * scif_register() - Mark a memory region for remote access.
424 * @epd: endpoint descriptor
425 * @addr: starting virtual address
426 * @len: length of range
427 * @offset: offset of window
428 * @prot_flags: read/write protection flags
429 * @map_flags: mapping flags
430 *
431 * The scif_register() function opens a window, a range of whole pages of the
432 * registered address space of the endpoint epd, starting at offset po and
433 * continuing for len bytes. The value of po, further described below, is a
434 * function of the parameters offset and len, and the value of map_flags. Each
435 * page of the window represents the physical memory page which backs the
436 * corresponding page of the range of virtual address pages starting at addr
437 * and continuing for len bytes. addr and len are constrained to be multiples
438 * of the page size. A successful scif_register() call returns po.
439 *
440 * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
441 * exactly, and offset is constrained to be a multiple of the page size. The
442 * mapping established by scif_register() will not replace any existing
443 * registration; an error is returned if any page within the range [offset,
444 * offset + len - 1] intersects an existing window.
445 *
446 * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
447 * implementation-defined manner to arrive at po. The po value so chosen will
448 * be an area of the registered address space that the implementation deems
449 * suitable for a mapping of len bytes. An offset value of 0 is interpreted as
450 * granting the implementation complete freedom in selecting po, subject to
451 * constraints described below. A non-zero value of offset is taken to be a
452 * suggestion of an offset near which the mapping should be placed. When the
453 * implementation selects a value for po, it does not replace any extant
454 * window. In all cases, po will be a multiple of the page size.
455 *
456 * The physical pages which are so represented by a window are available for
457 * access in calls to mmap(), scif_readfrom(), scif_writeto(),
458 * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
459 * physical pages represented by the window will not be reused by the memory
460 * subsystem for any other purpose. Note that the same physical page may be
461 * represented by multiple windows.
462 *
463 * Subsequent operations which change the memory pages to which virtual
464 * addresses are mapped (such as mmap(), munmap()) have no effect on
465 * existing window.
466 *
467 * If the process will fork(), it is recommended that the registered
468 * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
469 * problems due to copy-on-write semantics.
470 *
471 * The prot_flags argument is formed by OR'ing together one or more of the
472 * following values.
473 * SCIF_PROT_READ - allow read operations from the window
474 * SCIF_PROT_WRITE - allow write operations to the window
475 *
476 * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a
477 * fixed offset.
478 *
479 * Return:
480 * Upon successful completion, scif_register() returns the offset at which the
481 * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that
482 * is (off_t *)-1) is returned and errno is set to indicate the error; in
483 * kernel mode the negative of one of the following errors is returned.
484 *
485 * Errors:
486 * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range
487 * [offset, offset + len -1] are already registered
488 * EAGAIN - The mapping could not be performed due to lack of resources
489 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
490 * ECONNRESET - Connection reset by peer
491 * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
492 * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED is
493 * set in flags, and offset is not a multiple of the page size, or addr is not a
494 * multiple of the page size, or len is not a multiple of the page size, or is
495 * 0, or offset is negative
496 * ENODEV - The remote node is lost or existed, but is not currently in the
497 * network since it may have crashed
498 * ENOMEM - Not enough space
499 * ENOTCONN -The endpoint is not connected
500 */
501off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
502 int prot_flags, int map_flags);
503
504/**
505 * scif_unregister() - Mark a memory region for remote access.
506 * @epd: endpoint descriptor
507 * @offset: start of range to unregister
508 * @len: length of range to unregister
509 *
510 * The scif_unregister() function closes those previously registered windows
511 * which are entirely within the range [offset, offset + len - 1]. It is an
512 * error to specify a range which intersects only a subrange of a window.
513 *
514 * On a successful return, pages within the window may no longer be specified
515 * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(),
516 * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window,
517 * however, continues to exist until all previous references against it are
518 * removed. A window is referenced if there is a mapping to it created by
519 * mmap(), or if scif_get_pages() was called against the window
520 * (and the pages have not been returned via scif_put_pages()). A window is
521 * also referenced while an RMA, in which some range of the window is a source
522 * or destination, is in progress. Finally a window is referenced while some
523 * offset in that window was specified to scif_fence_signal(), and the RMAs
524 * marked by that call to scif_fence_signal() have not completed. While a
525 * window is in this state, its registered address space pages are not
526 * available for use in a new registered window.
527 *
528 * When all such references to the window have been removed, its references to
529 * all the physical pages which it represents are removed. Similarly, the
530 * registered address space pages of the window become available for
531 * registration in a new window.
532 *
533 * Return:
534 * Upon successful completion, scif_unregister() returns 0; otherwise in user
535 * mode -1 is returned and errno is set to indicate the error; in kernel mode
536 * the negative of one of the following errors is returned. In the event of an
537 * error, no windows are unregistered.
538 *
539 * Errors:
540 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
541 * ECONNRESET - Connection reset by peer
542 * EINVAL - the range [offset, offset + len - 1] intersects a subrange of a
543 * window, or offset is negative
544 * ENODEV - The remote node is lost or existed, but is not currently in the
545 * network since it may have crashed
546 * ENOTCONN - The endpoint is not connected
547 * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the
548 * registered address space of epd
549 */
550int scif_unregister(scif_epd_t epd, off_t offset, size_t len);
551
552/**
553 * scif_readfrom() - Copy from a remote address space
554 * @epd: endpoint descriptor
555 * @loffset: offset in local registered address space to
556 * which to copy
557 * @len: length of range to copy
558 * @roffset: offset in remote registered address space
559 * from which to copy
560 * @rma_flags: transfer mode flags
561 *
562 * scif_readfrom() copies len bytes from the remote registered address space of
563 * the peer of endpoint epd, starting at the offset roffset to the local
564 * registered address space of epd, starting at the offset loffset.
565 *
566 * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
567 * roffset + len - 1] must be within some registered window or windows of the
568 * local and remote nodes. A range may intersect multiple registered windows,
569 * but only if those windows are contiguous in the registered address space.
570 *
571 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
572 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
573 * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the
574 * transfer is complete. Otherwise, the transfer may be performed asynchron-
575 * ously. The order in which any two asynchronous RMA operations complete
576 * is non-deterministic. The synchronization functions, scif_fence_mark()/
577 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
578 * the completion of asynchronous RMA operations on the same endpoint.
579 *
580 * The DMA transfer of individual bytes is not guaranteed to complete in
581 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
582 * cacheline or partial cacheline of the source range will become visible on
583 * the destination node after all other transferred data in the source
584 * range has become visible on the destination node.
585 *
586 * The optimal DMA performance will likely be realized if both
587 * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
588 * performance will likely be realized if loffset and roffset are not
589 * cacheline aligned but are separated by some multiple of 64. The lowest level
590 * of performance is likely if loffset and roffset are not separated by a
591 * multiple of 64.
592 *
593 * The rma_flags argument is formed by ORing together zero or more of the
594 * following values.
595 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
596 * engine.
597 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
598 * transfer has completed. Passing this flag results in the
599 * current implementation busy waiting and consuming CPU cycles
600 * while the DMA transfer is in progress for best performance by
601 * avoiding the interrupt latency.
602 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
603 * the source range becomes visible on the destination node
604 * after all other transferred data in the source range has
605 * become visible on the destination
606 *
607 * Return:
608 * Upon successful completion, scif_readfrom() returns 0; otherwise in user
609 * mode -1 is returned and errno is set to indicate the error; in kernel mode
610 * the negative of one of the following errors is returned.
611 *
612 * Errors:
613 * EACCESS - Attempt to write to a read-only range
614 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
615 * ECONNRESET - Connection reset by peer
616 * EINVAL - rma_flags is invalid
617 * ENODEV - The remote node is lost or existed, but is not currently in the
618 * network since it may have crashed
619 * ENOTCONN - The endpoint is not connected
620 * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
621 * address space of epd, or, The range [roffset, roffset + len - 1] is invalid
622 * for the registered address space of the peer of epd, or loffset or roffset
623 * is negative
624 */
625int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
626 roffset, int rma_flags);
627
628/**
629 * scif_writeto() - Copy to a remote address space
630 * @epd: endpoint descriptor
631 * @loffset: offset in local registered address space
632 * from which to copy
633 * @len: length of range to copy
634 * @roffset: offset in remote registered address space to
635 * which to copy
636 * @rma_flags: transfer mode flags
637 *
638 * scif_writeto() copies len bytes from the local registered address space of
639 * epd, starting at the offset loffset to the remote registered address space
640 * of the peer of endpoint epd, starting at the offset roffset.
641 *
642 * Each of the specified ranges [loffset, loffset + len - 1] and [roffset,
643 * roffset + len - 1] must be within some registered window or windows of the
644 * local and remote nodes. A range may intersect multiple registered windows,
645 * but only if those windows are contiguous in the registered address space.
646 *
647 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
648 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
649 * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the
650 * transfer is complete. Otherwise, the transfer may be performed asynchron-
651 * ously. The order in which any two asynchronous RMA operations complete
652 * is non-deterministic. The synchronization functions, scif_fence_mark()/
653 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
654 * the completion of asynchronous RMA operations on the same endpoint.
655 *
656 * The DMA transfer of individual bytes is not guaranteed to complete in
657 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
658 * cacheline or partial cacheline of the source range will become visible on
659 * the destination node after all other transferred data in the source
660 * range has become visible on the destination node.
661 *
662 * The optimal DMA performance will likely be realized if both
663 * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
664 * performance will likely be realized if loffset and roffset are not cacheline
665 * aligned but are separated by some multiple of 64. The lowest level of
666 * performance is likely if loffset and roffset are not separated by a multiple
667 * of 64.
668 *
669 * The rma_flags argument is formed by ORing together zero or more of the
670 * following values.
671 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
672 * engine.
673 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
674 * transfer has completed. Passing this flag results in the
675 * current implementation busy waiting and consuming CPU cycles
676 * while the DMA transfer is in progress for best performance by
677 * avoiding the interrupt latency.
678 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
679 * the source range becomes visible on the destination node
680 * after all other transferred data in the source range has
681 * become visible on the destination node
682 *
683 * Return:
684 * Upon successful completion, scif_writeto() returns 0; otherwise in user
685 * mode -1 is returned and errno is set to indicate the error; in kernel mode
686 * the negative of one of the following errors is returned.
687 *
688 * Errors:
689 * EACCES - Attempt to write to a read-only range
690 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
691 * ECONNRESET - Connection reset by peer
692 * EINVAL - rma_flags is invalid
693 * ENODEV - The remote node is lost or existed, but is not currently in the
694 * network since it may have crashed
695 * ENOTCONN - The endpoint is not connected
696 * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
697 * address space of epd, or the range [roffset, roffset + len - 1] is invalid
698 * for the registered address space of the peer of epd, or loffset or roffset
699 * is negative
700 */
701int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
702 roffset, int rma_flags);
703
704/**
705 * scif_vreadfrom() - Copy from a remote address space
706 * @epd: endpoint descriptor
707 * @addr: address to which to copy
708 * @len: length of range to copy
709 * @roffset: offset in remote registered address space
710 * from which to copy
711 * @rma_flags: transfer mode flags
712 *
713 * scif_vreadfrom() copies len bytes from the remote registered address
714 * space of the peer of endpoint epd, starting at the offset roffset, to local
715 * memory, starting at addr.
716 *
717 * The specified range [roffset, roffset + len - 1] must be within some
718 * registered window or windows of the remote node. The range may
719 * intersect multiple registered windows, but only if those windows are
720 * contiguous in the registered address space.
721 *
722 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
723 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
724 * flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after the
725 * transfer is complete. Otherwise, the transfer may be performed asynchron-
726 * ously. The order in which any two asynchronous RMA operations complete
727 * is non-deterministic. The synchronization functions, scif_fence_mark()/
728 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
729 * the completion of asynchronous RMA operations on the same endpoint.
730 *
731 * The DMA transfer of individual bytes is not guaranteed to complete in
732 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
733 * cacheline or partial cacheline of the source range will become visible on
734 * the destination node after all other transferred data in the source
735 * range has become visible on the destination node.
736 *
737 * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
738 * the specified local memory range may remain in a pinned state even after
739 * the specified transfer completes. This may reduce overhead if some or all of
740 * the same virtual address range is referenced in a subsequent call of
741 * scif_vreadfrom() or scif_vwriteto().
742 *
743 * The optimal DMA performance will likely be realized if both
744 * addr and roffset are cacheline aligned (are a multiple of 64). Lower
745 * performance will likely be realized if addr and roffset are not
746 * cacheline aligned but are separated by some multiple of 64. The lowest level
747 * of performance is likely if addr and roffset are not separated by a
748 * multiple of 64.
749 *
750 * The rma_flags argument is formed by ORing together zero or more of the
751 * following values.
752 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
753 * engine.
754 * SCIF_RMA_USECACHE - enable registration caching
755 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
756 * transfer has completed. Passing this flag results in the
757 * current implementation busy waiting and consuming CPU cycles
758 * while the DMA transfer is in progress for best performance by
759 * avoiding the interrupt latency.
760 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
761 * the source range becomes visible on the destination node
762 * after all other transferred data in the source range has
763 * become visible on the destination node
764 *
765 * Return:
766 * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user
767 * mode -1 is returned and errno is set to indicate the error; in kernel mode
768 * the negative of one of the following errors is returned.
769 *
770 * Errors:
771 * EACCES - Attempt to write to a read-only range
772 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
773 * ECONNRESET - Connection reset by peer
774 * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
775 * EINVAL - rma_flags is invalid
776 * ENODEV - The remote node is lost or existed, but is not currently in the
777 * network since it may have crashed
778 * ENOTCONN - The endpoint is not connected
779 * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
780 * registered address space of the peer of epd
781 */
782int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset,
783 int rma_flags);
784
785/**
786 * scif_vwriteto() - Copy to a remote address space
787 * @epd: endpoint descriptor
788 * @addr: address from which to copy
789 * @len: length of range to copy
790 * @roffset: offset in remote registered address space to
791 * which to copy
792 * @rma_flags: transfer mode flags
793 *
794 * scif_vwriteto() copies len bytes from the local memory, starting at addr, to
795 * the remote registered address space of the peer of endpoint epd, starting at
796 * the offset roffset.
797 *
798 * The specified range [roffset, roffset + len - 1] must be within some
799 * registered window or windows of the remote node. The range may intersect
800 * multiple registered windows, but only if those windows are contiguous in the
801 * registered address space.
802 *
803 * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
804 * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
805 * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the
806 * transfer is complete. Otherwise, the transfer may be performed asynchron-
807 * ously. The order in which any two asynchronous RMA operations complete
808 * is non-deterministic. The synchronization functions, scif_fence_mark()/
809 * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
810 * the completion of asynchronous RMA operations on the same endpoint.
811 *
812 * The DMA transfer of individual bytes is not guaranteed to complete in
813 * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
814 * cacheline or partial cacheline of the source range will become visible on
815 * the destination node after all other transferred data in the source
816 * range has become visible on the destination node.
817 *
818 * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
819 * the specified local memory range may remain in a pinned state even after
820 * the specified transfer completes. This may reduce overhead if some or all of
821 * the same virtual address range is referenced in a subsequent call of
822 * scif_vreadfrom() or scif_vwriteto().
823 *
824 * The optimal DMA performance will likely be realized if both
825 * addr and roffset are cacheline aligned (are a multiple of 64). Lower
826 * performance will likely be realized if addr and roffset are not cacheline
827 * aligned but are separated by some multiple of 64. The lowest level of
828 * performance is likely if addr and roffset are not separated by a multiple
829 * of 64.
830 *
831 * The rma_flags argument is formed by ORing together zero or more of the
832 * following values.
833 * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
834 * engine.
835 * SCIF_RMA_USECACHE - allow registration caching
836 * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
837 * transfer has completed. Passing this flag results in the
838 * current implementation busy waiting and consuming CPU cycles
839 * while the DMA transfer is in progress for best performance by
840 * avoiding the interrupt latency.
841 * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
842 * the source range becomes visible on the destination node
843 * after all other transferred data in the source range has
844 * become visible on the destination node
845 *
846 * Return:
847 * Upon successful completion, scif_vwriteto() returns 0; otherwise in user
848 * mode -1 is returned and errno is set to indicate the error; in kernel mode
849 * the negative of one of the following errors is returned.
850 *
851 * Errors:
852 * EACCES - Attempt to write to a read-only range
853 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
854 * ECONNRESET - Connection reset by peer
855 * EFAULT - Addresses in the range [addr, addr + len - 1] are invalid
856 * EINVAL - rma_flags is invalid
857 * ENODEV - The remote node is lost or existed, but is not currently in the
858 * network since it may have crashed
859 * ENOTCONN - The endpoint is not connected
860 * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
861 * registered address space of the peer of epd
862 */
863int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset,
864 int rma_flags);
865
866/**
867 * scif_fence_mark() - Mark previously issued RMAs
868 * @epd: endpoint descriptor
869 * @flags: control flags
870 * @mark: marked value returned as output.
871 *
872 * scif_fence_mark() returns after marking the current set of all uncompleted
873 * RMAs initiated through the endpoint epd or the current set of all
874 * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
875 * marked with a value returned at mark. The application may subsequently call
876 * scif_fence_wait(), passing the value returned at mark, to await completion
877 * of all RMAs so marked.
878 *
879 * The flags argument has exactly one of the following values.
880 * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
881 * epd are marked
882 * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
883 * of endpoint epd are marked
884 *
885 * Return:
886 * Upon successful completion, scif_fence_mark() returns 0; otherwise in user
887 * mode -1 is returned and errno is set to indicate the error; in kernel mode
888 * the negative of one of the following errors is returned.
889 *
890 * Errors:
891 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
892 * ECONNRESET - Connection reset by peer
893 * EINVAL - flags is invalid
894 * ENODEV - The remote node is lost or existed, but is not currently in the
895 * network since it may have crashed
896 * ENOTCONN - The endpoint is not connected
897 * ENOMEM - Insufficient kernel memory was available
898 */
899int scif_fence_mark(scif_epd_t epd, int flags, int *mark);
900
901/**
902 * scif_fence_wait() - Wait for completion of marked RMAs
903 * @epd: endpoint descriptor
904 * @mark: mark request
905 *
906 * scif_fence_wait() returns after all RMAs marked with mark have completed.
907 * The value passed in mark must have been obtained in a previous call to
908 * scif_fence_mark().
909 *
910 * Return:
911 * Upon successful completion, scif_fence_wait() returns 0; otherwise in user
912 * mode -1 is returned and errno is set to indicate the error; in kernel mode
913 * the negative of one of the following errors is returned.
914 *
915 * Errors:
916 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
917 * ECONNRESET - Connection reset by peer
918 * ENODEV - The remote node is lost or existed, but is not currently in the
919 * network since it may have crashed
920 * ENOTCONN - The endpoint is not connected
921 * ENOMEM - Insufficient kernel memory was available
922 */
923int scif_fence_wait(scif_epd_t epd, int mark);
924
925/**
926 * scif_fence_signal() - Request a memory update on completion of RMAs
927 * @epd: endpoint descriptor
928 * @loff: local offset
929 * @lval: local value to write to loff
930 * @roff: remote offset
931 * @rval: remote value to write to roff
932 * @flags: flags
933 *
934 * scif_fence_signal() returns after marking the current set of all uncompleted
935 * RMAs initiated through the endpoint epd or marking the current set of all
936 * uncompleted RMAs initiated through the peer of endpoint epd.
937 *
938 * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the
939 * marked set, lval is written to memory at the address corresponding to offset
940 * loff in the local registered address space of epd. loff must be within a
941 * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion
942 * of the RMAs in the marked set, rval is written to memory at the address
943 * corresponding to offset roff in the remote registered address space of epd.
944 * roff must be within a remote registered window of the peer of epd. Note
945 * that any specified offset must be DWORD (4 byte / 32 bit) aligned.
946 *
947 * The flags argument is formed by OR'ing together the following.
948 * Exactly one of the following values.
949 * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint
950 * epd are marked
951 * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer
952 * of endpoint epd are marked
953 * One or more of the following values.
954 * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to
955 * memory at the address corresponding to offset loff in the local
956 * registered address space of epd.
957 * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to
958 * memory at the address corresponding to offset roff in the remote
959 * registered address space of epd.
960 *
961 * Return:
962 * Upon successful completion, scif_fence_signal() returns 0; otherwise in
963 * user mode -1 is returned and errno is set to indicate the error; in kernel
964 * mode the negative of one of the following errors is returned.
965 *
966 * Errors:
967 * EBADF, ENOTTY - epd is not a valid endpoint descriptor
968 * ECONNRESET - Connection reset by peer
969 * EINVAL - flags is invalid, or loff or roff are not DWORD aligned
970 * ENODEV - The remote node is lost or existed, but is not currently in the
971 * network since it may have crashed
972 * ENOTCONN - The endpoint is not connected
973 * ENXIO - loff is invalid for the registered address space of epd, or roff is
974 * invalid for the registered address space of the peer of epd
975 */
976int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff,
977 u64 rval, int flags);
978
979/**
980 * scif_get_node_ids() - Return information about online nodes
981 * @nodes: array in which to return online node IDs
982 * @len: number of entries in the nodes array
983 * @self: address to place the node ID of the local node
984 *
985 * scif_get_node_ids() fills in the nodes array with up to len node IDs of the
986 * nodes in the SCIF network. If there is not enough space in nodes, as
987 * indicated by the len parameter, only len node IDs are returned in nodes. The
988 * return value of scif_get_node_ids() is the total number of nodes currently in
989 * the SCIF network. By checking the return value against the len parameter,
990 * the user may determine if enough space for nodes was allocated.
991 *
992 * The node ID of the local node is returned at self.
993 *
994 * Return:
995 * Upon successful completion, scif_get_node_ids() returns the actual number of
996 * online nodes in the SCIF network including 'self'; otherwise in user mode
997 * -1 is returned and errno is set to indicate the error; in kernel mode no
998 * errors are returned.
999 *
1000 * Errors:
1001 * EFAULT - Bad address
1002 */
1003int scif_get_node_ids(u16 *nodes, int len, u16 *self);
1004
Ashutosh Dixitb7f94442015-09-29 18:10:44 -07001005/**
1006 * scif_poll() - Wait for some event on an endpoint
1007 * @epds: Array of endpoint descriptors
1008 * @nepds: Length of epds
1009 * @timeout: Upper limit on time for which scif_poll() will block
1010 *
1011 * scif_poll() waits for one of a set of endpoints to become ready to perform
1012 * an I/O operation.
1013 *
1014 * The epds argument specifies the endpoint descriptors to be examined and the
1015 * events of interest for each endpoint descriptor. epds is a pointer to an
1016 * array with one member for each open endpoint descriptor of interest.
1017 *
1018 * The number of items in the epds array is specified in nepds. The epd field
1019 * of scif_pollepd is an endpoint descriptor of an open endpoint. The field
1020 * events is a bitmask specifying the events which the application is
1021 * interested in. The field revents is an output parameter, filled by the
1022 * kernel with the events that actually occurred. The bits returned in revents
1023 * can include any of those specified in events, or one of the values POLLERR,
1024 * POLLHUP, or POLLNVAL. (These three bits are meaningless in the events
1025 * field, and will be set in the revents field whenever the corresponding
1026 * condition is true.)
1027 *
1028 * If none of the events requested (and no error) has occurred for any of the
1029 * endpoint descriptors, then scif_poll() blocks until one of the events occurs.
1030 *
1031 * The timeout argument specifies an upper limit on the time for which
1032 * scif_poll() will block, in milliseconds. Specifying a negative value in
1033 * timeout means an infinite timeout.
1034 *
1035 * The following bits may be set in events and returned in revents.
1036 * POLLIN - Data may be received without blocking. For a connected
1037 * endpoint, this means that scif_recv() may be called without blocking. For a
1038 * listening endpoint, this means that scif_accept() may be called without
1039 * blocking.
1040 * POLLOUT - Data may be sent without blocking. For a connected endpoint, this
1041 * means that scif_send() may be called without blocking. POLLOUT may also be
1042 * used to block waiting for a non-blocking connect to complete. This bit value
1043 * has no meaning for a listening endpoint and is ignored if specified.
1044 *
1045 * The following bits are only returned in revents, and are ignored if set in
1046 * events.
1047 * POLLERR - An error occurred on the endpoint
1048 * POLLHUP - The connection to the peer endpoint was disconnected
1049 * POLLNVAL - The specified endpoint descriptor is invalid.
1050 *
1051 * Return:
1052 * Upon successful completion, scif_poll() returns a non-negative value. A
1053 * positive value indicates the total number of endpoint descriptors that have
1054 * been selected (that is, endpoint descriptors for which the revents member is
1055 * non-zero). A value of 0 indicates that the call timed out and no endpoint
1056 * descriptors have been selected. Otherwise in user mode -1 is returned and
1057 * errno is set to indicate the error; in kernel mode the negative of one of
1058 * the following errors is returned.
1059 *
1060 * Errors:
1061 * EINTR - A signal occurred before any requested event
1062 * EINVAL - The nepds argument is greater than {OPEN_MAX}
1063 * ENOMEM - There was no space to allocate file descriptor tables
1064 */
1065int scif_poll(struct scif_pollepd *epds, unsigned int nepds, long timeout);
1066
Sudeep Dutt7df20f22015-04-29 05:32:28 -07001067#endif /* __SCIF_H__ */