1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
|
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright (c) 2007, The Ohio State University. All rights reserved.
*
* Portions of this source code is developed by the team members of
* The Ohio State University's Network-Based Computing Laboratory (NBCL),
* headed by Professor Dhabaleswar K. (DK) Panda.
*
* Acknowledgements to contributions from developers:
* Ranjit Noronha: noronha@cse.ohio-state.edu
* Lei Chai : chail@cse.ohio-state.edu
* Weikuan Yu : yuw@cse.ohio-state.edu
*
*/
#ifndef _RPC_RPC_RDMA_H
#define _RPC_RPC_RDMA_H
#include <rpc/rpc.h>
#include <rpc/rpc_sztypes.h>
#include <sys/sunddi.h>
#include <sys/sunldi.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Version numbers: one for the over-the-wire RPC/RDMA protocol and one
* for the RDMATF API.
*/
#define RPCRDMA_VERS 1 /* Version of the RPC over RDMA protocol */
#define RDMATF_VERS 1 /* Version of the API used by RPC for RDMA */
#define RDMATF_VERS_1 1 /* Current version of RDMATF */
/*
* The size of an RPC call or reply message
*/
#define RPC_MSG_SZ 1024
/*
* RDMA chunk size
* NOTE(review): same value as MINCHUNK below; whether both are still
* needed cannot be told from this header.
*/
#define RDMA_MINCHUNK 1024
/*
* Storage for a chunk list
*/
#define RPC_CL_SZ 1024
/*
* Chunk size
*/
#define MINCHUNK 1024
/*
* Size of receive buffer
*/
#define RPC_BUF_SIZE 2048
/*
* Wait flags.
* NOTE(review): presumably the values passed as the "wait" argument of
* RDMA_READ()/RDMA_WRITE() -- confirm against callers.
*/
#define NOWAIT 0 /* don't wait for operation to complete */
#define WAIT 1 /* wait and ensure that operation is complete */
/*
* RDMA xdr buffer control and other control flags. Add new flags here,
* set them in private structure for xdr over RDMA in xdr_rdma.c
*
* Each flag occupies its own bit (0x1, 0x2, 0x4).
*/
#define XDR_RDMA_CHUNK 0x1
#define XDR_RDMA_WLIST_REG 0x2
#define XDR_RDMA_RLIST_REG 0x4
/*
* Fixed lengths.
* NOTE(review): by name these are the long reply length and the write /
* read chunk list buffer lengths; units and users are not visible in
* this header -- confirm.
*/
#define LONG_REPLY_LEN 65536
#define WCL_BUF_LEN 32768
#define RCL_BUF_LEN 32768
/*
* Buffer counts used in credit negotiation between client and server.
*/
#define RDMA_BUFS_RQST 34 /* Num bufs requested by client */
#define RDMA_BUFS_GRANT 32 /* Num bufs granted by server */
/*
* Takes no arguments and returns a pointer to a struct xdr_ops.
* NOTE(review): presumably the XDR operations vector of the RDMA XDR
* stream; the definition is not in this header -- confirm.
*/
struct xdr_ops *xdrrdma_xops(void);
/*
* Credit Control Structures.
*/
/* Which side of a connection a credit control block describes. */
typedef enum rdma_cc_type {
	RDMA_CC_CLNT = 0,	/* the CONN belongs to a client */
	RDMA_CC_SRV = 1		/* the CONN belongs to a server */
} rdma_cc_type_t;
/*
* Client side credit control data structure.
*
* NOTE(review): the per-field notes below are inferred from the field
* names only; confirm against the credit control code that updates them.
*/
typedef struct rdma_clnt_cred_ctrl {
uint32_t clnt_cc_granted_ops; /* presumably ops (credits) granted */
uint32_t clnt_cc_in_flight_ops; /* presumably ops currently outstanding */
kcondvar_t clnt_cc_cv; /* condition variable; waiters not visible here */
} rdma_clnt_cred_ctrl_t;
/*
* Server side credit control data structure.
*
* NOTE(review): the notes on the first three fields are inferred from
* the field names only; confirm against the server credit control code.
*/
typedef struct rdma_srv_cred_ctrl {
uint32_t srv_cc_buffers_granted; /* presumably bufs granted to the peer */
uint32_t srv_cc_cur_buffers_used; /* presumably bufs currently in use */
uint32_t srv_cc_posted; /* presumably count of posted bufs */
uint32_t srv_cc_max_buf_size; /* to be determined by CCP */
uint32_t srv_cc_cur_buf_size; /* to be determined by CCP */
} rdma_srv_cred_ctrl_t;
/*
 * Write mode of an RPC call.
 * NOTE(review): by name these distinguish a write list, a write (reply)
 * chunk and no write at all; the users are not in this header -- confirm.
 */
typedef enum {
	RPCCALL_WLIST = 0,
	RPCCALL_WCHUNK = 1,
	RPCCALL_NOWRITE = 2
} rpccall_write_t;
/*
 * Selects which side of a chunk list entry an operation applies to:
 * the source or the destination.  Numbering starts at 1, so 0 is not
 * a valid selector.
 */
typedef enum {
	CLIST_REG_SOURCE = 1,
	CLIST_REG_DST = 2
} clist_dstsrc;
/*
* Return codes from RDMA operations
*
* RDMA_SUCCESS is 0; every other value reports a failure. The values
* are assigned explicitly and are grouped into general errors,
* connection errors, RDMATF errors and a generic fallback.
*/
typedef enum {
RDMA_SUCCESS = 0, /* successful operation */
RDMA_INVAL = 1, /* invalid parameter */
RDMA_TIMEDOUT = 2, /* operation timed out */
RDMA_INTR = 3, /* operation interrupted */
RDMA_NORESOURCE = 4, /* insufficient resource */
/*
* connection errors
*/
RDMA_REJECT = 5, /* connection req rejected */
RDMA_NOLISTENER = 6, /* no listener on server */
RDMA_UNREACHABLE = 7, /* host unreachable */
RDMA_CONNLOST = 8, /* connection lost */
RDMA_XPRTFAILED = 9, /* RDMA transport failed */
RDMA_PROTECTERR = 10, /* memory protection error */
RDMA_OVERRUN = 11, /* transport overrun */
RDMA_RECVQEMPTY = 12, /* incoming pkt dropped, recv q empty */
RDMA_PROTFAILED = 13, /* RDMA protocol failed */
RDMA_NOTSUPP = 14, /* requested feature not supported */
RDMA_REMOTERR = 15, /* error at remote end */
/*
* RDMATF errors
*/
RDMA_BADVERS = 16, /* mismatch RDMATF versions */
RDMA_REG_EXIST = 17, /* RDMATF registration already exists */
RDMA_HCA_ATTACH = 18, /* NOTE(review): presumably HCA attach; confirm */
RDMA_HCA_DETACH = 19, /* NOTE(review): presumably HCA detach; confirm */
/*
* fallback error
*/
RDMA_FAILED = 20 /* generic error */
} rdma_stat;
/*
* Memory region context. This is an RDMA provider generated
* handle for a registered arbitrary size contiguous virtual
* memory. The RDMA Interface Adapter needs this for local or
* remote memory access.
*
* The mrc_rmr field holds the remote memory region context
* which is sent over-the-wire to provide the remote host
* with RDMA access to the memory region.
*
* The local handle lives in the lhdl union, which currently has a
* single member (mr).
*/
struct mrc {
uint32_t mrc_rmr; /* Remote MR context, sent OTW */
union {
struct mr {
uint32_t lmr; /* Local MR context */
uint64_t linfo; /* Local memory info */
} mr;
} lhdl;
};
/*
* Shorthand for the local fields nested inside lhdl.mr, so that code
* can write handle.mrc_lmr and handle.mrc_linfo.
*/
#define mrc_lmr lhdl.mr.lmr
#define mrc_linfo lhdl.mr.linfo
/*
* Memory management for the RDMA buffers
*/
/*
* RDMA buffer types
*/
/* Kind of buffer described by an rdma_buf_t. */
typedef enum {
	SEND_BUFFER = 0,	/* holds an outgoing message */
	SEND_DESCRIPTOR = 1,	/* send msg descriptor; plugins only */
	RECV_BUFFER = 2,	/* holds an incoming message */
	RECV_DESCRIPTOR = 3,	/* recv msg descriptor; plugins only */
	RDMA_LONG_BUFFER = 4	/* chunk buf; RDMATF only, not plugins */
} rdma_btype;
/*
* RDMA buffer information
*
* Describes one buffer: its kind, its address and length, and the
* memory registration handle that goes with it.
*/
typedef struct rdma_buf {
rdma_btype type; /* buffer type */
uint_t len; /* length of buffer */
caddr_t addr; /* buffer address */
struct mrc handle; /* buffer registration handle */
caddr_t rb_private; /* opaque pointer; NOTE(review): owner not visible here */
} rdma_buf_t;
/*
* The XDR offset value is used by the XDR
* routine to identify the position in the
* RPC message where the opaque object would
* normally occur. Neither the data content
* of the chunk, nor its size field are included
* in the RPC message. The XDR offset is calculated
* as if the chunks were present.
*
* The remaining fields identify the chunk of data.
* On the source side, c_smemhandle identifies a
* registered RDMA memory region and the c_saddr
* and c_len fields identify the chunk within it.
* The c_dmemhandle and c_daddr fields do the same
* for the destination side.
*
* Entries are chained into a singly linked list through c_next.
*/
struct clist {
uint32 c_xdroff; /* XDR offset */
uint32 c_len; /* Length */
clist_dstsrc c_regtype; /* type of registration */
struct mrc c_smemhandle; /* src memory handle */
uint64 c_ssynchandle; /* src sync handle */
union {
uint64 c_saddr; /* src address */
caddr_t c_saddr3; /* same storage viewed as a pointer */
} w;
struct mrc c_dmemhandle; /* dst memory handle */
uint64 c_dsynchandle; /* dst sync handle */
union {
uint64 c_daddr; /* dst address */
caddr_t c_daddr3; /* same storage viewed as a pointer */
} u;
struct as *c_adspc; /* address space for saddr/daddr */
rdma_buf_t rb_longbuf; /* used for long requests/replies */
struct clist *c_next; /* Next chunk */
};
/* Allow the list entry type to be written as plain "clist". */
typedef struct clist clist;
/*
* Maximum wlist transfer size: 4 MB.
* This is defined here because the rfs3_tsize service requires a
* svc_req struct, which is not available in krecv.
*/
#define MAX_SVC_XFER_SIZE (4*1024*1024)
/*
* RPC/RDMA message types. The numeric values are assigned explicitly.
* NOTE(review): these appear to correspond to the rdma_proc values of
* the RPC-over-RDMA protocol (RFC 5666); confirm before changing any
* value.
*/
enum rdma_proc {
RDMA_MSG = 0, /* chunk list and RPC msg follow */
RDMA_NOMSG = 1, /* only chunk list follows */
RDMA_MSGP = 2, /* chunk list and RPC msg with padding follow */
RDMA_DONE = 3 /* signal completion of chunk transfer */
};
/*
* Listener information for a service
*
* Passed by pointer to the rdma_svc_listen and rdma_svc_stop entries
* of the RDMA operations vector.
*/
struct rdma_svc_data {
queue_t q; /* queue_t to place incoming pkts */
int active; /* If active, after registration startup */
rdma_stat err_code; /* Error code from plugin layer */
int32_t svcid; /* RDMA based service identifier */
};
/*
* Per RDMA plugin module information.
* Will be populated by each plugin
* module during its initialization.
*
* This is the type taken by rdma_register_mod() and
* rdma_unregister_mod().
*/
typedef struct rdma_mod {
char *rdma_api; /* "kvipl", "ibtf", etc */
uint_t rdma_version; /* RDMATF API version */
int rdma_count; /* # of devices */
struct rdmaops *rdma_ops; /* rdma op vector for api */
} rdma_mod_t;
/*
* Registry of RDMA plugins
*
* One entry per registered plugin, chained through r_next into a
* singly linked list.
*/
typedef struct rdma_registry {
rdma_mod_t *r_mod; /* plugin mod info */
uint32_t r_mod_state; /* RDMA_MOD_ACTIVE or RDMA_MOD_INACTIVE */
struct rdma_registry *r_next; /* next registered RDMA plugin */
} rdma_registry_t;
/*
* RDMA MODULE state flags (r_mod_state).
*/
#define RDMA_MOD_ACTIVE 1
#define RDMA_MOD_INACTIVE 0
/*
* RDMA transport information
*
* Filled in through the rdma_getinfo entry of the operations vector
* (see RDMA_GETINFO).
*/
typedef struct rdma_info {
uint_t addrlen; /* address length */
uint_t mts; /* max transfer size */
uint_t mtu; /* native mtu size of underlying network */
} rdma_info_t;
/*
 * Connection states.  Every state is a distinct single bit, so a set
 * of states can be tested with one mask.
 * NOTE(review): the meaning of each state is taken from its name only;
 * the state machine itself is not in this header.
 */
typedef enum {
	C_IDLE = 1 << 0,		/* 0x00000001 */
	C_CONN_PEND = 1 << 1,		/* 0x00000002 */
	C_CONNECTED = 1 << 2,		/* 0x00000004 */
	C_ERROR_CONN = 1 << 3,		/* 0x00000008 */
	C_DISCONN_PEND = 1 << 4,	/* 0x00000010 */
	C_REMOTE_DOWN = 1 << 5		/* 0x00000020 */
} conn_c_state;
/*
* Bit values for the c_flags field of CONN. Each flag is its own bit.
*/
#define C_CLOSE_NOTNEEDED 0x00000001 /* just free the channel */
#define C_CLOSE_PENDING 0x00000002 /* a close in progress */
/*
* RDMA Connection information
*
* Connections are linked into a doubly linked list through c_next and
* c_prev. c_lock protects c_state and c_ref.
*/
typedef struct conn {
rdma_mod_t *c_rdmamod; /* RDMA transport info for conn */
char *c_netid; /* tcp or tcp6 token */
struct netbuf c_raddr; /* remote address */
struct netbuf c_laddr; /* local address */
struct netbuf c_addrmask; /* Address Mask */
int c_ref; /* no. of clients of connection */
struct conn *c_next; /* next in list of connections */
struct conn *c_prev; /* prev in list of connections */
caddr_t c_private; /* transport specific stuff */
conn_c_state c_state; /* state of connection */
int c_flags; /* flags for connection management */
rdma_cc_type_t c_cc_type; /* client or server, for credit cntrl */
/*
* Credit control state.
* NOTE(review): presumably c_cc_type selects which member is valid;
* confirm against the code that sets c_cc_type.
*/
union {
rdma_clnt_cred_ctrl_t c_clnt_cc;
rdma_srv_cred_ctrl_t c_srv_cc;
} rdma_conn_cred_ctrl_u;
kmutex_t c_lock; /* protect c_state and c_ref fields */
kcondvar_t c_cv; /* to signal when pending is done */
timeout_id_t c_timeout; /* timeout id for untimeout() */
time_t c_last_used; /* last time any activity on the conn */
} CONN;
/*
* Data transferred from plugin interrupt to svc_queuereq()
*/
typedef struct rdma_recv_data {
CONN *conn; /* connection; NOTE(review): presumably where the msg arrived */
int status; /* NOTE(review): value set not visible here; confirm */
rdma_buf_t rpcmsg; /* buffer descriptor for the RPC message */
} rdma_recv_data_t;
/* structure used to pass information for READ over rdma write */
/*
* Kind of chunk described by an rdma_chunkinfo_t. Numbering starts at 1.
* NOTE(review): by name, the UIO and ADDR variants match the rci_uiop
* and rci_addr members of rdma_chunkinfo_t respectively -- confirm.
*/
typedef enum {
RCI_WRITE_UIO_CHUNK = 1,
RCI_WRITE_ADDR_CHUNK = 2,
RCI_REPLY_CHUNK = 3
} rci_type_t;
/*
* Chunk information: a type tag, either a uio pointer or a plain
* address, a length, and a pointer to a chunk list pointer.
*/
typedef struct {
rci_type_t rci_type; /* kind of chunk */
union {
struct uio *rci_uiop; /* data described by a uio */
caddr_t rci_addr; /* data at a plain address */
} rci_a;
uint32 rci_len; /* length; NOTE(review): presumably of the chunk data */
struct clist **rci_clpp; /* point to write chunk list in readargs */
} rdma_chunkinfo_t;
/*
* A pair of lengths.
* NOTE(review): how rcil_len and rcil_len_alt differ is not visible in
* this header; confirm against the code that fills them in.
*/
typedef struct {
uint_t rcil_len;
uint_t rcil_len_alt;
} rdma_chunkinfo_lengths_t;
/*
* Pairs a chunk list with a connection.
* NOTE(review): by name rwci_wlist is a write list; confirm.
*/
typedef struct {
struct clist *rwci_wlist; /* chunk list */
CONN *rwci_conn; /* connection */
} rdma_wlist_conn_info_t;
/*
* Operations vector for RDMA transports.
*
* A plugin supplies one of these through rdma_mod_t.rdma_ops. Callers
* normally go through the RDMA_* wrapper macros rather than calling the
* function pointers directly. Every entry returns rdma_stat except
* rdma_svc_listen, rdma_svc_stop and rdma_buf_free, which return void.
*/
typedef struct rdmaops {
/* Network */
rdma_stat (*rdma_reachable)(int addr_type, struct netbuf *,
void **handle);
/* Connection */
rdma_stat (*rdma_get_conn)(struct netbuf *, struct netbuf *,
int addr_type, void *, CONN **);
rdma_stat (*rdma_rel_conn)(CONN *);
/* Server side listener start and stop routines */
void (*rdma_svc_listen)(struct rdma_svc_data *);
void (*rdma_svc_stop)(struct rdma_svc_data *);
/* Memory */
rdma_stat (*rdma_regmem)(CONN *, caddr_t, caddr_t,
uint_t, struct mrc *);
rdma_stat (*rdma_deregmem)(CONN *, caddr_t, struct mrc);
rdma_stat (*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t,
struct mrc *, void **, void *);
rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
void *, void *);
rdma_stat (*rdma_syncmem)(CONN *, void *, caddr_t, int, int);
/* Buffer */
rdma_stat (*rdma_buf_alloc)(CONN *, rdma_buf_t *);
void (*rdma_buf_free)(CONN *, rdma_buf_t *);
/* Transfer */
rdma_stat (*rdma_send)(CONN *, clist *, uint32_t);
rdma_stat (*rdma_send_resp)(CONN *, clist *, uint32_t);
rdma_stat (*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t);
rdma_stat (*rdma_clnt_recvbuf_remove)(CONN *, uint32_t);
rdma_stat (*rdma_svc_recvbuf)(CONN *, clist *);
rdma_stat (*rdma_recv)(CONN *, clist **, uint32_t);
/* RDMA */
rdma_stat (*rdma_read)(CONN *, clist *, int);
rdma_stat (*rdma_write)(CONN *, clist *, int);
/* INFO */
rdma_stat (*rdma_getinfo)(rdma_info_t *info);
} rdmaops_t;
/*
* A mutex, a condition variable and an rdma_stat value grouped together.
* NOTE(review): presumably svc_lock protects svc_stat and svc_cv is
* signalled when it changes (see rdma_kwait()); confirm.
*/
typedef struct rdma_svc_wait {
kmutex_t svc_lock;
kcondvar_t svc_cv;
rdma_stat svc_stat;
} rdma_svc_wait_t;
/* Single global instance; defined outside this header. */
extern rdma_svc_wait_t rdma_wait;
/*
 * RDMA operations.
 *
 * Thin wrappers that dispatch through a plugin's operations vector.
 * RDMA_REACHABLE and RDMA_GET_CONN take the operations vector itself,
 * RDMA_GETINFO takes an rdma_mod_t pointer, and all the others find the
 * vector through the connection (conn->c_rdmamod->rdma_ops).
 *
 * Note that the connection-based wrappers evaluate their conn argument
 * twice: once to locate the vector and once as the first call argument.
 */

/* Operations vector of the plugin that owns a connection. */
#define	RDMA_CONN_OPS(conn)	((conn)->c_rdmamod->rdma_ops)

/* Network and connection setup: caller passes the ops vector. */
#define	RDMA_REACHABLE(rdma_ops, addr_type, addr, handle) \
	((rdma_ops)->rdma_reachable((addr_type), (addr), (handle)))
#define	RDMA_GET_CONN(rdma_ops, saddr, daddr, addr_type, handle, conn) \
	((rdma_ops)->rdma_get_conn((saddr), (daddr), (addr_type), \
	    (handle), (conn)))
#define	RDMA_REL_CONN(conn) \
	(RDMA_CONN_OPS(conn)->rdma_rel_conn((conn)))

/* Memory registration, deregistration and synchronisation. */
#define	RDMA_REGMEM(conn, adsp, buff, len, handle) \
	(RDMA_CONN_OPS(conn)->rdma_regmem((conn), (adsp), (buff), \
	    (len), (handle)))
#define	RDMA_DEREGMEM(conn, buff, handle) \
	(RDMA_CONN_OPS(conn)->rdma_deregmem((conn), (buff), (handle)))
#define	RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc) \
	(RDMA_CONN_OPS(conn)->rdma_regmemsync((conn), (adsp), (buff), \
	    (len), (handle), (synchandle), (lrc)))
#define	RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc) \
	(RDMA_CONN_OPS(conn)->rdma_deregmemsync((conn), (buff), \
	    (handle), (synchandle), (lrc)))
#define	RDMA_SYNCMEM(conn, handle, buff, len, direction) \
	(RDMA_CONN_OPS(conn)->rdma_syncmem((conn), (handle), \
	    (buff), (len), (direction)))

/* Buffer allocation and release. */
#define	RDMA_BUF_ALLOC(conn, rbuf) \
	(RDMA_CONN_OPS(conn)->rdma_buf_alloc((conn), (rbuf)))
#define	RDMA_BUF_FREE(conn, rbuf) \
	(RDMA_CONN_OPS(conn)->rdma_buf_free((conn), (rbuf)))

/* Message transfer. */
#define	RDMA_SEND(conn, sendlist, xid) \
	(RDMA_CONN_OPS(conn)->rdma_send((conn), (sendlist), (xid)))
#define	RDMA_SEND_RESP(conn, sendlist, xid) \
	(RDMA_CONN_OPS(conn)->rdma_send_resp((conn), (sendlist), (xid)))
#define	RDMA_CLNT_RECVBUF(conn, cl, xid) \
	(RDMA_CONN_OPS(conn)->rdma_clnt_recvbuf((conn), (cl), (xid)))
#define	RDMA_CLNT_RECVBUF_REMOVE(conn, xid) \
	(RDMA_CONN_OPS(conn)->rdma_clnt_recvbuf_remove((conn), (xid)))
#define	RDMA_SVC_RECVBUF(conn, cl) \
	(RDMA_CONN_OPS(conn)->rdma_svc_recvbuf((conn), (cl)))
#define	RDMA_RECV(conn, recvlist, xid) \
	(RDMA_CONN_OPS(conn)->rdma_recv((conn), (recvlist), (xid)))

/* RDMA read and write of a chunk list. */
#define	RDMA_READ(conn, cl, wait) \
	(RDMA_CONN_OPS(conn)->rdma_read((conn), (cl), (wait)))
#define	RDMA_WRITE(conn, cl, wait) \
	(RDMA_CONN_OPS(conn)->rdma_write((conn), (cl), (wait)))

/* Transport information: caller passes the rdma_mod_t pointer. */
#define	RDMA_GETINFO(rdma_mod, info) \
	((rdma_mod)->rdma_ops->rdma_getinfo((info)))
#ifdef _KERNEL
/*
* Kernel-only global state; the definitions are outside this header.
*/
extern rdma_registry_t *rdma_mod_head; /* head of the plugin registry list */
extern krwlock_t rdma_lock; /* protects rdma_mod_head list */
extern int rdma_modloaded; /* flag for loading RDMA plugins */
extern int rdma_dev_available; /* rdma device is loaded or not */
extern kmutex_t rdma_modload_lock; /* protects rdma_modloaded flag */
extern uint_t rdma_minchunk; /* NOTE(review): presumably min chunk size; confirm */
extern ldi_ident_t rpcmod_li; /* needed by layered driver framework */
/*
* General RDMA routines
*
* NOTE(review): the one-line group descriptions below are inferred from
* the function names and signatures; the definitions are not in this
* header.
*/
/* Chunk list (struct clist) helpers */
extern struct clist *clist_alloc(void);
extern void clist_add(struct clist **, uint32_t, int,
struct mrc *, caddr_t, struct mrc *, caddr_t);
extern void clist_free(struct clist *);
extern uint32_t clist_len(struct clist *);
extern void clist_zero_len(struct clist *);
/* Chunk list operations that need a connection; clist_dstsrc picks the side */
extern rdma_stat clist_register(CONN *conn, struct clist *cl, clist_dstsrc);
extern rdma_stat clist_deregister(CONN *conn, struct clist *cl);
extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc);
/* Receive posting, client (keyed by xid) and server */
extern rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid);
extern rdma_stat rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid);
extern rdma_stat rdma_svc_postrecv(CONN *conn);
/* Plugin registration */
extern rdma_stat rdma_register_mod(rdma_mod_t *mod);
extern rdma_stat rdma_unregister_mod(rdma_mod_t *mod);
/* Buffer allocation and release */
extern rdma_stat rdma_buf_alloc(CONN *, rdma_buf_t *);
extern void rdma_buf_free(CONN *, rdma_buf_t *);
/*
 * Takes no arguments and returns an int.
 * NOTE(review): presumably loads the RDMA plugin modules (compare the
 * rdma_modloaded flag); the meaning of the return value is not visible
 * in this header -- confirm against the definition.
 *
 * Declared with (void) so that it is a real prototype: an empty
 * parameter list leaves the arguments unchecked in C.
 */
extern int rdma_modload(void);
/*
* NOTE(review): purposes below are inferred from names and signatures
* only; the definitions are not in this header.
*/
/* presumably obtains the write chunk for a request into an iovec */
extern bool_t rdma_get_wchunk(struct svc_req *, iovec_t *, struct clist *);
/* takes no arguments; presumably waits on the global rdma_wait state */
extern rdma_stat rdma_kwait(void);
/* presumably prepares read chunks; the int * is an output of unknown meaning */
extern int rdma_setup_read_chunks(struct clist *, uint32_t, int *);
/*
* RDMA XDR
*
* NOTE(review): the group descriptions below are inferred from the
* function names and signatures; the definitions are not in this header.
*/
/* XDR stream creation, teardown and positioning */
extern void xdrrdma_create(XDR *, caddr_t, uint_t, int, struct clist *,
enum xdr_op, CONN *);
extern void xdrrdma_destroy(XDR *);
extern uint_t xdrrdma_getpos(XDR *);
extern bool_t xdrrdma_setpos(XDR *, uint_t);
/* Chunk list encode/decode */
extern bool_t xdr_clist(XDR *, clist *);
extern bool_t xdr_do_clist(XDR *, clist **);
/* Size queries */
extern uint_t xdr_getbufsize(XDR *);
extern unsigned int xdrrdma_sizeof(xdrproc_t, void *, int, uint_t *, uint_t *);
extern unsigned int xdrrdma_authsize(AUTH *, struct cred *, int);
/* Write list storage and retrieval on an XDR stream */
extern void xdrrdma_store_wlist(XDR *, struct clist *);
extern struct clist *xdrrdma_wclist(XDR *);
/* Write list / reply chunk encode and decode */
extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **);
extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *);
extern bool_t xdr_decode_wlist_svc(XDR *xdrs, struct clist **, bool_t *,
uint32_t *, CONN *);
extern bool_t xdr_encode_rlist_svc(XDR *, clist *);
extern bool_t xdr_encode_wlist(XDR *, clist *);
extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *,
uint32_t seg_array_len);
/*
* The remaining declarations omit "extern"; for function declarations
* that makes no difference to linkage.
*/
bool_t xdrrdma_getrdmablk(XDR *, struct clist **, uint_t *,
CONN **conn, const uint_t);
bool_t xdrrdma_read_from_client(struct clist *, CONN **, uint_t);
bool_t xdrrdma_send_read_data(XDR *, uint_t, struct clist *);
bool_t xdrrdma_free_clist(CONN *, struct clist *);
#endif /* _KERNEL */
#ifdef __cplusplus
}
#endif
#endif /* _RPC_RPC_RDMA_H */
|