summaryrefslogtreecommitdiff
path: root/usr/src/uts/sun4v/sys/vsw_ldc.h
blob: c9a2daba41f4605db13ef1ae85027e5d71a1a942 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This header file contains the basic data structures which the
 * virtual switch (vsw) uses to communicate with vnet clients.
 *
 * The virtual switch reads the machine description (MD) to
 * determine how many port_t structures to create (each port_t
 * can support communications to a single network device). The
 * port_t's are maintained in a linked list.
 *
 * Each port in turn contains a number of logical domain channels
 * (ldc's) which are inter domain communications channels which
 * are used for passing small messages between the domains. There
 * may be any number of channels associated with each port, though
 * currently most devices only have a single channel. The current
 * implementation provides support for only one channel per port.
 *
 * The ldc is a bi-directional channel, which is divided up into
 * two directional 'lanes', one outbound from the switch to the
 * virtual network device, the other inbound to the switch.
 * Depending on the type of device each lane may have separate
 * communication parameters (such as mtu etc).
 *
 * For those network clients which use descriptor rings the
 * rings are associated with the appropriate lane. I.e. rings
 * which the switch exports are associated with the outbound lanes
 * while those which the network clients are exporting to the switch
 * are associated with the inbound lane.
 *
 * In diagram form the data structures look as follows:
 *
 * vsw instance
 *     |
 *     +----->port_t----->port_t----->port_t----->
 *		|
 *		+--->ldc_t
 *		       |
 *		       +--->lane_t (inbound)
 *		       |       |
 *		       |       +--->dring
 *		       |
 *		       +--->lane_t (outbound)
 *			       |
 *			       +--->dring
 *
 */

#ifndef	_VSW_LDC_H
#define	_VSW_LDC_H

#ifdef	__cplusplus
extern "C" {
#endif

/*
 * LDC pkt transfer MTU - largest msg size used
 */
#define	VSW_LDC_MTU		64

/*
 * Number of 64-bit words in a default message buffer:
 * VNET_DRING_REG_EXT_MSG_SIZE_MAX (defined outside this header) divided
 * by sizeof (uint64_t). The division truncates, so this presumably relies
 * on the maximum message size being a multiple of 8 bytes -- TODO confirm.
 */
#define	VSW_DEF_MSG_WORDS	\
	(VNET_DRING_REG_EXT_MSG_SIZE_MAX / sizeof (uint64_t))

/*
 * Default message type.
 *
 * An untyped buffer of VSW_DEF_MSG_WORDS 64-bit words. Declaring the
 * storage as uint64_t gives the buffer 64-bit alignment.
 */
typedef struct def_msg {
	uint64_t	data[VSW_DEF_MSG_WORDS];
} def_msg_t;

/*
 * Currently only support one major/minor pair.
 */
#define	VSW_NUM_VER	1

/*
 * A single supported protocol version, expressed as a major/minor pair.
 */
typedef struct ver_sup {
	uint16_t	ver_major;	/* major version number */
	uint16_t	ver_minor;	/* minor version number */
} ver_sup_t;

/*
 * Lane states.
 *
 * Apart from VSW_LANE_INACTIV (zero), every value below is a distinct
 * single bit, so states can be combined in one word with bitwise OR.
 * For each message class there is an INFO/ACK/NACK flag in both the
 * SENT and RECV direction.
 */
#define	VSW_LANE_INACTIV	0x0	/* No params set for lane */

/* Version messages */
#define	VSW_VER_INFO_SENT	0x1	/* Version # sent to peer */
#define	VSW_VER_INFO_RECV	0x2	/* Version # recv from peer */
#define	VSW_VER_ACK_RECV	0x4
#define	VSW_VER_ACK_SENT	0x8
#define	VSW_VER_NACK_RECV	0x10
#define	VSW_VER_NACK_SENT	0x20

/* Attribute messages */
#define	VSW_ATTR_INFO_SENT	0x40	/* Attributes sent to peer */
#define	VSW_ATTR_INFO_RECV	0x80	/* Peer attributes received */
#define	VSW_ATTR_ACK_SENT	0x100
#define	VSW_ATTR_ACK_RECV	0x200
#define	VSW_ATTR_NACK_SENT	0x400
#define	VSW_ATTR_NACK_RECV	0x800

/* Descriptor ring (dring) messages */
#define	VSW_DRING_INFO_SENT	0x1000	/* Dring info sent to peer */
#define	VSW_DRING_INFO_RECV	0x2000	/* Dring info received */
#define	VSW_DRING_ACK_SENT	0x4000
#define	VSW_DRING_ACK_RECV	0x8000
#define	VSW_DRING_NACK_SENT	0x10000
#define	VSW_DRING_NACK_RECV	0x20000

/* RDX messages */
#define	VSW_RDX_INFO_SENT	0x40000	/* RDX sent to peer */
#define	VSW_RDX_INFO_RECV	0x80000	/* RDX received from peer */
#define	VSW_RDX_ACK_SENT	0x100000
#define	VSW_RDX_ACK_RECV	0x200000
#define	VSW_RDX_NACK_SENT	0x400000
#define	VSW_RDX_NACK_RECV	0x800000

/* Multicast (mcst) messages */
#define	VSW_MCST_INFO_SENT	0x1000000
#define	VSW_MCST_INFO_RECV	0x2000000
#define	VSW_MCST_ACK_SENT	0x4000000
#define	VSW_MCST_ACK_RECV	0x8000000
#define	VSW_MCST_NACK_SENT	0x10000000
#define	VSW_MCST_NACK_RECV	0x20000000

#define	VSW_LANE_ACTIVE		0x40000000	/* Lane open to xmit data */

/* Handshake milestones (each value is a distinct single bit) */
#define	VSW_MILESTONE0		0x1	/* ver info exchanged */
#define	VSW_MILESTONE1		0x2	/* attribute exchanged */
#define	VSW_MILESTONE2		0x4	/* dring info exchanged */
#define	VSW_MILESTONE3		0x8	/* rdx exchanged */
#define	VSW_MILESTONE4		0x10	/* handshake complete */

/*
 * Lane direction (relative to ourselves).
 */
#define	INBOUND			0x1
#define	OUTBOUND		0x2

/* Peer session id received */
#define	VSW_PEER_SESSION	0x1

/*
 * Maximum number of consecutive reads of data from channel
 */
#define	VSW_MAX_CHAN_READ	50

/*
 * Currently only support one ldc per port.
 */
#define	VSW_PORT_MAX_LDCS	1	/* max # of ldcs per port */

/*
 * Used for port add/deletion.
 */
#define	VSW_PORT_UPDATED	0x1

/*
 * LDC transmit result codes.
 */
#define	LDC_TX_SUCCESS		0	/* ldc transmit success */
#define	LDC_TX_FAILURE		1	/* ldc transmit failure */
#define	LDC_TX_NORESOURCES	2	/* out of descriptors */

/*
 * Descriptor ring info
 *
 * Each descriptor element has a pre-allocated data buffer
 * associated with it, into which data being transmitted is
 * copied. By pre-allocating we speed up the copying process.
 * The buffer is re-used once the peer has indicated that it is
 * finished with the descriptor.
 */
#define	VSW_RING_EL_DATA_SZ	2048	/* Size of data section (bytes) */
#define	VSW_PRIV_SIZE	sizeof (vnet_private_desc_t)

/*
 * Dimension of the per-descriptor memory cookie array: the number of
 * whole pages in ETHERMTU plus two. ETHERMTU and MMU_PAGESHIFT are
 * defined outside this header.
 */
#define	VSW_MAX_COOKIES		((ETHERMTU >> MMU_PAGESHIFT) + 2)

/*
 * Size of the mblk in each mblk pool.
 */
#define	VSW_MBLK_SZ_128		128
#define	VSW_MBLK_SZ_256		256
#define	VSW_MBLK_SZ_2048	2048

/*
 * Number of mblks in each mblk pool.
 */
#define	VSW_NUM_MBLKS	1024

/*
 * Number of rcv buffers in RxDringData mode
 *
 * Expands to an expression over vsw_num_descriptors and
 * vsw_nrbufs_factor; neither is declared in this header, so both must
 * be in scope wherever the macro is used.
 */
#define	VSW_RXDRING_NRBUFS	(vsw_num_descriptors * vsw_nrbufs_factor)

/*
 * Descriptor ring index helpers.
 *
 * 'dp' is a pointer to a structure with a num_descriptors member and
 * 'i' is a modifiable index variable. Wrap-around is done by masking
 * with (num_descriptors - 1), which only yields a modulo when
 * num_descriptors is a power of two.
 */

/* step index 'i' forward by one, wrapping at the end of the ring */
#define	INCR_DESC_INDEX(dp, i)	\
		((i) = (((dp)->num_descriptors - 1) & ((i) + 1)))

/* step index 'i' back by one, wrapping at the start of the ring */
#define	DECR_DESC_INDEX(dp, i)	\
		((i) = (((dp)->num_descriptors - 1) & ((i) - 1)))

/* non-zero when 'i' lies within [0, num_descriptors) */
#define	CHECK_DESC_INDEX(dp, i)	\
		(((i) >= 0) && ((dp)->num_descriptors > (i)))

/* transmit-side names for the generic helpers */
#define	INCR_TXI	INCR_DESC_INDEX
#define	DECR_TXI	DECR_DESC_INDEX
#define	CHECK_TXI	CHECK_DESC_INDEX

/* receive-side names for the generic helpers */
#define	INCR_RXI	INCR_DESC_INDEX
#define	DECR_RXI	DECR_DESC_INDEX
#define	CHECK_RXI	CHECK_DESC_INDEX

/*
 * Private descriptor
 *
 * Per-descriptor bookkeeping. The field notes marked "presumably" are
 * inferred from names and types only and should be confirmed against
 * the code that uses this structure.
 */
typedef struct vsw_private_desc {
	/*
	 * Below lock must be held when accessing the state of
	 * a descriptor on either the private or public sections
	 * of the ring.
	 */
	kmutex_t		dstate_lock;
	uint64_t		dstate;		/* descriptor state */
	vnet_public_desc_t	*descp;		/* presumably public desc */
	ldc_mem_handle_t	memhandle;	/* LDC memory handle */
	void			*datap;		/* presumably data buffer */
	uint64_t		datalen;	/* presumably buffer length */
	uint64_t		ncookies;	/* presumably # valid cookies */
	/* room for at most VSW_MAX_COOKIES cookies */
	ldc_mem_cookie_t	memcookie[VSW_MAX_COOKIES];
	int			bound;		/* presumably memhandle bound */
} vsw_private_desc_t;

/*
 * Descriptor ring structure
 *
 * Describes one descriptor ring: its geometry (num_descriptors,
 * descriptor_size), its LDC handles and cookies, the tx/rx indices and
 * the base addresses of the public, private and data sections.
 * Three mutexes are embedded: dlock, txlock and restart_lock.
 */
typedef struct dring_info {
	kmutex_t		dlock;		/* sync access */
	uint32_t		num_descriptors; /* # of descriptors */
	uint32_t		descriptor_size; /* size of descriptor */
	uint32_t		options;	/* dring options (mode) */
	ldc_dring_handle_t	dring_handle;	/* dring LDC handle */
	uint32_t		dring_ncookies;	/* # of dring cookies */
	ldc_mem_cookie_t	dring_cookie[1]; /* LDC cookie of dring */
	ldc_mem_handle_t	data_handle;	/* data area LDC handle */
	uint32_t		data_ncookies;	/* # of data area cookies */
	ldc_mem_cookie_t	*data_cookie;	/* data area LDC cookies */
	uint64_t		ident;		/* identifier sent to peer */
	uint64_t		end_idx;	/* last idx processed */
	int64_t			last_ack_recv;	/* last ack received */
	kmutex_t		txlock;		/* protect tx desc alloc */
	uint32_t		next_txi;	/* next tx descriptor index */
	uint32_t		next_rxi;	/* next expected recv index */
	kmutex_t		restart_lock;	/* protect restart_reqd */
	boolean_t		restart_reqd;	/* send restart msg */
	uint32_t		restart_peer_txi; /* index to restart peer */
	void			*pub_addr;	/* base of public section */
	void			*priv_addr;	/* base of private section */
	void			*data_addr;	/* base of data section */
	size_t			data_sz;	/* size of data section */
	size_t			desc_data_sz;	/* size of descr data blk */
	uint8_t			dring_mtype;	/* dring mem map type */
	uint32_t		num_bufs;	/* # of buffers */
	vio_mblk_pool_t		*rx_vmp;	/* rx mblk pool */
	vio_mblk_t		**rxdp_to_vmp;	/* descr to buf map tbl */
} dring_info_t;

/*
 * Each ldc connection is comprised of two lanes, incoming
 * from a peer, and outgoing to that peer. Each lane shares
 * common ldc parameters and also has private lane-specific
 * parameters.
 *
 * NOTE(review): dringp is commented as a list, but dring_info_t as
 * defined in this header has no link member, so it appears to refer
 * to a single descriptor ring -- confirm.
 */
typedef struct lane {
	uint64_t	lstate;		/* Lane state */
	uint16_t	ver_major;	/* Version major number */
	uint16_t	ver_minor;	/* Version minor number */
	uint64_t	seq_num;	/* Sequence number */
	uint64_t	mtu;		/* ETHERMTU */
	uint64_t	addr;		/* Unique physical address */
	uint8_t		addr_type;	/* Only MAC address at moment */
	uint8_t		xfer_mode;	/* Dring or Pkt based */
	uint8_t		ack_freq;	/* Only non zero for Pkt based xfer */
	uint32_t	physlink_update;	/* physlink updates */
	uint8_t		dring_mode;	/* Descriptor ring mode */
	dring_info_t	*dringp;	/* List of drings for this lane */
} lane_t;

/* channel drain states */
#define	VSW_LDC_INIT		0x1	/* Initial non-drain state */
#define	VSW_LDC_DRAINING	0x2	/* Channel draining */

/*
 * vnet-protocol-version dependent function prototypes.
 *
 * These are the types of the tx, rx_pktdata and rx_dringdata function
 * pointers held in vsw_ldc_t. The arguments are untyped (void *) at
 * this level; their meaning is defined by the functions installed.
 */
typedef int	(*vsw_ldctx_t) (void *, mblk_t *, mblk_t *, uint32_t);
typedef void	(*vsw_ldcrx_pktdata_t) (void *, void *, uint32_t);
typedef void	(*vsw_ldcrx_dringdata_t) (void *, void *);

/*
 * ldc information associated with a vsw-port
 *
 * Holds the channel identity and handle, handshake/session state, the
 * inbound and outbound lanes, the state for the tx, message and
 * receive threads, the version-dependent tx/rx function pointers and
 * the channel statistics.
 */
typedef struct vsw_ldc {
	struct vsw_ldc		*ldc_next;	/* next ldc in the list */
	struct vsw_port		*ldc_port;	/* associated port */
	struct vsw		*ldc_vswp;	/* associated vsw */
	kmutex_t		ldc_cblock;	/* sync callback processing */
	kmutex_t		ldc_txlock;	/* sync transmits */
	kmutex_t		ldc_rxlock;	/* sync rx */
	uint64_t		ldc_id;		/* channel number */
	ldc_handle_t		ldc_handle;	/* channel handle */
	kmutex_t		drain_cv_lock;	/* presumably for drain_cv */
	kcondvar_t		drain_cv;	/* channel draining */
	int			drain_state;	/* presumably VSW_LDC_* */
	uint32_t		hphase;		/* handshake phase */
	int			hcnt;		/* # handshake attempts */
	kmutex_t		status_lock;	/* presumably for ldc_status */
	ldc_status_t		ldc_status;	/* channel status */
	uint8_t			reset_active;	/* reset flag */
	uint64_t		local_session;	/* Our session id */
	uint64_t		peer_session;	/* Our peer's session id */
	uint8_t			session_status;	/* Session recv'd, sent */
	uint32_t		hss_id;		/* Handshake session id */
	uint64_t		next_ident;	/* Next dring ident # to use */
	lane_t			lane_in;	/* Inbound lane */
	lane_t			lane_out;	/* Outbound lane */
	uint8_t			dev_class;	/* Peer device class */
	boolean_t		pls_negotiated;	/* phys link state update ? */
	vio_multi_pool_t	vmp;		/* Receive mblk pools */
	uint32_t		max_rxpool_size; /* max size of rxpool in use */
	uint64_t		*ldcmsg;	/* msg buffer for ldc_read() */
	uint64_t		msglen;		/* size of ldcmsg */
	uint32_t		dringdata_msgid; /* msgid in RxDringData mode */

	/* tx thread fields */
	kthread_t		*tx_thread;	/* tx thread */
	uint32_t		tx_thr_flags;	/* tx thread flags */
	kmutex_t		tx_thr_lock;	/* lock for tx thread */
	kcondvar_t		tx_thr_cv;	/* cond.var for tx thread */
	mblk_t			*tx_mhead;	/* tx mblks head */
	mblk_t			*tx_mtail;	/* tx mblks tail */
	uint32_t		tx_cnt;		/* # of pkts queued for tx */

	/* message thread fields */
	kthread_t		*msg_thread;	/* message thread */
	uint32_t		msg_thr_flags;	/* message thread flags */
	kmutex_t		msg_thr_lock;	/* lock for message thread */
	kcondvar_t		msg_thr_cv;	/* cond.var for msg thread */

	/* receive thread fields */
	kthread_t		*rcv_thread;	/* receive thread */
	uint32_t		rcv_thr_flags;	/* receive thread flags */
	kmutex_t		rcv_thr_lock;	/* lock for receive thread */
	kcondvar_t		rcv_thr_cv;	/* cond.var for recv thread */

	/* vnet-protocol-version dependent entry points */
	vsw_ldctx_t		tx;		/* transmit function */
	vsw_ldcrx_pktdata_t	rx_pktdata;	/* process raw data msg */
	vsw_ldcrx_dringdata_t	rx_dringdata;	/* process dring data msg */

	/* channel statistics */
	vgen_stats_t		ldc_stats;	/* channel statistics */
	kstat_t			*ksp;		/* channel kstats */
} vsw_ldc_t;

/*
 * worker thread flags (distinct bits; presumably the values kept in the
 * *_thr_flags members of vsw_ldc_t -- TODO confirm)
 */
#define	VSW_WTHR_DATARCVD 	0x01	/* data received */
#define	VSW_WTHR_STOP 		0x02	/* stop worker thread request */

/*
 * multicast addresses port is interested in
 *
 * Entries are chained into a singly linked list through nextp.
 */
typedef struct mcst_addr {
	struct mcst_addr	*nextp;	/* next entry in the list */
	struct ether_addr	mca;	/* multicast address */
	uint64_t		addr;	/* mcast addr converted to hash key */
	boolean_t		mac_added; /* added into physical device */
} mcst_addr_t;

/* Port detach states */
#define	VSW_PORT_INIT		0x1	/* Initial non-detach state */
#define	VSW_PORT_DETACHING	0x2	/* In process of being detached */
#define	VSW_PORT_DETACHABLE	0x4	/* Safe to detach */

/*
 * port information associated with a vsw
 *
 * Ports are chained through p_next. Each port carries its ldc, its MAC
 * client handles, its multicast address list, its VLAN configuration
 * and its HybridIO and bandwidth settings.
 */
typedef struct vsw_port {
	int			p_instance;	/* port instance */
	struct vsw_port		*p_next;	/* next port in the list */
	struct vsw		*p_vswp;	/* associated vsw */
	int			num_ldcs;	/* # of ldcs in the port */
	uint64_t		*ldc_ids;	/* ldc ids */
	vsw_ldc_t		*ldcp;		/* ldc for this port */

	kmutex_t		tx_lock;	/* transmit lock */
	/* per-port transmit entry point */
	int			(*transmit)(vsw_ldc_t *, mblk_t *);

	/* state is presumably one of the VSW_PORT_* detach states above */
	int			state;		/* port state */
	kmutex_t		state_lock;
	kcondvar_t		state_cv;

	krwlock_t		maccl_rwlock;	/* protect fields below */
	mac_client_handle_t	p_mch;		/* mac client handle */
	mac_unicast_handle_t	p_muh;		/* mac unicast handle */

	kmutex_t		mca_lock;	/* multicast lock */
	mcst_addr_t		*mcap;		/* list of multicast addrs */

	boolean_t		addr_set;	/* Addr set where */

	/*
	 * mac address of the port & connected device
	 */
	struct ether_addr	p_macaddr;
	uint16_t		pvid;	/* port vlan id (untagged) */
	struct vsw_vlanid	*vids;	/* vlan ids (tagged) */
	uint16_t		nvids;	/* # of vids */
	mod_hash_t		*vlan_hashp;	/* vlan hash table */
	uint32_t		vlan_nchains;	/* # of vlan hash chains */

	/* HybridIO related info */
	uint32_t		p_hio_enabled;	/* Hybrid mode enabled? */
	uint32_t		p_hio_capable;	/* Port capable of HIO */

	/* bandwidth limit */
	uint64_t		p_bandwidth;	/* bandwidth limit */
} vsw_port_t;

/*
 * list of ports per vsw
 *
 * The list is singly linked through vsw_port_t.p_next, starting at head.
 */
typedef struct vsw_port_list {
	vsw_port_t	*head;		/* head of the list */
	krwlock_t	lockrw;		/* sync access(rw) to the list */
	int		num_ports;	/* number of ports in the list */
} vsw_port_list_t;

/*
 * Taskq control message
 *
 * The message is embedded by value (pktp is a def_msg_t, not a pointer)
 * together with the channel it belongs to and a handshake session id.
 */
typedef struct vsw_ctrl_task {
	vsw_ldc_t	*ldcp;		/* channel the message belongs to */
	def_msg_t	pktp;		/* the message itself (by value) */
	uint32_t	hss_id;		/* handshake session id */
} vsw_ctrl_task_t;

/*
 * State of connection to peer. Some of these states
 * can be mapped to LDC events as follows:
 *
 * VSW_CONN_RESET -> LDC_RESET_EVT
 * VSW_CONN_UP    -> LDC_UP_EVT
 */
#define	VSW_CONN_UP		0x1	/* Connection has come up */
#define	VSW_CONN_RESET		0x2	/* Connection reset */
#define	VSW_CONN_RESTART	0x4	/* Restarting handshake on connection */

/*
 * A connection event paired with the channel it applies to.
 */
typedef struct vsw_conn_evt {
	uint16_t	evt;		/* Connection event */
	vsw_ldc_t	*ldcp;		/* channel the event applies to */
} vsw_conn_evt_t;

/*
 * Ethernet broadcast address definition.
 *
 * All six octets are 0xff. The object is defined 'static' in this
 * header, so every source file that includes the header gets its own
 * private, writable copy. IS_BROADCAST() compares against it.
 */
static	struct	ether_addr	etherbroadcastaddr = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};

/*
 * Destination address classification for an ethernet header.
 *
 * 'ehp' is a pointer to a structure with an ether_dhost member (an
 * ethernet address). The parameter is parenthesised in the expansion so
 * that any pointer-valued expression (e.g. 'hdrs + 1') is accepted, not
 * just a plain identifier.
 *
 * IS_BROADCAST: non-zero when the first ETHERADDRL bytes of ether_dhost
 *               equal etherbroadcastaddr.
 * IS_MULTICAST: non-zero when the low-order bit of the first octet of
 *               ether_dhost is set. The broadcast address also
 *               satisfies this test.
 */
#define	IS_BROADCAST(ehp) \
	(bcmp(&(ehp)->ether_dhost, &etherbroadcastaddr, ETHERADDRL) == 0)
#define	IS_MULTICAST(ehp) \
	(((ehp)->ether_dhost.ether_addr_octet[0] & 01) == 1)

/*
 * Shorthand for the reader/writer lock calls; 'x' is passed straight
 * through as the lock argument.
 */
#define	READ_ENTER(x)	rw_enter((x), RW_READER)	/* enter as reader */
#define	WRITE_ENTER(x)	rw_enter((x), RW_WRITER)	/* enter as writer */
#define	RW_EXIT(x)	rw_exit((x))			/* leave the lock */

/*
 * Atomically increment / decrement the 32-bit ref_cnt member of the
 * structure that 'portp' points to.
 *
 * NOTE(review): vsw_port_t as defined in this header has no ref_cnt
 * member, so these macros will not compile when applied to a
 * vsw_port_t *. Confirm whether they are dead code or whether the
 * member is missing from the structure.
 */
#define	VSW_PORT_REFHOLD(portp)	atomic_inc_32(&((portp)->ref_cnt))
#define	VSW_PORT_REFRELE(portp)	atomic_dec_32(&((portp)->ref_cnt))

#ifdef	__cplusplus
}
#endif

#endif	/* _VSW_LDC_H */