1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
|
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
* Copyright 2019, Joyent, Inc.
*/
#ifndef _SYS_IB_EOIB_EIB_IMPL_H
#define _SYS_IB_EOIB_EIB_IMPL_H
#ifdef __cplusplus
extern "C" {
#endif
#include <sys/ddi.h>
#include <sys/mac.h>
#include <sys/sunddi.h>
#include <sys/varargs.h>
#include <sys/vlan.h>
#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/ibtl/ibvti.h>
#include <sys/ib/ib_pkt_hdrs.h>
#include <sys/ib/clients/eoib/fip.h>
#include <sys/ib/clients/eoib/eib.h>
/*
* Driver specific constants
*
* EIB_E_SUCCESS and EIB_E_FAILURE are the driver's status codes (by
* name; presumably what the int-returning eib_*() routines return).
* The negative value is parenthesized so that it always expands as a
* single operand, whatever operators surround the macro at the point
* of use.
*
* EIB_GRH_SZ is the size of ib_grh_t as reported by sizeof.
*/
#define EIB_E_SUCCESS 0
#define EIB_E_FAILURE (-1)
#define EIB_MAX_LINE 128
#define EIB_MAX_SGL 59
#define EIB_MAX_POST_MULTIPLE 4
#define EIB_MAX_PAYLOAD_HDR_SZ 160
#define EIB_TX_COPY_THRESH 4096 /* greater than mtu */
#define EIB_MAX_VNICS 64 /* do not change this */
#define EIB_LOGIN_TIMEOUT_USEC 8000000
#define EIB_RWR_CHUNK_SZ 8
#define EIB_IPHDR_ALIGN_ROOM 32
#define EIB_IP_HDR_ALIGN 2
#define EIB_MAX_RX_PKTS_ONINTR 0x800
#define EIB_MAX_LOGIN_ATTEMPTS 3
#define EIB_MAX_VHUB_TBL_ATTEMPTS 3
#define EIB_MAX_KA_ATTEMPTS 3
#define EIB_MAX_ATTEMPTS 10
#define EIB_DELAY_HALF_SECOND 500000
#define EIB_GRH_SZ (sizeof (ib_grh_t))
/*
* Debug messages
*
* The EIB_MSGS_* values are distinct single-bit flags, so they can be
* OR-ed together into a mask. EIB_MSGS_DEFAULT selects the critical,
* error and warning classes. EIB_LOGSZ_DEFAULT is 0x20000 (presumably
* the default size of the debug log buffer in bytes -- confirm against
* the debug implementation).
*
* EIB_DPRINTF_CRIT/ERR/WARN always map to the eib_dprintf_*() routines.
* The DEBUG/ARGS/PKT/VERBOSE variants map to their routines only when
* EIB_DEBUG is defined; otherwise they expand to the no-op expression
* (void)(0) and their arguments are dropped by the preprocessor, so
* they are never evaluated.
*/
#define EIB_MSGS_CRIT 0x01
#define EIB_MSGS_ERR 0x02
#define EIB_MSGS_WARN 0x04
#define EIB_MSGS_DEBUG 0x08
#define EIB_MSGS_ARGS 0x10
#define EIB_MSGS_PKT 0x20
#define EIB_MSGS_VERBOSE 0x40
#define EIB_MSGS_DEFAULT (EIB_MSGS_CRIT | EIB_MSGS_ERR | EIB_MSGS_WARN)
#define EIB_LOGSZ_DEFAULT 0x20000
#define EIB_DPRINTF_CRIT eib_dprintf_crit
#define EIB_DPRINTF_ERR eib_dprintf_err
#define EIB_DPRINTF_WARN eib_dprintf_warn
#ifdef EIB_DEBUG
#define EIB_DPRINTF_DEBUG eib_dprintf_debug
#define EIB_DPRINTF_ARGS eib_dprintf_args
#define EIB_DPRINTF_PKT eib_dprintf_pkt
#define EIB_DPRINTF_VERBOSE eib_dprintf_verbose
#else
#define EIB_DPRINTF_DEBUG(...) (void)(0)
#define EIB_DPRINTF_ARGS(...) (void)(0)
#define EIB_DPRINTF_PKT(...) (void)(0)
#define EIB_DPRINTF_VERBOSE(...) (void)(0)
#endif
/*
* EoIB threads to provide various services
*
* String constants, one per service thread (presumably used as the
* thread names when the threads are created -- confirm at the call
* sites). NOTE(review): eib_t also records a keepalives manager thread
* id, but no name string for it is defined here.
*/
#define EIB_EVENTS_HDLR "eib_events_handler"
#define EIB_RWQES_REFILLER "eib_rwqes_refiller"
#define EIB_VNIC_CREATOR "eib_vnic_creator"
#define EIB_TXWQES_MONITOR "eib_txwqe_monitor"
#define EIB_LSOBUFS_MONITOR "eib_lsobufs_monitor"
/*
* Look up the position of the lowest set bit of a 64-bit unsigned value.
*
* (v & -v) isolates the lowest set bit; the isolated bit, reduced
* modulo 67, is used as the index into the eib_setbit_mod67[] table.
* The argument is expanded twice, so it must not have side effects.
*/
#define EIB_FIND_LSB_SET(val64) eib_setbit_mod67[(((val64) & -(val64)) % 67)]
/*
* LSO buffers
*
* Under normal circumstances we should never need to use any buffer
* that's larger than MTU. Unfortunately, IB HCA has limitations
* on the length of SGL that are much smaller than those for regular
* ethernet NICs. Since the network layer doesn't care to limit the
* number of mblk fragments in any send mp chain, we end up having to
* use these larger buffers occasionally.
*
* With the values below, EIB_LSO_NUM_BUFS buffers of EIB_LSO_BUFSZ
* bytes each add up to 8MB, and EIB_LSO_FREE_BUFS_THRESH evaluates to
* 1024 >> 5 = 32 buffers (presumably the free-buffer count that the
* LSO monitor keys off -- confirm in the resource code).
*/
#define EIB_LSO_MAXLEN 65536
#define EIB_LSO_BUFSZ 8192
#define EIB_LSO_NUM_BUFS 1024
#define EIB_LSO_FREE_BUFS_THRESH (EIB_LSO_NUM_BUFS >> 5)
/*
* Descriptor for a single LSO buffer. Descriptors are chained through
* lb_next; lb_buf points at the buffer memory and lb_isfree is
* presumably non-zero while the buffer is not in use (confirm against
* the grab/return routines).
*/
typedef struct eib_lsobuf_s {
struct eib_lsobuf_s *lb_next;
uint8_t *lb_buf;
int lb_isfree;
} eib_lsobuf_t;
/*
* LSO buffer bucket: a mutex/condvar pair, a status word, the backing
* memory pointer (bk_mem) with its memory region handle and lkey, the
* descriptor array (bk_bufl), a free-list head and element/free counts.
* NOTE(review): the member roles are inferred from names and types;
* presumably bk_lock protects every other member -- confirm.
*
* EIB_LBUF_SHORT and EIB_LBUF_MONITOR_DIE are distinct bits
* (presumably the values kept in bk_status).
*/
typedef struct eib_lsobkt_s {
kmutex_t bk_lock;
kcondvar_t bk_cv;
uint_t bk_status;
uint8_t *bk_mem;
eib_lsobuf_t *bk_bufl;
eib_lsobuf_t *bk_free_head;
ibt_mr_hdl_t bk_mr_hdl;
ibt_lkey_t bk_lkey;
uint_t bk_nelem;
uint_t bk_nfree;
} eib_lsobkt_t;
#define EIB_LBUF_SHORT 0x1
#define EIB_LBUF_MONITOR_DIE 0x2
/*
* The admin partition is only used for sending login and logout messages
* and receiving login acknowledgements from the gateway. While packets
* going out on several vlans at the same time could result in multiple
* vnic creations happening at the same time (and therefore multiple login
* packets), we serialize the vnic creation via the vnic creator thread, so
* we shouldn't need a lot of send wqes or receive wqes. Note also that we
* keep the cq size request to slightly less than a 2^n boundary to allow
* the alloc cq routine to return the closest 2^n boundary as the real cq
* size without wasting too much memory.
*
* With the values below the requested cq size is 30 + 30 + 1 = 61,
* i.e. just under 64.
*/
#define EIB_ADMIN_MAX_SWQE 30
#define EIB_ADMIN_MAX_RWQE 30
#define EIB_ADMIN_CQ_SIZE (EIB_ADMIN_MAX_SWQE + EIB_ADMIN_MAX_RWQE + 1)
/*
* The control qp is per vhub partition, and is used to send and receive
* vhub control messages such as vhub table request/response, vhub
* update response and vnic alive messages. While the vhub table response
* and vhub update messages might take a few rwqes, the vhub table request
* is made only once per vnic, and the vnic alive message is periodic
* and uses a single swqe as well. Per vnic, we should certainly not need
* too many swqes/rwqes.
*
* The requested cq size works out to 30 + 30 + 1 = 61, the same as
* for the admin qp.
*/
#define EIB_CTL_MAX_SWQE 30
#define EIB_CTL_MAX_RWQE 30
#define EIB_CTL_CQ_SIZE (EIB_CTL_MAX_SWQE + EIB_CTL_MAX_RWQE + 1)
/*
* For the vNIC's data channel, there are three items that are of importance:
* the constraints defined below, the hca_max_chan_sz attribute and the value of
* (hca_max_cq_sz - 1). The maximum limit on swqe/rwqe is set to the minimum
* of these three values.
*
* While the total number of RWQEs posted to the data channel of any vNIC will
* not exceed EIB_DATA_MAX_RWQE, we also do not want to acquire and post all of
* it during the data channel initialization, since that is a lot of wqes for
* one vnic to consume when we don't even know if the vnic will need it at all.
* We post an initial set of EIB_DATA_RWQE_BKT rwqes, and slowly post more and
* more sets as we see them being consumed, until we hit the hard limit of
* EIB_DATA_MAX_RWQE.
*
* Note that 4000 is not a multiple of 512, so the final set posted
* before reaching the hard limit cannot be a full bucket.
*/
#define EIB_DATA_MAX_SWQE 4000
#define EIB_DATA_MAX_RWQE 4000
#define EIB_DATA_RWQE_BKT 512
/*
* vNIC data channel CQ moderation parameters
*
* One completion-count / microsecond pair each for the tx and rx
* completion queues (presumably the count and usec arguments handed to
* the IBTF cq moderation call -- confirm at the call site).
*/
#define EIB_TX_COMP_COUNT 10
#define EIB_TX_COMP_USEC 300
#define EIB_RX_COMP_COUNT 4
#define EIB_RX_COMP_USEC 10
/*
* Layout of qe_info: four 8-bit fields packed as blk:ndx:type:flags,
* with blk in the most significant byte and flags in the least.
* Each field is described by the shift that brings it down to bit 0
* and the mask that is applied after the shift.
*/
#define EIB_WQEBLK_SHIFT 24
#define EIB_WQEBLK_MASK 0xFF
#define EIB_WQENDX_SHIFT 16
#define EIB_WQENDX_MASK 0xFF
#define EIB_WQETYP_SHIFT 8
#define EIB_WQETYP_MASK 0xFF
#define EIB_WQEFLGS_SHIFT 0
#define EIB_WQEFLGS_MASK 0xFF
/*
* Field extractors for qe_info: shift the field down, then mask it.
*/
#define EIB_WQE_BLK(info) (EIB_WQEBLK_MASK & ((info) >> EIB_WQEBLK_SHIFT))
#define EIB_WQE_NDX(info) (EIB_WQENDX_MASK & ((info) >> EIB_WQENDX_SHIFT))
#define EIB_WQE_TYPE(info) (EIB_WQETYP_MASK & ((info) >> EIB_WQETYP_SHIFT))
#define EIB_WQE_FLAGS(info) (EIB_WQEFLGS_MASK & ((info) >> EIB_WQEFLGS_SHIFT))
/*
* Values for type and flags in qe_info
*
* EIB_WQE_TX/EIB_WQE_RX are the values of the 8-bit type field.
*/
#define EIB_WQE_TX 0x1
#define EIB_WQE_RX 0x2
/*
* Flags for rx wqes/buffers
*
* Distinct bits of the 8-bit flags field. By name: the wqe is posted
* to the HCA, or its buffer is currently with the network layer
* (presumably loaned upstream -- confirm in the rx path).
*/
#define EIB_WQE_FLG_POSTED_TO_HCA 0x1
#define EIB_WQE_FLG_WITH_NW 0x2
/*
* Flags for tx wqes/buffers
*
* These use different bits (0x4, 0x8) from the rx flags above, so the
* two sets never collide in the flags field.
*/
#define EIB_WQE_FLG_BUFTYPE_LSO 0x4
#define EIB_WQE_FLG_BUFTYPE_MAPPED 0x8
/*
* Send/Recv workq entries
*
* One structure is used for both send and receive work requests.
* qe_pool points back at the owning wqe pool and qe_chan at a channel.
* qe_info is the packed blk:ndx:type:flags word decoded by the
* EIB_WQE_BLK/NDX/TYPE/FLAGS macros. qe_sgl is a single scatter/gather
* element and qe_big_sgl an array of EIB_MAX_SGL elements.
* NOTE(review): presumably qe_cpbuf is the copy buffer, qe_sgl is used
* for the copy-buffer case and qe_big_sgl for mapped/LSO sends, and
* qe_nxt_post chains wqes waiting to be posted -- confirm against the
* data path.
*/
typedef struct eib_wqe_s {
struct eib_wqe_pool_s *qe_pool;
uint8_t *qe_cpbuf;
uint8_t *qe_payload_hdr;
uint_t qe_bufsz;
uint_t qe_info;
int qe_vnic_inst;
ibt_ud_dest_hdl_t qe_dest;
frtn_t qe_frp;
mblk_t *qe_mp;
ibt_mi_hdl_t qe_iov_hdl;
ibt_all_wr_t qe_wr;
ibt_wr_ds_t qe_sgl;
ibt_wr_ds_t qe_big_sgl[EIB_MAX_SGL];
struct eib_wqe_s *qe_nxt_post;
struct eib_chan_s *qe_chan;
} eib_wqe_t;
/*
* The wqe in-use/free status in EoIB is managed via a 2-level bitmap
* logic.
*
* Each set of 64 wqes (a "wqe block") is managed by a single 64-bit
* integer bitmap. The free status of a set of 64 such wqe blocks (a
* "wqe pool") is managed by one 64-bit integer bitmap (if any wqe in
* the wqe block is free, the bit in the map is 1, otherwise it is 0).
*
* The maximum pool size is 4096 wqes, but this can easily be extended
* to support more wqes using additional pools of wqes.
*
* Note that an entire pool of wqes is allocated via a single allocation,
* the wqe addresses in a pool are all contiguous. The tx/rx copy buffers
* for a wqe pool are also allocated via a single allocation.
*
* EIB_WQES_PER_POOL is 64 * 64 = 4096. Both factors are tied to the
* width of the 64-bit bitmaps described above.
*/
#define EIB_BLKS_PER_POOL 64
#define EIB_WQES_PER_BLK 64 /* do not change this */
#define EIB_WQES_PER_POOL (EIB_BLKS_PER_POOL * EIB_WQES_PER_BLK)
#define EIB_WQE_SZ (sizeof (eib_wqe_t))
#define EIB_WQEBLK_SZ (EIB_WQES_PER_BLK * EIB_WQE_SZ)
/*
* A pool of wqes. wp_free_blks is a single 64-bit map and wp_free_wqes
* holds one 64-bit map per block (EIB_BLKS_PER_POOL of them); wp_wqe
* points at the wqe array. wp_vaddr/wp_memsz/wp_mr/wp_lkey describe a
* registered memory area (presumably the pool's copy buffers --
* confirm). NOTE(review): presumably wp_lock protects the status,
* counts and bitmaps that follow it -- confirm.
*/
typedef struct eib_wqe_pool_s {
struct eib_wqe_pool_s *wp_next;
struct eib_s *wp_ss;
ib_vaddr_t wp_vaddr;
ib_memlen_t wp_memsz;
ibt_mr_hdl_t wp_mr;
ibt_lkey_t wp_lkey;
uint_t wp_nfree_lwm;
int wp_type;
kmutex_t wp_lock;
kcondvar_t wp_cv;
uint_t wp_status;
uint_t wp_nfree;
uint64_t wp_free_blks;
uint64_t wp_free_wqes[EIB_BLKS_PER_POOL];
struct eib_wqe_s *wp_wqe;
} eib_wqe_pool_t;
/*
* Values for wp_type
*/
#define EIB_WP_TYPE_TX 0x1
#define EIB_WP_TYPE_RX 0x2
/*
* Values for wp_status (bit fields)
*
* EIB_TXWQE_SHORT and EIB_RXWQE_SHORT share the value 0x1; each one is
* only meaningful for its own kind of pool, as the per-line comments
* note.
*/
#define EIB_TXWQE_SHORT 0x1 /* only for tx wqe pool */
#define EIB_TXWQE_MONITOR_DIE 0x2 /* only for tx wqe pool */
#define EIB_RXWQE_SHORT 0x1 /* only for rx wqe pool */
/*
* The low-water-mark is an indication of when wqe grabs for low-priority
* qps should start to get refused (swqe grabs for control messages such
* as keepalives and rwqe grabs for posting back to control qps will still
* be allowed). The high-water-mark is an indication of when normal
* behavior should resume.
*
* With EIB_WQES_PER_POOL at 4096 these evaluate (integer division) to
* 64 and 128 for the swqe marks, and 409 and 819 for the rwqe marks.
*/
#define EIB_NFREE_SWQES_LWM (EIB_WQES_PER_POOL / 64) /* 1/64 */
#define EIB_NFREE_SWQES_HWM (EIB_WQES_PER_POOL / 32) /* 1/32 */
#define EIB_NFREE_RWQES_LWM (EIB_WQES_PER_POOL / 10) /* 10% */
#define EIB_NFREE_RWQES_HWM (EIB_WQES_PER_POOL / 5) /* 20% */
/*
* The "rwqes low" is used to determine when we should start using allocb()
* to copy and send received mblks in the rx path. It should be a little
* above the rwqes low-water-mark, but less than the high-water-mark.
*
* It is the midpoint of the two rwqe marks: (409 + 819) / 2 = 614.
*/
#define EIB_NFREE_RWQES_LOW \
((EIB_NFREE_RWQES_LWM + EIB_NFREE_RWQES_HWM) / 2)
#define EIB_WPRI_HI 1 /* for keepalive posts */
#define EIB_WPRI_LO 2 /* for all other posts */
/*
* Multicast GID Layout: the multicast gid is specified in big-endian
* representation, as a collection of different-sized fields in the
* EoIB specification. On Solaris, the multicast gid is represented
* as a collection of two 8-byte fields (in ib_gid_t).
*
* eib_mgid_spec_t lays the gid out as byte-sized fields only, so it
* contains no multi-byte integers whose byte order could vary.
* NOTE(review): for the union below to overlay cleanly, the field
* sizes must add up to sizeof (ib_gid_t); the two FIP_* lengths come
* from fip.h -- confirm.
*/
typedef struct eib_mgid_spec_s {
uint8_t sp_mgid_prefix[FIP_MGID_PREFIX_LEN];
uint8_t sp_type;
uint8_t sp_dmac[ETHERADDRL];
uint8_t sp_rss_hash;
uint8_t sp_vhub_id[FIP_VHUBID_LEN];
} eib_mgid_spec_t;
/*
* Values for sp_type in mgid as per EoIB specification
*/
#define EIB_MGID_VHUB_DATA 0x0
#define EIB_MGID_VHUB_UPDATE 0x2
#define EIB_MGID_VHUB_TABLE 0x3
/*
* The same gid storage viewed either as the EoIB field layout
* (gd_spec) or as the native ib_gid_t (gd_sol).
*/
typedef union eib_mgid_s {
eib_mgid_spec_t gd_spec;
ib_gid_t gd_sol;
} eib_mgid_t;
/*
* Gateway properties handed over to us by the EoIB nexus
*
* pp_gw_lock is the mutex embedded with the properties (presumably it
* protects all of the members that follow -- confirm). The three name
* and vendor members are pointers, so their storage lives outside this
* structure. The two *_ticks members are clock_t values derived from
* the periods, as the per-line comments describe.
*/
typedef struct eib_gw_props_s {
kmutex_t pp_gw_lock;
ib_guid_t pp_gw_system_guid;
ib_guid_t pp_gw_guid;
ib_sn_prefix_t pp_gw_sn_prefix;
uint_t pp_gw_adv_period;
uint_t pp_gw_ka_period;
uint_t pp_vnic_ka_period;
ib_qpn_t pp_gw_ctrl_qpn;
ib_lid_t pp_gw_lid;
uint16_t pp_gw_portid;
uint16_t pp_gw_num_net_vnics;
uint8_t pp_gw_flag_available;
uint8_t pp_gw_is_host_adm_vnics;
uint8_t pp_gw_sl;
uint8_t pp_gw_n_rss_qpn;
uint8_t *pp_gw_system_name;
uint8_t *pp_gw_port_name;
uint8_t *pp_gw_vendor_id;
clock_t pp_gw_ka_ticks; /* 2.5 x gw_ka_period */
clock_t pp_vnic_ka_ticks; /* vnic_ka_period */
} eib_gw_props_t;
/*
* Port-specific properties
*
* By name: interface speed, HCA guid, port number, source gid, base
* lid, mtu and static rate of the local port.
* NOTE(review): the units of ep_ifspeed and ep_mtu are not visible
* here -- confirm where they are filled in.
*/
typedef struct eib_props_s {
uint64_t ep_ifspeed;
ib_guid_t ep_hca_guid;
uint8_t ep_port_num;
ib_gid_t ep_sgid;
ib_lid_t ep_blid;
uint16_t ep_mtu;
ibt_srate_t ep_srate;
} eib_props_t;
/*
* Capabilities derived from HCA attributes
*
* By name: LSO maximum length, checksum offload flags, reserved-lkey
* capability and value, the swqe/rwqe limits and the SGL limits
* (presumably filled in by eib_ibt_record_capab(), which takes an
* eib_caps_t * -- confirm).
*/
typedef struct eib_caps_s {
uint_t cp_lso_maxlen;
uint32_t cp_cksum_flags;
int cp_resv_lkey_capab;
ibt_lkey_t cp_resv_lkey;
uint_t cp_max_swqe;
uint_t cp_max_rwqe;
uint_t cp_max_sgl;
uint_t cp_hiwm_sgl;
} eib_caps_t;
/*
* List of multicast groups the vnic joined
*
* Singly linked through mg_next. Each entry carries two gids
* (mg_rgid, mg_mgid), a join state, a MAC address and a pointer to an
* ibt_mcg_info_t.
* NOTE(review): presumably mg_rgid is the gid the join was requested
* with and mg_mgid the group's gid -- confirm in the join code.
*/
typedef struct eib_mcg_s {
struct eib_mcg_s *mg_next;
ib_gid_t mg_rgid;
ib_gid_t mg_mgid;
uint8_t mg_join_state;
uint8_t mg_mac[ETHERADDRL];
ibt_mcg_info_t *mg_mcginfo;
} eib_mcg_t;
/*
* Admin/control/data channel information
*
* The same structure describes all three kinds of channel. It holds
* the channel handle and qpn, two completion queues with their work
* completion arrays (ch_cq_* and ch_rcv_cq_*), per-channel wqe limits,
* and four lock/state groups: pkey (ch_pkey_lock), cep state
* (ch_cep_lock/ch_cep_cv), tx posting (ch_tx_lock/ch_tx_cv) and rx
* posting (ch_rx_lock/ch_rx_cv). ch_vhub_lock is followed by three
* eib_mcg_s list pointers, and ch_rxpost_next links channels together.
* NOTE(review): which members each lock protects is inferred from
* naming and member order only -- confirm against the users.
*/
typedef struct eib_chan_s {
ibt_channel_hdl_t ch_chan;
ib_qpn_t ch_qpn;
ibt_wc_t *ch_wc;
ibt_cq_hdl_t ch_cq_hdl;
uint_t ch_cq_sz;
ibt_wc_t *ch_rcv_wc;
ibt_cq_hdl_t ch_rcv_cq_hdl;
uint_t ch_rcv_cq_sz;
int ch_vnic_inst;
uint_t ch_max_swqes;
uint_t ch_max_rwqes;
uint_t ch_lwm_rwqes;
uint_t ch_rwqe_bktsz;
uint_t ch_ip_hdr_align;
boolean_t ch_alloc_mp;
boolean_t ch_tear_down;
kmutex_t ch_pkey_lock;
ib_pkey_t ch_pkey;
uint16_t ch_pkey_ix;
kmutex_t ch_cep_lock;
kcondvar_t ch_cep_cv;
ibt_cep_state_t ch_cep_state;
kmutex_t ch_tx_lock;
kcondvar_t ch_tx_cv;
uint_t ch_tx_posted;
boolean_t ch_tx_busy;
struct eib_wqe_s *ch_tx;
struct eib_wqe_s *ch_tx_tail;
kmutex_t ch_rx_lock;
kcondvar_t ch_rx_cv;
uint_t ch_rx_posted;
boolean_t ch_rx_refilling;
kmutex_t ch_vhub_lock;
struct eib_mcg_s *ch_vhub_table;
struct eib_mcg_s *ch_vhub_update;
struct eib_mcg_s *ch_vhub_data;
struct eib_chan_s *ch_rxpost_next;
} eib_chan_t;
/*
* States for vNIC state machine during login
*
* Plain enumerated values 0 through 10 (not bit flags), presumably the
* values stored in the vnic's vn_state -- confirm. By name they cover
* the login ack exchange, the vhub table exchange, the terminal
* outcomes (done, timed out) and logout.
*/
#define EIB_LOGIN_INIT 0
#define EIB_LOGIN_ACK_WAIT 1
#define EIB_LOGIN_ACK_RCVD 2
#define EIB_LOGIN_NACK_RCVD 3
#define EIB_LOGIN_TBL_WAIT 4
#define EIB_LOGIN_TBL_INPROG 5
#define EIB_LOGIN_TBL_DONE 6
#define EIB_LOGIN_TBL_FAILED 7
#define EIB_LOGIN_DONE 8
#define EIB_LOGIN_TIMED_OUT 9
#define EIB_LOGOUT_DONE 10
/*
* Login data for a vnic: gateway identity (guid, lid, port id, data
* and control qpns, sl), the vnic id, and the vhub parameters (mtu,
* pkey, id, assigned vlan and mac, mgid prefix). It is the type that
* eib_fip_parse_login_ack() fills in and eib_vnic_login_ack() consumes,
* per their prototypes.
* NOTE(review): the meaning of ld_syndrome and ld_vlan_in_packets is
* not visible in this header -- confirm in the FIP code.
*/
typedef struct eib_login_data_s {
ib_guid_t ld_gw_guid;
ib_lid_t ld_gw_lid;
uint_t ld_syndrome;
uint16_t ld_gw_port_id;
ib_qpn_t ld_gw_data_qpn;
ib_qpn_t ld_gw_ctl_qpn;
uint16_t ld_vnic_id; /* includes set msbit */
uint16_t ld_vhub_mtu;
uint16_t ld_vhub_pkey;
uint16_t ld_assigned_vlan;
uint8_t ld_gw_sl;
uint8_t ld_n_rss_mcgid;
uint8_t ld_n_mac_mcgid;
uint8_t ld_vnic_name[FIP_VNIC_NAME_LEN];
uint8_t ld_assigned_mac[ETHERADDRL];
uint8_t ld_gw_mgid_prefix[FIP_MGID_PREFIX_LEN];
uint8_t ld_vlan_in_packets;
uint32_t ld_vhub_id;
} eib_login_data_t;
/*
* True (1) when the least significant bit of the first byte of the
* given MAC address is clear, false (0) when it is set.
*/
#define EIB_UNICAST_MAC(mac) (!((mac)[0] & 0x01))
/*
* Map to translate between DMAC and {qpn, lid, sl}
*
* Entries are chained through mp_next. mp_mac is the key side of the
* translation and mp_qpn/mp_lid/mp_sl the result side.
* NOTE(review): presumably mp_tusn is the table update sequence
* number the entry was learnt with, and mp_v_rss_type carries flag and
* type bits from the vhub table entry -- confirm in the FIP parsing
* code.
*/
typedef struct eib_vhub_map_s {
struct eib_vhub_map_s *mp_next;
uint32_t mp_tusn;
ib_qpn_t mp_qpn;
ib_lid_t mp_lid;
uint8_t mp_mac[ETHERADDRL];
uint8_t mp_sl;
uint8_t mp_v_rss_type;
} eib_vhub_map_t;
/*
* Per-vNIC vHUB Table
*
* Three single map entries (gateway, unicast-miss, vhub-multicast)
* plus two arrays of EIB_TB_NBUCKETS list heads, one for vnic entries
* and one for multicast entries (presumably hash buckets keyed on the
* MAC address -- confirm the hash at the lookup sites). tb_lock is the
* mutex embedded with the table.
* NOTE(review): presumably tb_entries_seen, tb_entries_in_table and
* tb_checksum track progress while a table is being received --
* confirm.
*/
#define EIB_TB_NBUCKETS 13
typedef struct eib_vhub_table_s {
kmutex_t tb_lock;
struct eib_vhub_map_s *tb_gateway;
struct eib_vhub_map_s *tb_unicast_miss;
struct eib_vhub_map_s *tb_vhub_multicast;
struct eib_vhub_map_s *tb_vnic_entry[EIB_TB_NBUCKETS];
struct eib_vhub_map_s *tb_mcast_entry[EIB_TB_NBUCKETS];
uint32_t tb_tusn;
uint8_t tb_eport_state;
uint16_t tb_entries_seen;
uint16_t tb_entries_in_table;
uint32_t tb_checksum;
} eib_vhub_table_t;
/*
* Per-vNIC vHUB update state: a mutex, a list of map entries
* (up_vnic_entry), a sequence number and an eport state.
* NOTE(review): presumably this holds updates that arrive before they
* can be applied to the vhub table -- confirm.
*/
typedef struct eib_vhub_update_s {
kmutex_t up_lock;
eib_vhub_map_t *up_vnic_entry;
uint32_t up_tusn;
uint8_t up_eport_state;
} eib_vhub_update_t;
/*
* Parsed ethernet header: ether type, vlan, destination and source
* MAC addresses. This is the type that eib_data_parse_ether_hdr()
* fills in, per its prototype.
* NOTE(review): presumably eh_tagless is non-zero when the frame
* carried no vlan tag -- confirm in the parser.
*/
typedef struct eib_ether_hdr_s {
int eh_tagless;
uint16_t eh_ether_type;
uint16_t eh_vlan;
uint8_t eh_dmac[ETHERADDRL];
uint8_t eh_smac[ETHERADDRL];
} eib_ether_hdr_t;
/*
* vNIC Information
*
* A vnic points back at the driver instance (vn_ss), owns a control
* and a data channel, and embeds its login data. vn_lock/vn_cv
* accompany vn_state (presumably one of the EIB_LOGIN_* values --
* confirm). The vhub table and vhub update structures are referenced
* by pointer, and three soft interrupt handles are kept: one for the
* control channel and one each for data tx and data rx.
*/
typedef struct eib_vnic_s {
struct eib_s *vn_ss;
eib_chan_t *vn_ctl_chan;
eib_chan_t *vn_data_chan;
int vn_instance;
uint16_t vn_vlan;
uint16_t vn_id;
uint8_t vn_macaddr[ETHERADDRL];
struct eib_login_data_s vn_login_data;
kmutex_t vn_lock;
kcondvar_t vn_cv;
uint_t vn_state;
struct eib_vhub_table_s *vn_vhub_table;
struct eib_vhub_update_s *vn_vhub_update;
ddi_softint_handle_t vn_ctl_si_hdl;
ddi_softint_handle_t vn_data_tx_si_hdl;
ddi_softint_handle_t vn_data_rx_si_hdl;
} eib_vnic_t;
/*
* Base NIC's mac state flags. The lock protects the starting/stopping
* bits. Access to the rest of the mac state is protected by these
* two bits.
*
* EIB_NIC_RESTARTING is both transition bits together (0x03);
* EIB_NIC_STARTED (0x80) is a separate bit.
*/
#define EIB_NIC_STARTING 0x01
#define EIB_NIC_STOPPING 0x02
#define EIB_NIC_STARTED 0x80
#define EIB_NIC_RESTARTING (EIB_NIC_STARTING | EIB_NIC_STOPPING)
/*
* Node state: the lock and condvar referred to above, the NIC state
* flag word and the current link state.
*/
typedef struct eib_node_state_s {
kmutex_t ns_lock;
kcondvar_t ns_cv;
uint_t ns_nic_state;
link_state_t ns_link_state;
} eib_node_state_t;
/*
* MIB-II statistics to report to the mac layer
*
* All counters are 64-bit. They are meant to be updated through the
* EIB_*_COUNTER macros that follow, which take the address of a
* counter.
*/
typedef struct eib_stats_s {
uint64_t st_obytes; /* bytes sent out */
uint64_t st_opkts; /* pkts sent out */
uint64_t st_brdcstxmit; /* broadcast pkts transmitted */
uint64_t st_multixmit; /* multicast pkts transmitted */
uint64_t st_oerrors; /* transmit errors */
uint64_t st_noxmitbuf; /* transmit pkts discarded */
uint64_t st_rbytes; /* bytes received */
uint64_t st_ipkts; /* pkts received */
uint64_t st_brdcstrcv; /* broadcast pkts received */
uint64_t st_multircv; /* multicast pkts received */
uint64_t st_ierrors; /* receive errors */
uint64_t st_norcvbuf; /* receive pkts discarded */
} eib_stats_t;
/*
* Counter update helpers: thin wrappers around the 64-bit atomic
* add/increment/decrement routines. addr is a pointer to the counter.
*/
#define EIB_UPDATE_COUNTER(addr, val) (atomic_add_64((addr), (val)))
#define EIB_INCR_COUNTER(addr) (atomic_inc_64((addr)))
#define EIB_DECR_COUNTER(addr) (atomic_dec_64((addr)))
/*
* Cache of address vectors with dlid as the key. Currently we use
* eib state structure's ei_lock to protect the individual address
* vector's fields. This is a lock granularity that's slightly
* bigger than ideal, but it should do for now.
*
* NOTE(review): eib_t has no member called ei_lock; the only lock
* declared next to the ei_av[] cache is ei_av_lock, which is
* presumably the lock meant above -- confirm and correct the name.
*
* Each entry is chained through av_next, holds the address vector
* itself and a count in av_ref (presumably the number of holders,
* adjusted by eib_ibt_hold_avect()/eib_ibt_release_avect() -- confirm).
*/
#define EIB_AV_NBUCKETS 17
typedef struct eib_avect_s {
struct eib_avect_s *av_next;
ibt_adds_vect_t av_vect;
uint_t av_ref;
} eib_avect_t;
/*
* vNIC creation and deletion are serialized by a non-zero value
* to the ei_vnic_state member (i.e. only one vnic may be created
* or deleted at a time). The code makes sure to access/update
* the ei_active_vnics member only after a successful setting of
* ei_vnic_state.
*
* EIB_VN_BEING_MODIFIED is the mask of both bits (0x03), so it can be
* used to test for either operation being in progress.
*/
#define EIB_VN_BEING_CREATED 0x01
#define EIB_VN_BEING_DELETED 0x02
#define EIB_VN_BEING_MODIFIED (EIB_VN_BEING_CREATED | EIB_VN_BEING_DELETED)
/*
* All possible EoIB event work items that need to be handled
*
* Plain enumerated codes 0 through 12 (not bit flags), stored in
* ev_code of the work item below.
*/
#define EIB_EV_NONE 0
#define EIB_EV_PORT_DOWN 1
#define EIB_EV_PORT_UP 2
#define EIB_EV_PKEY_CHANGE 3
#define EIB_EV_SGID_CHANGE 4
#define EIB_EV_CLNT_REREG 5
#define EIB_EV_GW_EPORT_DOWN 6
#define EIB_EV_GW_DOWN 7
#define EIB_EV_GW_UP 8
#define EIB_EV_GW_INFO_UPDATE 9
#define EIB_EV_MCG_DELETED 10
#define EIB_EV_MCG_CREATED 11
#define EIB_EV_SHUTDOWN 12
/*
* Event work item: a list link, the event code and an opaque argument
* whose type is not visible here (presumably it depends on ev_code).
* This is the type that eib_svc_enqueue_event() accepts.
*/
typedef struct eib_event_s {
struct eib_event_s *ev_next;
uint_t ev_code;
void *ev_arg;
} eib_event_t;
/*
* Work element for new vnic creation
*
* Requests are chained through vr_next and carry a request code plus
* the MAC address and vlan the request is for. This is the type that
* eib_vnic_enqueue_req() accepts.
*/
typedef struct eib_vnic_req_s {
struct eib_vnic_req_s *vr_next;
uint_t vr_req;
uint8_t vr_mac[ETHERADDRL];
uint16_t vr_vlan;
} eib_vnic_req_t;
/*
* Values for vr_req
*
* Plain enumerated codes (not bit flags). By name: create a new vnic,
* flush the queued requests, or tell the creator thread to exit.
*/
#define EIB_CR_REQ_NEW_VNIC 1
#define EIB_CR_REQ_FLUSH 2
#define EIB_CR_REQ_DIE 3
/*
* Work element for vnics kept alive by the keepalive manager thread
* and bitfield values for ei_ka_vnics_event.
*
* Each element is a list link plus a pointer to the vnic. The two
* event values are distinct bits and may be OR-ed together.
*/
typedef struct eib_ka_vnics_s {
struct eib_ka_vnics_s *ka_next;
struct eib_vnic_s *ka_vnic;
} eib_ka_vnics_t;
#define EIB_KA_VNICS_DIE 0x1
#define EIB_KA_VNICS_TIMED_OUT 0x2
/*
* EoIB per-instance state
*
* Layout, in declaration order:
* - IBTF, mac and DDI handles and callback ids, HCA attributes, the
* devinfo pointer and the instance number;
* - pointers to the separately defined property, capability,
* statistics and node state structures, the admin channel, the tx
* and rx wqe pools and the LSO bucket;
* - vnic bookkeeping under ei_vnic_lock/ei_vnic_cv: ei_vnic_state
* (EIB_VN_BEING_* bits), three 64-bit vnic masks and the
* ei_vnic[] array of EIB_MAX_VNICS pointers (64 entries, matching
* the width of the masks);
* - gateway liveness state;
* - the address vector cache with its own lock;
* - one lock/condvar/queue group each for events, rx posting, vnic
* requests and keepalive vnics;
* - the thread ids of the six service threads.
* NOTE(review): which members each lock protects is inferred from
* naming and member order only -- confirm against the users.
*/
typedef struct eib_s {
ibt_clnt_hdl_t ei_ibt_hdl;
ibt_hca_hdl_t ei_hca_hdl;
ibt_pd_hdl_t ei_pd_hdl;
mac_handle_t ei_mac_hdl;
ddi_softint_handle_t ei_admin_si_hdl;
ddi_callback_id_t ei_login_ack_cb;
ddi_callback_id_t ei_gw_alive_cb;
ddi_callback_id_t ei_gw_info_cb;
ibt_hca_attr_t *ei_hca_attrs;
dev_info_t *ei_dip;
uint_t ei_instance;
struct eib_gw_props_s *ei_gw_props;
struct eib_props_s *ei_props;
struct eib_caps_s *ei_caps;
struct eib_stats_s *ei_stats;
struct eib_node_state_s *ei_node_state;
struct eib_chan_s *ei_admin_chan;
struct eib_wqe_pool_s *ei_tx;
struct eib_wqe_pool_s *ei_rx;
struct eib_lsobkt_s *ei_lso;
kmutex_t ei_vnic_lock;
kcondvar_t ei_vnic_cv;
uint_t ei_vnic_state;
uint64_t ei_active_vnics;
uint64_t ei_zombie_vnics;
uint64_t ei_rejoin_vnics;
struct eib_vnic_s *ei_vnic[EIB_MAX_VNICS];
struct eib_vnic_s *ei_vnic_pending;
int64_t ei_gw_last_heartbeat;
boolean_t ei_gw_unreachable;
uint8_t ei_gw_eport_state;
kmutex_t ei_av_lock;
struct eib_avect_s *ei_av[EIB_AV_NBUCKETS];
kmutex_t ei_ev_lock;
kcondvar_t ei_ev_cv;
struct eib_event_s *ei_event;
kmutex_t ei_rxpost_lock;
kcondvar_t ei_rxpost_cv;
uint_t ei_rxpost_die;
struct eib_chan_s *ei_rxpost;
kmutex_t ei_vnic_req_lock;
kcondvar_t ei_vnic_req_cv;
struct eib_vnic_req_s *ei_vnic_req;
struct eib_vnic_req_s *ei_failed_vnic_req;
struct eib_vnic_req_s *ei_pending_vnic_req;
kmutex_t ei_ka_vnics_lock;
kcondvar_t ei_ka_vnics_cv;
uint_t ei_ka_vnics_event;
struct eib_ka_vnics_s *ei_ka_vnics;
kt_did_t ei_txwqe_monitor;
kt_did_t ei_lsobufs_monitor;
kt_did_t ei_rwqes_refiller;
kt_did_t ei_vnic_creator;
kt_did_t ei_events_handler;
kt_did_t ei_keepalives_manager;
} eib_t;
/*
* Private read-only datalink properties
*
* Property name strings; each begins with an underscore.
* NOTE(review): presumably these are the names listed in
* eib_pvt_props[] -- confirm in the definition of that array.
*/
#define EIB_DLPROP_GW_EPORT_STATE "_eib_eport_state"
#define EIB_DLPROP_HCA_GUID "_eib_hca_guid"
#define EIB_DLPROP_PORT_GUID "_eib_port_guid"
/*
* FUNCTION PROTOTYPES FOR CROSS-FILE LINKAGE
*/
/*
* FIP protocol related
*
* The first four routines take the driver instance, a vnic and an
* int * (presumably an out-parameter for an error code -- confirm) and
* return an int status (presumably EIB_E_SUCCESS/EIB_E_FAILURE). The
* two parse routines take a raw packet buffer; the login ack parser
* fills in an eib_login_data_t.
*/
extern int eib_fip_login(eib_t *, eib_vnic_t *, int *);
extern int eib_fip_heartbeat(eib_t *, eib_vnic_t *, int *);
extern int eib_fip_vhub_table(eib_t *, eib_vnic_t *, int *);
extern int eib_fip_logout(eib_t *, eib_vnic_t *, int *);
extern int eib_fip_parse_login_ack(eib_t *, uint8_t *, eib_login_data_t *);
extern int eib_fip_parse_ctl_pkt(uint8_t *, eib_vnic_t *);
/*
* Service threads and other handlers
*
* The routines taking only an eib_t * and returning void up to
* eib_manage_keepalives() are, by name, the bodies of the service
* threads; each has a matching eib_stop_*() routine. Only
* eib_stop_monitor_lso_bufs() returns a status and takes an extra
* boolean_t (presumably a "force" flag -- confirm). The three *_cb
* routines share the four-argument signature beginning with a
* dev_info_t * and a ddi_eventcookie_t (presumably NDI event
* callbacks registered with the EoIB nexus -- confirm).
*/
extern void eib_events_handler(eib_t *);
extern void eib_svc_enqueue_event(eib_t *, eib_event_t *);
extern void eib_refill_rwqes(eib_t *);
extern void eib_vnic_creator(eib_t *);
extern void eib_monitor_tx_wqes(eib_t *);
extern void eib_monitor_lso_bufs(eib_t *);
extern void eib_manage_keepalives(eib_t *);
extern void eib_stop_events_handler(eib_t *);
extern void eib_stop_refill_rwqes(eib_t *);
extern void eib_stop_vnic_creator(eib_t *);
extern void eib_stop_monitor_tx_wqes(eib_t *);
extern int eib_stop_monitor_lso_bufs(eib_t *, boolean_t);
extern void eib_stop_manage_keepalives(eib_t *);
extern void eib_flush_vnic_reqs(eib_t *);
extern void eib_gw_info_cb(dev_info_t *, ddi_eventcookie_t, void *, void *);
extern void eib_gw_alive_cb(dev_info_t *, ddi_eventcookie_t, void *, void *);
extern void eib_login_ack_cb(dev_info_t *, ddi_eventcookie_t, void *, void *);
/*
* Admin QP related
*
* Setup routine, completion handler and the eib_rb_*() counterpart of
* the setup routine (presumably "rb" stands for rollback, i.e. it
* undoes the setup -- confirm). The handler has the
* uint_t (caddr_t, caddr_t) shape (presumably a soft interrupt
* handler for ei_admin_si_hdl -- confirm).
*/
extern int eib_adm_setup_qp(eib_t *, int *);
extern uint_t eib_adm_comp_handler(caddr_t, caddr_t);
extern void eib_rb_adm_setup_qp(eib_t *);
/*
* Control QP related
*
* Per-vnic create routine, completion handler and the eib_rb_*()
* counterpart of the create routine (presumably its rollback --
* confirm).
*/
extern int eib_ctl_create_qp(eib_t *, eib_vnic_t *, int *);
extern uint_t eib_ctl_comp_handler(caddr_t, caddr_t);
extern void eib_rb_ctl_create_qp(eib_t *, eib_vnic_t *);
/*
* Data QP related
*
* Per-vnic create routine with its eib_rb_*() counterpart, separate
* rx and tx completion handlers, and the tx-path helpers: parse the
* ethernet header of an mblk into an eib_ether_hdr_t, look up the vnic
* for a MAC/vlan pair, prepare a frame in a wqe and post it.
* eib_data_rx_recycle() takes a single caddr_t (presumably the free
* routine installed in a wqe's qe_frp for loaned rx buffers --
* confirm).
*/
extern int eib_data_create_qp(eib_t *, eib_vnic_t *, int *);
extern uint_t eib_data_rx_comp_handler(caddr_t, caddr_t);
extern uint_t eib_data_tx_comp_handler(caddr_t, caddr_t);
extern void eib_data_rx_recycle(caddr_t);
extern void eib_data_post_tx(eib_vnic_t *, eib_wqe_t *);
extern void eib_data_parse_ether_hdr(mblk_t *, eib_ether_hdr_t *);
extern int eib_data_lookup_vnic(eib_t *, uint8_t *, uint16_t, eib_vnic_t **,
boolean_t *);
extern int eib_data_prepare_frame(eib_vnic_t *, eib_wqe_t *, mblk_t *,
eib_ether_hdr_t *);
extern void eib_rb_data_create_qp(eib_t *, eib_vnic_t *);
/*
* Resource related
*
* Buffer setup with its eib_rb_*() counterpart, plus grab/return
* routines for send wqes, receive wqes and LSO buffers. The plural
* grab routines take an array of wqe pointers, a uint_t and a uint_t *
* (presumably the count wanted and the count actually obtained --
* confirm); the trailing int of the grab routines is presumably an
* EIB_WPRI_* priority -- confirm.
*/
extern int eib_rsrc_setup_bufs(eib_t *, int *);
extern int eib_rsrc_grab_swqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int);
extern int eib_rsrc_grab_rwqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int);
extern int eib_rsrc_grab_lsobufs(eib_t *, uint_t, ibt_wr_ds_t *, uint32_t *);
extern eib_wqe_t *eib_rsrc_grab_swqe(eib_t *, int);
extern eib_wqe_t *eib_rsrc_grab_rwqe(eib_t *, int);
extern void eib_rsrc_return_swqe(eib_t *, eib_wqe_t *, eib_chan_t *);
extern void eib_rsrc_return_rwqe(eib_t *, eib_wqe_t *, eib_chan_t *);
extern void eib_rsrc_return_lsobufs(eib_t *, ibt_wr_ds_t *, uint32_t);
extern void eib_rsrc_decr_posted_swqe(eib_t *, eib_chan_t *);
extern void eib_rsrc_decr_posted_rwqe(eib_t *, eib_chan_t *);
extern void eib_rsrc_txwqes_needed(eib_t *);
extern void eib_rsrc_lsobufs_needed(eib_t *);
extern boolean_t eib_rsrc_rxpool_low(eib_wqe_t *);
extern void eib_rb_rsrc_setup_bufs(eib_t *, boolean_t);
/*
* IBT related
*
* HCA initialization with its eib_rb_*() counterpart, link and pkey
* modification, the address vector cache interface (hold, release,
* free all), the asynchronous event handler and the routine that
* records HCA capabilities into an eib_caps_t.
* NOTE(review): the uint_t passed to eib_rb_ibt_hca_init() is
* presumably a progress mask saying how much to undo -- confirm.
*/
extern int eib_ibt_hca_init(eib_t *);
extern void eib_ibt_link_mod(eib_t *);
extern int eib_ibt_modify_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t);
extern eib_avect_t *eib_ibt_hold_avect(eib_t *, ib_lid_t, uint8_t);
extern void eib_ibt_release_avect(eib_t *, eib_avect_t *);
extern void eib_ibt_free_avects(eib_t *);
extern void eib_ibt_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
ibt_async_event_t *);
extern void eib_ibt_record_capab(eib_t *, ibt_hca_attr_t *, eib_caps_t *);
extern void eib_rb_ibt_hca_init(eib_t *, uint_t);
/*
* Chan related
*
* eib_chan_init() takes no arguments and returns a channel pointer;
* eib_chan_fini() takes one back. The two post routines differ in
* their last argument: eib_chan_post_rx() takes a uint_t *
* (presumably returning how many rwqes were posted -- confirm), while
* eib_chan_post_recv() takes the single wqe to post.
*/
extern eib_chan_t *eib_chan_init(void);
extern void eib_chan_fini(eib_chan_t *);
extern int eib_chan_post_rx(eib_t *, eib_chan_t *, uint_t *);
extern int eib_chan_post_recv(eib_t *, eib_chan_t *, eib_wqe_t *);
/*
* Mac layer related
*
* Accessors for the NIC state flags (set, clear, update, get), link
* state reporting, and the driver-side implementations of the mac
* start, stop, multicast, promiscuous and transmit entry points.
* NOTE(review): the boolean_t taken by the link state routines is
* presumably a "force notification" flag, and the two uint_t
* arguments of eib_mac_upd_nic_state() presumably the bits to clear
* and to set -- confirm.
*/
extern void eib_mac_set_nic_state(eib_t *, uint_t);
extern void eib_mac_clr_nic_state(eib_t *, uint_t);
extern void eib_mac_upd_nic_state(eib_t *, uint_t, uint_t);
extern uint_t eib_mac_get_nic_state(eib_t *);
extern void eib_mac_link_state(eib_t *, link_state_t, boolean_t);
extern void eib_mac_link_down(eib_t *, boolean_t);
extern void eib_mac_link_up(eib_t *, boolean_t);
extern int eib_mac_start(eib_t *);
extern void eib_mac_stop(eib_t *);
extern int eib_mac_multicast(eib_t *, boolean_t, uint8_t *);
extern int eib_mac_promisc(eib_t *, boolean_t);
extern int eib_mac_tx(eib_t *, mblk_t *);
extern int eib_mac_hca_portstate(eib_t *, ib_lid_t *, int *);
/*
* VNIC related
*
* vnic creation (for a MAC/vlan pair) with its eib_rb_*() counterpart
* and deletion, the two wait routines of the login sequence with the
* routines that complete them (login ack, vhub table done), data
* multicast group join/leave, destination setup for a wqe, vhub table
* init/fini, and the request/restart/rejoin helpers.
* NOTE(review): the uint_t passed to eib_rb_vnic_create() is
* presumably a progress mask saying how much to undo -- confirm.
*/
extern int eib_vnic_create(eib_t *, uint8_t *, uint16_t, eib_vnic_t **, int *);
extern void eib_vnic_delete(eib_t *, eib_vnic_t *);
extern int eib_vnic_wait_for_login_ack(eib_t *, eib_vnic_t *, int *);
extern void eib_vnic_login_ack(eib_t *, eib_login_data_t *);
extern int eib_vnic_wait_for_table(eib_t *, eib_vnic_t *, int *);
extern void eib_vnic_vhub_table_done(eib_vnic_t *, uint_t);
extern int eib_vnic_join_data_mcg(eib_t *, eib_vnic_t *, uint8_t *,
boolean_t, int *);
extern int eib_vnic_setup_dest(eib_vnic_t *, eib_wqe_t *, uint8_t *, uint16_t);
extern void eib_vnic_leave_data_mcg(eib_t *, eib_vnic_t *, uint8_t *);
extern void eib_vnic_init_tables(eib_t *, eib_vnic_t *);
extern void eib_vnic_fini_tables(eib_t *, eib_vnic_t *, boolean_t);
extern eib_chan_t *eib_vnic_get_data_chan(eib_t *, int);
extern void eib_vnic_need_new(eib_t *, uint8_t *, uint16_t);
extern void eib_vnic_enqueue_req(eib_t *, eib_vnic_req_t *);
extern void eib_vnic_resurrect_zombies(eib_t *, uint8_t *);
extern void eib_vnic_restart(eib_t *, int, uint8_t *);
extern void eib_vnic_rejoin_mcgs(eib_t *);
extern void eib_rb_vnic_create(eib_t *, eib_vnic_t *, uint_t);
/*
* Logging and other stuff
*
* The crit/err/warn printf routines are always declared; the
* debug/args/pkt/verbose ones are declared only when EIB_DEBUG is
* defined, matching the EIB_DPRINTF_* macros. All of them take an int
* first (presumably the driver instance number -- confirm).
* eib_dprintf_pkt() is the only one that is not printf-like: it takes
* a byte buffer and a length.
*
* The property routines at the end read, update and (eib_rb_*) release
* the instance properties.
*/
extern void eib_debug_init(void);
extern void eib_debug_fini(void);
extern void eib_dprintf_crit(int, const char *fmt, ...);
extern void eib_dprintf_err(int, const char *fmt, ...);
extern void eib_dprintf_warn(int, const char *fmt, ...);
#ifdef EIB_DEBUG
extern void eib_dprintf_debug(int, const char *fmt, ...);
extern void eib_dprintf_args(int, const char *fmt, ...);
extern void eib_dprintf_pkt(int, uint8_t *, uint_t);
extern void eib_dprintf_verbose(int, const char *fmt, ...);
#endif
extern int eib_get_props(eib_t *);
extern void eib_update_props(eib_t *, eib_gw_info_t *);
extern void eib_rb_get_props(eib_t *);
/*
* EoIB specific global variables
*
* Declarations only; the definitions live in a .c file.
* eib_setbit_mod67[] is the lookup table indexed by the
* EIB_FIND_LSB_SET() macro. The array sizes are not visible here.
*/
extern ib_gid_t eib_reserved_gid;
extern uint8_t eib_zero_mac[];
extern uint8_t eib_broadcast_mac[];
extern int eib_setbit_mod67[];
extern char *eib_pvt_props[];
/*
* HW/FW workarounds
*
* Integer switches, one per workaround (presumably non-zero enables
* the workaround, and presumably tunable -- confirm where they are
* defined and tested).
*/
extern int eib_wa_no_desc_list_len;
extern int eib_wa_no_cksum_offload;
extern int eib_wa_no_lso;
extern int eib_wa_no_mcast_entries;
extern int eib_wa_no_av_discover;
extern int eib_wa_no_good_vp_flag;
extern int eib_wa_no_good_vhub_cksum;
/*
* Miscellaneous externs
*
* Symbols that are defined outside this driver and declared here
* directly (presumably because no included header declares them --
* confirm).
*/
extern void freemsgchain(mblk_t *);
extern pri_t minclsyspri;
#ifdef __cplusplus
}
#endif
#endif /* _SYS_IB_EOIB_EIB_IMPL_H */
|