summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/tcp_impl.h
blob: 61af05f74917242a60237be85f8dea01b8d3162b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 */

#ifndef	_INET_TCP_IMPL_H
#define	_INET_TCP_IMPL_H

/*
 * TCP implementation private declarations.  These interfaces are
 * used to build the IP module and are not meant to be accessed
 * by any modules except IP itself.  They are undocumented and are
 * subject to change without notice.
 */

#ifdef	__cplusplus
extern "C" {
#endif

#ifdef _KERNEL

#include <sys/cpuvar.h>
#include <sys/clock_impl.h>	/* For LBOLT_FASTPATH{,64} */
#include <inet/optcom.h>
#include <inet/tcp.h>
#include <inet/tunables.h>

#define	TCP_MOD_ID	5105

extern struct qinit	tcp_sock_winit;
extern struct qinit	tcp_winit;

extern sock_downcalls_t sock_tcp_downcalls;

/*
 * Note that by default, the _snd_lowat_fraction tunable controls the value of
 * the transmit low water mark.  TCP_XMIT_LOWATER (and thus the _xmit_lowat
 * property) is only used if the administrator has disabled _snd_lowat_fraction
 * by setting it to 0.
 */
#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	128000
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	1048576

/*
 * Size of the bind hash list and the hash function used to index it.  The
 * size has to be a power of 2 because the hash masks the port (converted
 * with ntohs()) with (size - 1).
 */
#define	TCP_BIND_FANOUT_SIZE	1024
#define	TCP_BIND_HASH(lport)	\
	((TCP_BIND_FANOUT_SIZE - 1) & ntohs(lport))

/*
 * This implementation follows the 4.3BSD interpretation of the urgent
 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
 * incompatible changes in protocols like telnet and rlogin.
 */
#define	TCP_OLD_URP_INTERPRETATION	1

/*
 * TCP option lengths, in bytes.  Each _REAL_ length is the base option
 * length plus a fixed number of extra bytes (presumably NOP padding that
 * brings the option to a multiple of 4 bytes -- confirm against the code
 * that builds the options).
 */
#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_MAXSEG_LEN	4
#define	TCPOPT_WS_LEN		3
#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN + 1)
#define	TCPOPT_TSTAMP_LEN	10
#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN + 2)
#define	TCPOPT_SACK_OK_LEN	2
#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN + 2)
#define	TCPOPT_REAL_SACK_LEN	4
#define	TCPOPT_MAX_SACK_LEN	36
#define	TCPOPT_HEADER_LEN	2

/*
 * Round value up to a multiple of mss.  The mss argument is evaluated more
 * than once, so it must be free of side effects.
 */
#define	MSS_ROUNDUP(value, mss)		((mss) * (1 + ((value) - 1) / (mss)))

/*
 * Was this tcp created via socket() interface?
 */
#define	TCP_IS_SOCKET(tcp)	((tcp)->tcp_issocket)

/*
 * Is this tcp not attached to any upper client?
 */
#define	TCP_IS_DETACHED(tcp)	((tcp)->tcp_detached)

/*
 * TCP timers related data structures.  Refer to tcp_timers.c.
 *
 * NOTE(review): the per-field roles below are inferred from the field names
 * and types only; tcp_timers.c is the authoritative description.
 */
typedef struct tcp_timer_s {
	/* Connection the timer is associated with (presumed). */
	conn_t	*connp;
	/* Handler to run; takes a single void * argument. */
	void	(*tcpt_proc)(void *);
	/* Callout id for this timer (presumed: used to cancel it). */
	callout_id_t   tcpt_tid;
} tcp_timer_t;

extern kmem_cache_t *tcp_timercache;

/*
 * Macros for starting and cancelling the various TCP timers.  The
 * retransmission timer has its own macro, TCP_TIMER_RESTART().
 *
 * TCP_TIMER(tcp, f, tim) calls tcp_timeout() on tcp's conn_t with handler f;
 * tim is in millisec.  TCP_TIMER_CANCEL(tcp, id) calls tcp_timeout_cancel()
 * on tcp's conn_t for the timer id.  Each macro expands to the underlying
 * call, so its value is whatever that call returns.
 *
 * The tcp argument is parenthesized so that any pointer expression (such as
 * "&obj" or "cond ? a : b") expands correctly.
 */
#define	TCP_TIMER(tcp, f, tim)		\
	tcp_timeout((tcp)->tcp_connp, f, tim)
#define	TCP_TIMER_CANCEL(tcp, id)	\
	tcp_timeout_cancel((tcp)->tcp_connp, id)

/*
 * Restart the TCP retransmission timer: if tcp_timer_tid is non-zero, cancel
 * that timer first; then start a new timer running tcp_timer after intvl
 * millisec and record its id in tcp_timer_tid.
 */
#define	TCP_TIMER_RESTART(tcp, intvl) {					\
	if ((tcp)->tcp_timer_tid != 0) {				\
		(void) TCP_TIMER_CANCEL((tcp), (tcp)->tcp_timer_tid);	\
	}								\
	(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, (intvl));	\
}


/*
 * Maximum TIME_WAIT timeout.  It is defined here (instead of tcp_tunables.c)
 * so that other parameters can be derived from it.
 */
#define	TCP_TIME_WAIT_MAX	(10 * MINUTES)

/*
 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
 * Running it every 5 seconds seems to yield a reasonable balance between
 * cleanup liveliness and system load.
 */
#define	TCP_TIME_WAIT_DELAY	(5 * SECONDS)

/*
 * Number of buckets in the TIME_WAIT timing wheel: one per
 * TCP_TIME_WAIT_DELAY interval within TCP_TIME_WAIT_MAX, plus one extra
 * bucket for rollover protection.
 */
#define	TCP_TIME_WAIT_BUCKETS	((TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY) + 1)

/*
 * For scalability, we must not run a timer for every TCP connection
 * in TIME_WAIT state.  To see why, consider (for a TIME_WAIT interval of
 * 1 minute):
 *	10,000 connections/sec * 60 seconds/time wait = 600,000 active conn's
 *
 * Since TIME_WAIT expiration occurs on a per-squeue basis, handling
 * connections from all netstacks on the system, a simple queue is inadequate
 * for pending entries.  This is because tcp_time_wait_interval may differ
 * between connections, causing tail insertion to violate expiration order.
 *
 * Instead of performing expensive sorting or unnecessary list traversal to
 * counteract interval variance between netstacks, a timing wheel structure is
 * used.  The duration covered by each bucket in the wheel is determined by the
 * TCP_TIME_WAIT_DELAY (5 seconds).  The number of buckets in the wheel is
 * determined by dividing the maximum TIME_WAIT interval (10 minutes) by
 * TCP_TIME_WAIT_DELAY, with one added bucket for rollover protection.
 * (Yielding 121 buckets with the current parameters)  When items are inserted
 * into the set of buckets, they are indexed by using their expiration time
 * divided by the bucket size, modulo the number of buckets.  This means that
 * when each bucket is processed, all items within should have expired within
 * the last TCP_TIME_WAIT_DELAY interval.
 *
 * Since bucket timer schedules are rounded to the nearest TCP_TIME_WAIT_DELAY
 * interval to ensure all connections in the pending bucket will be expired, a
 * per-squeue offset is used when doing TIME_WAIT scheduling.  This offset is
 * between 0 and the TCP_TIME_WAIT_DELAY and is designed to avoid scheduling
 * all of the tcp_time_wait_collector threads to run in lock-step.  The offset
 * is fixed while there are any connections present in the buckets.
 *
 * When a tcp_t enters TIME_WAIT state, a timer is started (timeout is
 * tcps_time_wait_interval).  When the tcp_t is detached (upper layer closes
 * the end point), it is scheduled to be cleaned up by the squeue-driving
 * tcp_time_wait_collector (also using tcps_time_wait_interval).  This means
 * that the TIME_WAIT state can be extended (up to doubled) if the tcp_t
 * doesn't become detached for a long time.
 *
 * The list manipulations (including tcp_time_wait_next/prev)
 * are protected by the tcp_time_wait_lock. The content of the
 * detached TIME_WAIT connections is protected by the normal perimeters.
 *
 * These connection lists are per squeue and squeues are shared across the
 * tcp_stack_t instances.  Things in a tcp_time_wait_bucket remain associated
 * with the tcp_stack_t and conn_netstack.  Any tcp_t connections stored in the
 * tcp_free_list are disassociated and have NULL tcp_tcps and conn_netstack
 * pointers.
 */
typedef struct tcp_squeue_priv_s {
	/* Protects the TIME_WAIT list manipulations for this squeue. */
	kmutex_t	tcp_time_wait_lock;
	/* Presumably set while tcp_time_wait_collector runs -- confirm. */
	boolean_t	tcp_time_wait_collector_active;
	/* Presumably the callout id of the pending collector -- confirm. */
	callout_id_t	tcp_time_wait_tid;
	/* Presumably the number of connections in the buckets -- confirm. */
	uint64_t	tcp_time_wait_cnt;
	/* Presumably when the collector is next due to run -- confirm. */
	int64_t		tcp_time_wait_schedule;
	/*
	 * Per-squeue scheduling offset, between 0 and TCP_TIME_WAIT_DELAY,
	 * that keeps the collectors from running in lock-step.
	 */
	int64_t		tcp_time_wait_offset;
	/* Timing wheel of connections pending TIME_WAIT expiration. */
	tcp_t		*tcp_time_wait_bucket[TCP_TIME_WAIT_BUCKETS];
	/* tcp_t's disassociated from their tcp_stack_t and netstack. */
	tcp_t		*tcp_free_list;
	/* Presumably the number of entries on tcp_free_list -- confirm. */
	uint_t		tcp_free_list_cnt;
} tcp_squeue_priv_t;

/*
 * Parameters for TCP Initial Send Sequence number (ISS) generation.  When
 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
 * by adding three components: a time component which grows by 1 every 4096
 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
 * a per-connection component which grows by 125000 for every new connection;
 * and an "extra" component that grows by a random amount centered
 * approximately on 64000.  This causes the ISS generator to cycle every
 * 4.89 hours if no TCP connections are made, and faster if connections are
 * made.
 *
 * When tcp_strong_iss is set to 0, ISS is calculated by adding two
 * components: a time component which grows by 250000 every second; and
 * a per-connection component which grows by 125000 for every new connections.
 *
 * A third method, when tcp_strong_iss is set to 2, for generating ISS is
 * prescribed by Steve Bellovin.  This involves adding time, the 125000 per
 * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
 * password.
 */
/*
 * NOTE(review): ISS_INCR equals the 250000-per-second growth of the time
 * component described for tcp_strong_iss == 0, and 2^ISS_NSEC_SHT (4096) is
 * the nanosecond tick of the time component for tcp_strong_iss == 1.  The
 * exact uses are not visible in this header -- confirm in the ISS code.
 */
#define	ISS_INCR	250000
#define	ISS_NSEC_SHT	12

/*
 * Timestamp comparisons.  The difference (a - b) is interpreted as a signed
 * 32-bit value, so the comparison keeps working when the 32-bit timestamps
 * wrap around.
 */
#define	TSTMP_GEQ(a, b)	(0 <= (int32_t)((a) - (b)))
#define	TSTMP_LT(a, b)	(0 > (int32_t)((a) - (b)))

/*
 * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
 * either tcp_slow_start_initial or tcp_slow_start_after_idle
 * depending on the caller.  If the upper layer has not used the
 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
 * If the upper layer has set tcp_init_cwnd, just use it to
 * calculate the tcp_cwnd.
 *
 * "An Argument for Increasing TCP's Initial Congestion Window"
 * ACM SIGCOMM Computer Communications Review, vol. 40 (2010), pp. 27-33
 *  -- Nandita Dukkipati, Tiziana Refice, Yuchung Cheng,
 *     Hsiao-keng Jerry Chu, Tom Herbert, Amit Agarwal,
 *     Arvind Jain, Natalia Sutin
 *
 *   "Based on the results from our experiments, we believe the
 *    initial congestion window should be at least ten segments
 *    and the same be investigated for standardization by the IETF."
 *
 * As such, the def_max_init_cwnd argument with which this macro is
 * invoked is either the tcps_slow_start_initial or
 * tcps_slow_start_after_idle which both default to 0 and will respect
 * RFC 3390 exactly.  If the tunables are explicitly set by the operator,
 * then the initial congestion window should be set as the operator
 * demands, within reason. We shall arbitrarily define reason as a
 * maximum of 16 (same as used by the TCP_INIT_CWND setsockopt).
 */

/* Maximum TCP initial cwin (start/restart), in segments. */
#define	TCP_MAX_INIT_CWND	16

/*
 * Set tcp_cwnd for a (re)start of transmission and reset tcp_cwnd_cnt to 0.
 *
 *   tcp               - tcp_t pointer expression
 *   mss               - segment size that the window is a multiple of
 *   def_max_init_cwnd - default initial window in segments; 0 selects
 *                       min(4 * mss, max(2 * mss, 4380)), with 4380 first
 *                       rounded down to a multiple of mss
 *
 * A non-zero tcp_init_cwnd (in segments) takes precedence over
 * def_max_init_cwnd.  A non-zero def_max_init_cwnd is capped at
 * TCP_MAX_INIT_CWND segments.
 *
 * Every argument is parenthesized at each use so that arbitrary expressions
 * (e.g. "&obj" or "a + 1") expand correctly.  Arguments are evaluated more
 * than once and therefore must not have side effects.
 */
#define	TCP_SET_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
{									\
	if ((tcp)->tcp_init_cwnd == 0) {				\
		if ((def_max_init_cwnd) == 0) {				\
			(tcp)->tcp_cwnd = MIN(4 * (mss),		\
			    MAX(2 * (mss), 4380 / (mss) * (mss)));	\
		} else {						\
			(tcp)->tcp_cwnd = MIN(TCP_MAX_INIT_CWND * (mss),\
			    (def_max_init_cwnd) * (mss));		\
		}							\
	} else {							\
		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
	}								\
	(tcp)->tcp_cwnd_cnt = 0;					\
}

/*
 * Mark the IP header at iph as ECN capable transport (ECT).
 *
 * Of the two ECT code points, '01' (ECT(1)) and '10' (ECT(0)), TCP uses the
 * original code point ECT(0) as described in RFC 2481.  The existing ECN
 * bits are cleared before ECT(0) is set.  The header is accessed as an
 * ipha_t when the connection's conn_ipversion is IPV4_VERSION and as an
 * ip6_t otherwise.
 */
#define	TCP_SET_ECT(tcp, iph) \
	if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
		/* Replace the low two bits of the TOS byte with ECT(0). */ \
		((ipha_t *)(iph))->ipha_type_of_service = \
		    (((ipha_t *)(iph))->ipha_type_of_service & 0xFC) | \
		    IPH_ECN_ECT0; \
	} else { \
		/* Same two bits, located at bit 20 of ip6_vcf. */ \
		((ip6_t *)(iph))->ip6_vcf = \
		    (((ip6_t *)(iph))->ip6_vcf & htonl(0xFFCFFFFF)) | \
		    htonl(IPH_ECN_ECT0 << 20); \
	}

/*
 * TCP options struct returned from tcp_parse_options.
 *
 * NOTE(review): the per-field descriptions are inferred from the field
 * names; confirm against tcp_parse_options().
 */
typedef struct tcp_opt_s {
	uint32_t	tcp_opt_mss;	/* MSS option value */
	uint32_t	tcp_opt_wscale;	/* window scale option value */
	uint32_t	tcp_opt_ts_val;	/* timestamp option, value field */
	uint32_t	tcp_opt_ts_ecr;	/* timestamp option, echo reply */
	tcp_t		*tcp;		/* associated tcp_t */
} tcp_opt_t;

/*
 * Flags returned from tcp_parse_options.  Each flag is a distinct bit, so
 * several of them can be OR-ed into one value.
 */
#define	TCP_OPT_MSS_PRESENT	0x01
#define	TCP_OPT_WSCALE_PRESENT	0x02
#define	TCP_OPT_TSTAMP_PRESENT	0x04
#define	TCP_OPT_SACK_OK_PRESENT	0x08
#define	TCP_OPT_SACK_PRESENT	0x10

/*
 * Write-side flow-control is implemented via the per instance STREAMS
 * write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s)
 * and clearing QFULL and calling qbackenable() to restart the flow based
 * on the number of TCP unsent bytes (i.e. those not on the wire waiting
 * for a remote ACK).
 *
 * This is different from a standard STREAMS kmod, where the framework
 * automatically flow-controls the STREAMS Q based on the defined
 * hiwat/lowat values as mblk_t's are enqueued/dequeued.
 *
 * As of FireEngine TCP write-side flow-control needs to take into account
 * both the unsent tcp_xmit list bytes but also any squeue_t enqueued bytes
 * (i.e. from tcp_wput() -> tcp_output()).
 *
 * This is accomplished by adding a new tcp_t field, tcp_squeue_bytes, to
 * count the number of bytes enqueued by tcp_wput() and the number of bytes
 * dequeued and processed by tcp_output().
 *
 * So, the total number of bytes unsent is (squeue_bytes + unsent) with all
 * flow-control uses of unsent replaced with the macro TCP_UNSENT_BYTES.
 */
extern void	tcp_clrqfull(tcp_t *);
extern void	tcp_setqfull(tcp_t *);

/*
 * Total number of unsent bytes for tcp: the bytes counted in tcp_unsent plus
 * the bytes counted in tcp_squeue_bytes.
 */
#define	TCP_UNSENT_BYTES(tcp) \
	((tcp)->tcp_unsent + (tcp)->tcp_squeue_bytes)

/*
 * Linked list struct to store listener connection limit configuration per
 * IP stack.  The list is stored at tcps_listener_conf in tcp_stack_t.
 *
 * tl_port: the listener port of this limit configuration
 * tl_ratio: the maximum amount of memory consumed by all concurrent TCP
 *           connections created by a listener does not exceed 1/tl_ratio
 *           of the total system memory.  Note that this is only an
 *           approximation.
 * tl_link: linked list struct
 */
typedef struct tcp_listener_s {
	in_port_t	tl_port;	/* listener port of this limit */
	uint32_t	tl_ratio;	/* limit is 1/tl_ratio of memory */
	list_node_t	tl_link;	/* linked list linkage */
} tcp_listener_t;

/*
 * If there is a limit set on the number of connections allowed per each
 * listener, the following struct is used to store that counter.  It keeps
 * the number of TCP connections created by a listener.  Note that this needs
 * to be separated from the listener since the listener can go away before
 * all the connections are gone.
 *
 * When the struct is allocated, tlc_cnt is set to 1.  When a new connection
 * is created by the listener, tlc_cnt is incremented by 1.  When a connection
 * created by the listener goes away, tlc_cnt is decremented by 1.  When the
 * listener itself goes away, tlc_cnt is decremented by one.  The last
 * connection (or the listener) which decrements tlc_cnt to zero frees the
 * struct.
 *
 * tlc_max is the maximum number of concurrent TCP connections created from a
 * listener.  It is calculated when the tcp_listen_cnt_t is allocated.
 *
 * tlc_report_time stores the time when cmn_err() is called to report that the
 * max has been exceeded.  Report is done at most once every
 * TCP_TLC_REPORT_INTERVAL mins for a listener.
 *
 * tlc_drop stores the number of connection attempts dropped because the
 * limit has been reached.
 */
typedef struct tcp_listen_cnt_s {
	uint32_t	tlc_max;	/* max concurrent connections */
	uint32_t	tlc_cnt;	/* listener + live connections */
	int64_t		tlc_report_time; /* time of last cmn_err() report */
	uint32_t	tlc_drop;	/* attempts dropped at the limit */
} tcp_listen_cnt_t;

/*
 * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT.
 * - tcprg_lock:	Protects the other fields
 * - tcprg_size:	Allocated size (in entries) of tcprg_members array
 * - tcprg_count:	Count of occupied tcprg_members slots
 * - tcprg_active:	Count of members which still have SO_REUSEPORT set
 * - tcprg_members:	Connections associated with address/port group
 */
typedef struct tcp_rg_s {
	kmutex_t	tcprg_lock;	/* protects the other fields */
	unsigned int	tcprg_size;	/* allocated entries in members */
	unsigned int	tcprg_count;	/* occupied member slots */
	unsigned int	tcprg_active;	/* members with SO_REUSEPORT set */
	tcp_t		**tcprg_members; /* connections in this group */
} tcp_rg_t;

/*
 * Minimum interval between cmn_err() reports that a listener's connection
 * limit has been exceeded (see tlc_report_time in tcp_listen_cnt_t).
 */
#define	TCP_TLC_REPORT_INTERVAL	(30 * MINUTES)

/*
 * Drop tcp's count on its tcp_listen_cnt_t.  tlc_cnt is decremented
 * atomically; if the decrement brings it to zero the struct is freed.  In
 * every case tcp's tcp_listen_cnt pointer is cleared afterwards.
 */
#define	TCP_DECR_LISTEN_CNT(tcp)					\
{									\
	ASSERT((tcp)->tcp_listen_cnt->tlc_cnt > 0);			\
	if (atomic_dec_32_nv(&(tcp)->tcp_listen_cnt->tlc_cnt) == 0) {	\
		kmem_free((tcp)->tcp_listen_cnt,			\
		    sizeof (tcp_listen_cnt_t));				\
	}								\
	(tcp)->tcp_listen_cnt = NULL;					\
}

/*
 * Atomically increment / decrement the connection count of a tcp_stack_t.
 * The counter that is updated is the tcp_sc_conn_cnt of the tcps_sc[] entry
 * indexed by the current CPU's cpu_seqid.
 */
#define	TCPS_CONN_INC(tcps)						\
	atomic_inc_64((uint64_t *)					\
	    &((tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt))

#define	TCPS_CONN_DEC(tcps)						\
	atomic_dec_64((uint64_t *)					\
	    &((tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt))

/*
 * When the system is under memory pressure, stack variable tcps_reclaim is
 * true, we shorten the connection timeout abort interval to tcp_early_abort
 * seconds.  Defined in tcp.c.
 */
extern uint32_t tcp_early_abort;

/*
 * To reach to an eager in Q0 which can be dropped due to an incoming
 * new SYN request when Q0 is full, a new doubly linked list is
 * introduced. This list allows to select an eager from Q0 in O(1) time.
 * This is needed to avoid spending too much time walking through the
 * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of
 * this new list has to be a member of Q0.
 * This list is headed by listener's tcp_t. When the list is empty,
 * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0,
 * of listener's tcp_t point to listener's tcp_t itself.
 *
 * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
 * in the list. MAKE_UNDROPPABLE() takes the eager out of the list.
 * These macros do not affect the eager's membership to Q0.
 */
#define	MAKE_DROPPABLE(listener, eager)					\
	if ((eager)->tcp_eager_next_drop_q0 == NULL) {			\
		(listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\
		    = (eager);						\
		(eager)->tcp_eager_prev_drop_q0 = (listener);		\
		(eager)->tcp_eager_next_drop_q0 =			\
		    (listener)->tcp_eager_next_drop_q0;			\
		(listener)->tcp_eager_next_drop_q0 = (eager);		\
	}

#define	MAKE_UNDROPPABLE(eager)						\
	if ((eager)->tcp_eager_next_drop_q0 != NULL) {			\
		(eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0	\
		    = (eager)->tcp_eager_prev_drop_q0;			\
		(eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0	\
		    = (eager)->tcp_eager_next_drop_q0;			\
		(eager)->tcp_eager_prev_drop_q0 = NULL;			\
		(eager)->tcp_eager_next_drop_q0 = NULL;			\
	}

/*
 * The format argument to pass to tcp_display().
 * DISP_PORT_ONLY means that the returned string has only port info.
 * DISP_ADDR_AND_PORT means that the returned string also contains the
 * remote and local IP address.
 */
#define	DISP_PORT_ONLY		1
#define	DISP_ADDR_AND_PORT	2

/*
 * IP_ADDR_CACHE_HASH() maps an address to an index below IP_ADDR_CACHE_SIZE
 * by masking the address (converted with ntohl()) with (size - 1); the size
 * therefore has to be a power of 2.
 */
#define	IP_ADDR_CACHE_SIZE	2048
#define	IP_ADDR_CACHE_HASH(faddr)					\
	((IP_ADDR_CACHE_SIZE - 1) & ntohl(faddr))

/*
 * TCP reassembly macros.  We hide starting and ending sequence numbers in
 * b_next and b_prev of messages on the reassembly queue.  The messages are
 * chained using b_cont.  These macros are used in tcp_reass() so we don't
 * have to see the ugly casts and assignments.
 */
/* Starting sequence number, stored in the b_next pointer of mp. */
#define	TCP_REASS_SEQ(mp)	\
	((uint32_t)(uintptr_t)((mp)->b_next))
#define	TCP_REASS_SET_SEQ(mp, u)	\
	((mp)->b_next = (mblk_t *)(uintptr_t)(u))
/* Ending sequence number, stored in the b_prev pointer of mp. */
#define	TCP_REASS_END(mp)	\
	((uint32_t)(uintptr_t)((mp)->b_prev))
#define	TCP_REASS_SET_END(mp, u)	\
	((mp)->b_prev = (mblk_t *)(uintptr_t)(u))

/*
 * Shorthand names for the tunables stored in tcps_propinfo_tbl[].  Each name
 * expands to a member (current, minimum or maximum value) of one table
 * entry, so it is used as a field of the structure that holds the table,
 * e.g. tcps->tcps_rexmit_interval_extra.
 * NOTE(review): the indices presumably have to match the order of the
 * property table defined elsewhere -- confirm before adding or renumbering
 * entries.
 */
#define	tcps_time_wait_interval		tcps_propinfo_tbl[0].prop_cur_uval
#define	tcps_conn_req_max_q		tcps_propinfo_tbl[1].prop_cur_uval
#define	tcps_conn_req_max_q0		tcps_propinfo_tbl[2].prop_cur_uval
#define	tcps_conn_req_min		tcps_propinfo_tbl[3].prop_cur_uval
#define	tcps_conn_grace_period		tcps_propinfo_tbl[4].prop_cur_uval
#define	tcps_cwnd_max_			tcps_propinfo_tbl[5].prop_cur_uval
#define	tcps_dbg			tcps_propinfo_tbl[6].prop_cur_uval
#define	tcps_smallest_nonpriv_port	tcps_propinfo_tbl[7].prop_cur_uval
#define	tcps_ip_abort_cinterval		tcps_propinfo_tbl[8].prop_cur_uval
#define	tcps_ip_abort_linterval		tcps_propinfo_tbl[9].prop_cur_uval
#define	tcps_ip_abort_interval		tcps_propinfo_tbl[10].prop_cur_uval
#define	tcps_ip_notify_cinterval	tcps_propinfo_tbl[11].prop_cur_uval
#define	tcps_ip_notify_interval		tcps_propinfo_tbl[12].prop_cur_uval
#define	tcps_ipv4_ttl			tcps_propinfo_tbl[13].prop_cur_uval
#define	tcps_keepalive_interval_high	tcps_propinfo_tbl[14].prop_max_uval
#define	tcps_keepalive_interval		tcps_propinfo_tbl[14].prop_cur_uval
#define	tcps_keepalive_interval_low	tcps_propinfo_tbl[14].prop_min_uval
#define	tcps_maxpsz_multiplier		tcps_propinfo_tbl[15].prop_cur_uval
#define	tcps_mss_def_ipv4		tcps_propinfo_tbl[16].prop_cur_uval
#define	tcps_mss_max_ipv4		tcps_propinfo_tbl[17].prop_cur_uval
#define	tcps_mss_min			tcps_propinfo_tbl[18].prop_cur_uval
#define	tcps_naglim_def			tcps_propinfo_tbl[19].prop_cur_uval
#define	tcps_rexmit_interval_initial_high	\
					tcps_propinfo_tbl[20].prop_max_uval
#define	tcps_rexmit_interval_initial	tcps_propinfo_tbl[20].prop_cur_uval
#define	tcps_rexmit_interval_initial_low	\
					tcps_propinfo_tbl[20].prop_min_uval
#define	tcps_rexmit_interval_max_high	tcps_propinfo_tbl[21].prop_max_uval
#define	tcps_rexmit_interval_max	tcps_propinfo_tbl[21].prop_cur_uval
#define	tcps_rexmit_interval_max_low	tcps_propinfo_tbl[21].prop_min_uval
#define	tcps_rexmit_interval_min_high	tcps_propinfo_tbl[22].prop_max_uval
#define	tcps_rexmit_interval_min	tcps_propinfo_tbl[22].prop_cur_uval
#define	tcps_rexmit_interval_min_low	tcps_propinfo_tbl[22].prop_min_uval
#define	tcps_deferred_ack_interval	tcps_propinfo_tbl[23].prop_cur_uval
#define	tcps_snd_lowat_fraction		tcps_propinfo_tbl[24].prop_cur_uval
#define	tcps_dupack_fast_retransmit	tcps_propinfo_tbl[25].prop_cur_uval
#define	tcps_ignore_path_mtu		tcps_propinfo_tbl[26].prop_cur_bval
#define	tcps_smallest_anon_port		tcps_propinfo_tbl[27].prop_cur_uval
#define	tcps_largest_anon_port		tcps_propinfo_tbl[28].prop_cur_uval
#define	tcps_xmit_hiwat			tcps_propinfo_tbl[29].prop_cur_uval
#define	tcps_xmit_lowat			tcps_propinfo_tbl[30].prop_cur_uval
#define	tcps_recv_hiwat			tcps_propinfo_tbl[31].prop_cur_uval
#define	tcps_recv_hiwat_minmss		tcps_propinfo_tbl[32].prop_cur_uval
#define	tcps_fin_wait_2_flush_interval_high	\
					tcps_propinfo_tbl[33].prop_max_uval
#define	tcps_fin_wait_2_flush_interval	tcps_propinfo_tbl[33].prop_cur_uval
#define	tcps_fin_wait_2_flush_interval_low	\
					tcps_propinfo_tbl[33].prop_min_uval
#define	tcps_max_buf			tcps_propinfo_tbl[34].prop_cur_uval
#define	tcps_strong_iss			tcps_propinfo_tbl[35].prop_cur_uval
#define	tcps_rtt_updates		tcps_propinfo_tbl[36].prop_cur_uval
#define	tcps_wscale_always		tcps_propinfo_tbl[37].prop_cur_bval
#define	tcps_tstamp_always		tcps_propinfo_tbl[38].prop_cur_bval
#define	tcps_tstamp_if_wscale		tcps_propinfo_tbl[39].prop_cur_bval
#define	tcps_rexmit_interval_extra	tcps_propinfo_tbl[40].prop_cur_uval
#define	tcps_deferred_acks_max		tcps_propinfo_tbl[41].prop_cur_uval
#define	tcps_slow_start_after_idle	tcps_propinfo_tbl[42].prop_cur_uval
#define	tcps_slow_start_initial		tcps_propinfo_tbl[43].prop_cur_uval
#define	tcps_sack_permitted		tcps_propinfo_tbl[44].prop_cur_uval
#define	tcps_ipv6_hoplimit		tcps_propinfo_tbl[45].prop_cur_uval
#define	tcps_mss_def_ipv6		tcps_propinfo_tbl[46].prop_cur_uval
#define	tcps_mss_max_ipv6		tcps_propinfo_tbl[47].prop_cur_uval
#define	tcps_rev_src_routes		tcps_propinfo_tbl[48].prop_cur_bval
#define	tcps_local_dack_interval	tcps_propinfo_tbl[49].prop_cur_uval
#define	tcps_local_dacks_max		tcps_propinfo_tbl[50].prop_cur_uval
#define	tcps_ecn_permitted		tcps_propinfo_tbl[51].prop_cur_uval
#define	tcps_rst_sent_rate_enabled	tcps_propinfo_tbl[52].prop_cur_bval
#define	tcps_rst_sent_rate		tcps_propinfo_tbl[53].prop_cur_uval
#define	tcps_push_timer_interval	tcps_propinfo_tbl[54].prop_cur_uval
#define	tcps_use_smss_as_mss_opt	tcps_propinfo_tbl[55].prop_cur_bval
#define	tcps_keepalive_abort_interval_high \
					tcps_propinfo_tbl[56].prop_max_uval
#define	tcps_keepalive_abort_interval \
					tcps_propinfo_tbl[56].prop_cur_uval
#define	tcps_keepalive_abort_interval_low \
					tcps_propinfo_tbl[56].prop_min_uval
#define	tcps_wroff_xtra			tcps_propinfo_tbl[57].prop_cur_uval
#define	tcps_dev_flow_ctl		tcps_propinfo_tbl[58].prop_cur_bval
#define	tcps_reass_timeout		tcps_propinfo_tbl[59].prop_cur_uval
#define	tcps_iss_incr			tcps_propinfo_tbl[65].prop_cur_uval
#define	tcps_abc			tcps_propinfo_tbl[67].prop_cur_bval
#define	tcps_abc_l_var			tcps_propinfo_tbl[68].prop_cur_uval


/*
 * As defined in RFC 6298, the RTO is the average estimates (SRTT) plus a
 * multiple of the deviation estimates (K * RTTVAR):
 *
 * RTO = SRTT + max(G, K * RTTVAR)
 *
 * K is defined in the RFC as 4, and G is the clock granularity. We constrain
 * the minimum mean deviation to TCP_SD_MIN when processing new RTTs, so this
 * becomes:
 *
 * RTO = SRTT + 4 * RTTVAR
 *
 * In practice, however, we make several additions to it. As we use a finer
 * grained clock than BSD and update RTO for every ACK, we add in another 1/4 of
 * RTT to the deviation of RTO to accommodate burstiness of 1/4 of window size:
 *
 * RTO = SRTT + (SRTT / 4) + 4 * RTTVAR
 *
 * Since tcp_rtt_sa is 8 times the SRTT, and tcp_rtt_sd is 4 times the RTTVAR,
 * this becomes:
 *
 * RTO = (tcp_rtt_sa / 8) + ((tcp_rtt_sa / 8) / 4) + tcp_rtt_sd
 * RTO = (tcp_rtt_sa / 2^3) + (tcp_rtt_sa / 2^5) + tcp_rtt_sd
 * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd
 *
 * The "tcp_rexmit_interval_extra" and "tcp_conn_grace_period" tunables are
 * used to help account for extreme environments where the algorithm fails to
 * work; by default they should be 0. (The latter tunable is only used for
 * calculating the initial RTO, and so is optionally passed in as "extra".) We
 * add them here:
 *
 * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd +
 *     tcps_rexmit_interval_extra + tcps_conn_grace_period
 *
 * We then pin the RTO within our configured boundaries (sections 2.4 and 2.5
 * of RFC 6298).
 */
static __GNU_INLINE clock_t
tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra)
{
	/*
	 * Sum the SRTT term (tcp_rtt_sa / 8), the additional quarter of SRTT
	 * (tcp_rtt_sa / 32) and the deviation term (tcp_rtt_sd), pass that
	 * through NSEC2MSEC(), and only then add the tunable padding
	 * (tcps_rexmit_interval_extra) and the caller-supplied "extra".
	 */
	clock_t timeout = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) +
	    (tcp->tcp_rtt_sa >> 5) + tcp->tcp_rtt_sd) +
	    tcps->tcps_rexmit_interval_extra + extra;

	/*
	 * Pin the result to this connection's bounds.  The lower bound is
	 * tested first, so it wins if both tests would be true.
	 */
	if (timeout < tcp->tcp_rto_min) {
		return (tcp->tcp_rto_min);
	}
	if (timeout > tcp->tcp_rto_max) {
		return (tcp->tcp_rto_max);
	}

	return (timeout);
}

/*
 * Globals declared here and defined elsewhere.
 * NOTE(review): tcp_rinitv4/tcp_rinitv6 are presumably the IPv4 and IPv6
 * read-side STREAMS qinit structures, and do_tcp_fusion the global switch for
 * TCP fusion -- confirm at their definitions.
 */
extern struct qinit tcp_rinitv4, tcp_rinitv6;
extern boolean_t do_tcp_fusion;

/*
 * Option database object for TCP.  It is what gets passed to the
 * {sock,tpi}optcom_req() interface routines, which search it to carry out
 * option management and the associated methods.
 */
extern optdb_obj_t	tcp_opt_obj;
extern uint_t		tcp_max_optsize;

extern int tcp_squeue_flag;

extern uint_t tcp_free_list_max_cnt;

/*
 * Functions in tcp.c.
 *
 * Declarations only.  tcp_clean_death_wrapper() and tcp_eager_kill() are the
 * two routines in this group with the
 * (void *, mblk_t *, void *, ip_recv_attr_t *) signature that also appears on
 * several input/output routines declared in this header.
 * NOTE(review): that is presumably the squeue callback form -- confirm
 * against the squeue interface.
 */
extern void	tcp_acceptor_hash_insert(t_uscalar_t, tcp_t *);
extern tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t, tcp_stack_t *);
extern void	tcp_acceptor_hash_remove(tcp_t *);
extern mblk_t	*tcp_ack_mp(tcp_t *);
extern int	tcp_build_hdrs(tcp_t *);
extern void	tcp_cleanup(tcp_t *);
extern int	tcp_clean_death(tcp_t *, int);
extern void	tcp_clean_death_wrapper(void *, mblk_t *, void *,
		    ip_recv_attr_t *);
extern void	tcp_close_common(conn_t *, int);
extern void	tcp_close_detached(tcp_t *);
extern void	tcp_close_mpp(mblk_t **);
extern void	tcp_closei_local(tcp_t *);
extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **,
		    uint_t *, int *, int, cred_t *);
extern conn_t	*tcp_create_common(cred_t *, boolean_t, boolean_t, int *);
extern void	tcp_disconnect(tcp_t *, mblk_t *);
extern char	*tcp_display(tcp_t *, char *, char);
extern int	tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
		    boolean_t);
extern int	tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
		    cred_t *, pid_t);
extern int	tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int,
		    cred_t *, boolean_t);
extern int	tcp_do_unbind(conn_t *);
extern boolean_t	tcp_eager_blowoff(tcp_t *, t_scalar_t);
extern void	tcp_eager_cleanup(tcp_t *, boolean_t);
extern void	tcp_eager_kill(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void	tcp_eager_unlink(tcp_t *);
extern void	tcp_init_values(tcp_t *, tcp_t *);
extern void	tcp_ipsec_cleanup(tcp_t *);
extern int	tcp_maxpsz_set(tcp_t *, boolean_t);
extern void	tcp_mss_set(tcp_t *, uint32_t);
extern void	tcp_reinput(conn_t *, mblk_t *, ip_recv_attr_t *, ip_stack_t *);
extern int	tcp_rsrv(queue_t *);
extern uint_t	tcp_rwnd_reopen(tcp_t *);
extern int	tcp_rwnd_set(tcp_t *, uint32_t);
extern int	tcp_set_destination(tcp_t *);
extern void	tcp_set_ws_value(tcp_t *);
extern void	tcp_stop_lingering(tcp_t *);
extern boolean_t	tcp_update_pmtu(tcp_t *, boolean_t);
extern mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
extern boolean_t	tcp_zcopy_check(tcp_t *);
extern void	tcp_zcopy_notify(tcp_t *);
extern void	tcp_get_proto_props(tcp_t *, struct sock_proto_props *);

/*
 * Bind related functions in tcp_bind.c
 *
 * tcp_bind_check() takes the same parameter list as tcp_do_bind():
 * (conn_t *, struct sockaddr *, socklen_t, cred_t *, boolean_t).
 * tcp_rg_init() returns the tcp_rg_t * that the other tcp_rg_*() routines
 * take as their first argument.
 */
extern int	tcp_bind_check(conn_t *, struct sockaddr *, socklen_t,
		    cred_t *, boolean_t);
extern void	tcp_bind_hash_insert(tf_t *, tcp_t *, int);
extern void	tcp_bind_hash_remove(tcp_t *);
extern in_port_t	tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *,
			    int, boolean_t, boolean_t, boolean_t);
extern in_port_t	tcp_update_next_port(in_port_t, const tcp_t *,
			    boolean_t);
extern tcp_rg_t *tcp_rg_init(tcp_t *);
extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *);
extern void tcp_rg_destroy(tcp_rg_t *);
extern void tcp_rg_setactive(tcp_rg_t *, boolean_t);

/*
 * Fusion related functions in tcp_fusion.c.
 *
 * NOTE(review): tcp_iss_key_init() is listed in this group although nothing
 * in its name or signature (uint8_t *, int, tcp_stack_t *) refers to fusion;
 * confirm which source file actually defines it.
 */
extern void	tcp_fuse(tcp_t *, uchar_t *, tcpha_t *);
extern void	tcp_unfuse(tcp_t *);
extern boolean_t tcp_fuse_output(tcp_t *, mblk_t *, uint32_t);
extern void	tcp_fuse_output_urg(tcp_t *, mblk_t *);
extern boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **);
extern size_t	tcp_fuse_set_rcv_hiwat(tcp_t *, size_t);
extern int	tcp_fuse_maxpsz(tcp_t *);
extern void	tcp_fuse_backenable(tcp_t *);
extern void	tcp_iss_key_init(uint8_t *, int, tcp_stack_t *);

/*
 * Output related functions in tcp_output.c.
 *
 * tcp_close_output(), tcp_output(), tcp_output_urgent(), tcp_send_synack()
 * and tcp_shutdown_output() all take
 * (void *, mblk_t *, void *, ip_recv_attr_t *).  tcp_wput(), tcp_wput_sock()
 * and tcp_wput_fallback() all take (queue_t *, mblk_t *) and return int.
 *
 * NOTE(review): tcp_xmit_listeners_reset() is the only prototype in this
 * group that names a parameter ("i" on the ip_stack_t *); the name has no
 * effect on the declaration.
 */
extern void	tcp_close_output(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void	tcp_output(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void	tcp_output_urgent(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void	tcp_rexmit_after_error(tcp_t *);
extern void	tcp_sack_rexmit(tcp_t *, uint_t *);
extern void	tcp_send_data(tcp_t *, mblk_t *);
extern void	tcp_send_synack(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void	tcp_shutdown_output(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void	tcp_ss_rexmit(tcp_t *);
extern void	tcp_update_xmit_tail(tcp_t *, uint32_t);
extern int	tcp_wput(queue_t *, mblk_t *);
extern void	tcp_wput_data(tcp_t *, mblk_t *, boolean_t);
extern int	tcp_wput_sock(queue_t *, mblk_t *);
extern int	tcp_wput_fallback(queue_t *, mblk_t *);
extern void	tcp_xmit_ctl(char *, tcp_t *, uint32_t, uint32_t, int);
extern void	tcp_xmit_listeners_reset(mblk_t *, ip_recv_attr_t *,
		    ip_stack_t *i, conn_t *);
extern mblk_t	*tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *,
		    mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t);

/*
 * Input related functions in tcp_input.c.
 *
 * tcp_icmp_input(), tcp_input_data() and tcp_input_listener_unbound() all
 * take (void *, mblk_t *, void *, ip_recv_attr_t *).  cc_cong_signal() is the
 * one routine in this group without the tcp_ prefix.
 */
extern void	cc_cong_signal(tcp_t *, uint32_t, uint32_t);
extern void	tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void	tcp_input_data(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void	tcp_input_listener_unbound(void *, mblk_t *, void *,
		    ip_recv_attr_t *);
extern boolean_t	tcp_paws_check(tcp_t *, const tcp_opt_t *);
extern int	tcp_parse_options(tcpha_t *, tcp_opt_t *);
extern uint_t	tcp_rcv_drain(tcp_t *);
extern void	tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t, cred_t *);
extern boolean_t	tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
			    ip_recv_attr_t *);

/*
 * Kernel socket related functions in tcp_socket.c.
 *
 * NOTE(review): judging only by its parameter types (a sock_lower_handle_t,
 * a queue_t * and a so_proto_quiesced_cb_t callback), tcp_fallback() looks
 * like the socket-to-STREAMS fallback entry point; confirm in tcp_socket.c.
 */
extern int	tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
		    so_proto_quiesced_cb_t, sock_quiesce_arg_t *);
extern boolean_t tcp_newconn_notify(tcp_t *, ip_recv_attr_t *);

/*
 * Timer related functions in tcp_timers.c.
 *
 * tcp_ack_timer(), tcp_close_linger_timeout(), tcp_keepalive_timer(),
 * tcp_push_timer(), tcp_reass_timer() and tcp_timer() all have the
 * void (*)(void *) shape that tcp_timeout() accepts as its second argument.
 * tcp_timeout() returns a timeout_id_t, which is what tcp_timeout_cancel()
 * takes.
 * NOTE(review): the hrtime_t passed to tcp_timeout() is presumably the delay
 * in nanoseconds -- confirm at the definition.
 */
extern void	tcp_ack_timer(void *);
extern void	tcp_close_linger_timeout(void *);
extern void	tcp_keepalive_timer(void *);
extern void	tcp_push_timer(void *);
extern void	tcp_reass_timer(void *);
extern mblk_t	*tcp_timermp_alloc(int);
extern void	tcp_timermp_free(tcp_t *);
extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), hrtime_t);
extern clock_t	tcp_timeout_cancel(conn_t *, timeout_id_t);
extern void	tcp_timer(void *arg);
extern void	tcp_timers_stop(tcp_t *);

/*
 * TCP TPI related functions in tcp_tpi.c.
 *
 * Most of these take (tcp_t *, mblk_t *).  tcp_tpi_close() and
 * tcp_tpi_close_accept() share (queue_t *, int, cred_t *).
 * tcp_send_conn_ind() takes three arguments (void *, mblk_t *, void *),
 * whereas tcp_send_pending() takes the four-argument form with a trailing
 * ip_recv_attr_t *.
 */
extern void	tcp_addr_req(tcp_t *, mblk_t *);
extern void	tcp_capability_req(tcp_t *, mblk_t *);
extern boolean_t	tcp_conn_con(tcp_t *, uchar_t *, mblk_t *,
			    mblk_t **, ip_recv_attr_t *);
extern void	tcp_err_ack(tcp_t *, mblk_t *, int, int);
extern void	tcp_err_ack_prim(tcp_t *, mblk_t *, int, int, int);
extern void	tcp_info_req(tcp_t *, mblk_t *);
extern void	tcp_send_conn_ind(void *, mblk_t *, void *);
extern void	tcp_send_pending(void *, mblk_t *, void *, ip_recv_attr_t *);
extern int	tcp_tpi_accept(queue_t *, mblk_t *);
extern void	tcp_tpi_bind(tcp_t *, mblk_t *);
extern int	tcp_tpi_close(queue_t *, int, cred_t *);
extern int	tcp_tpi_close_accept(queue_t *, int, cred_t *);
extern void	tcp_tpi_connect(tcp_t *, mblk_t *);
extern int	tcp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
extern int	tcp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
		    uint_t *, uchar_t *, void *, cred_t *);
extern void	tcp_tpi_unbind(tcp_t *, mblk_t *);
extern void	tcp_tli_accept(tcp_t *, mblk_t *);
extern void	tcp_use_pure_tpi(tcp_t *);
extern void	tcp_do_capability_ack(tcp_t *, struct T_capability_ack *,
		    t_uscalar_t);

/*
 * TCP option processing related functions in tcp_opt_data.c
 *
 * Both routines take a conn_t * first.  tcp_opt_set() has the same parameter
 * list as tcp_tpi_opt_set() except that its first parameter is a conn_t *
 * rather than a queue_t *.
 */
extern int	tcp_opt_get(conn_t *, int, int, uchar_t *);
extern int	tcp_opt_set(conn_t *, uint_t, int, int, uint_t, uchar_t *,
		    uint_t *, uchar_t *, void *, cred_t *);

/*
 * TCP time wait processing related functions in tcp_time_wait.c.
 *
 * tcp_time_wait_remove() is the only routine here that also takes a
 * tcp_squeue_priv_t *.
 * NOTE(review): tcp_time_wait_collector() has the void (*)(void *) shape, so
 * it is presumably run as a timer/timeout callback -- confirm.
 */
extern void		tcp_time_wait_append(tcp_t *);
extern void		tcp_time_wait_collector(void *);
extern boolean_t	tcp_time_wait_remove(tcp_t *, tcp_squeue_priv_t *);
extern void		tcp_time_wait_processing(tcp_t *, mblk_t *, uint32_t,
			    uint32_t, int, tcpha_t *, ip_recv_attr_t *);

/*
 * Misc functions in tcp_misc.c.
 *
 * tcp_find_listener_conf() looks something up by in_port_t within a
 * tcp_stack_t and returns a uint32_t.
 * NOTE(review): the meaning of that value is not visible from this header;
 * see the definition.
 */
extern uint32_t	tcp_find_listener_conf(tcp_stack_t *, in_port_t);
extern void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
extern void	tcp_listener_conf_cleanup(tcp_stack_t *);
extern void	tcp_stack_cpu_add(tcp_stack_t *, processorid_t);

#endif	/* _KERNEL */

#ifdef	__cplusplus
}
#endif

#endif	/* _INET_TCP_IMPL_H */