summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSebastien Roy <seb@delphix.com>2017-08-01 13:21:40 -0400
committerRobert Mustacchi <rm@joyent.com>2019-08-23 18:42:52 +0000
commit45a4b79d042e642c2ed7090ec290469ccf8fc563 (patch)
tree3a2b9b0104d34bf6063ec1875142e69c1bc7a296
parent867a2ce85cd3f659cb7bc187ba93a095fe1df597 (diff)
downloadillumos-joyent-45a4b79d042e642c2ed7090ec290469ccf8fc563.tar.gz
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com> Reviewed by: Dan McDonald <danmcd@joyent.com> Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com> Approved by: Richard Lowe <richlowe@richlowe.net>
-rw-r--r--exception_lists/cstyle4
-rw-r--r--exception_lists/hdrchk2
-rw-r--r--exception_lists/packaging1
-rw-r--r--usr/src/lib/libipadm/common/ipadm_prop.c5
-rw-r--r--usr/src/man/man1m/ipadm.1m5
-rw-r--r--usr/src/pkg/manifests/system-kernel.mf7
-rw-r--r--usr/src/uts/Makefile.targ4
-rw-r--r--usr/src/uts/Makefile.uts10
-rw-r--r--usr/src/uts/common/Makefile.files8
-rw-r--r--usr/src/uts/common/Makefile.rules9
-rw-r--r--usr/src/uts/common/inet/Makefile6
-rw-r--r--usr/src/uts/common/inet/cc.h214
-rw-r--r--usr/src/uts/common/inet/cc/THIRDPARTYLICENSE29
-rw-r--r--usr/src/uts/common/inet/cc/THIRDPARTYLICENSE.descrip3
-rw-r--r--usr/src/uts/common/inet/cc/cc.c200
-rw-r--r--usr/src/uts/common/inet/cc/cc_cubic.c428
-rw-r--r--usr/src/uts/common/inet/cc/cc_cubic.h222
-rw-r--r--usr/src/uts/common/inet/cc/cc_module.h56
-rw-r--r--usr/src/uts/common/inet/cc/cc_newreno.c268
-rw-r--r--usr/src/uts/common/inet/cc/cc_sunreno.c222
-rw-r--r--usr/src/uts/common/inet/ip/ip.c1
-rw-r--r--usr/src/uts/common/inet/tcp.h10
-rw-r--r--usr/src/uts/common/inet/tcp/tcp.c33
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_input.c246
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_output.c19
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_timers.c33
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_tunables.c79
-rw-r--r--usr/src/uts/common/inet/tcp_impl.h5
-rw-r--r--usr/src/uts/common/inet/tcp_stack.h3
-rw-r--r--usr/src/uts/intel/Makefile.intel7
-rw-r--r--usr/src/uts/intel/cc/Makefile69
-rw-r--r--usr/src/uts/intel/cc_cubic/Makefile73
-rw-r--r--usr/src/uts/intel/cc_newreno/Makefile73
-rw-r--r--usr/src/uts/intel/cc_sunreno/Makefile73
-rw-r--r--usr/src/uts/intel/ip/Makefile10
-rw-r--r--usr/src/uts/sparc/Makefile.sparc8
-rw-r--r--usr/src/uts/sparc/cc/Makefile69
-rw-r--r--usr/src/uts/sparc/cc_cubic/Makefile73
-rw-r--r--usr/src/uts/sparc/cc_newreno/Makefile73
-rw-r--r--usr/src/uts/sparc/cc_sunreno/Makefile73
-rw-r--r--usr/src/uts/sparc/ip/Makefile21
41 files changed, 2626 insertions, 128 deletions
diff --git a/exception_lists/cstyle b/exception_lists/cstyle
index 06801920d4..3d2fc67caa 100644
--- a/exception_lists/cstyle
+++ b/exception_lists/cstyle
@@ -808,6 +808,10 @@ usr/src/uts/common/gssapi/mechs/krb5/mech/util_validate.c
usr/src/uts/common/gssapi/mechs/krb5/mech/val_cred.c
usr/src/uts/common/gssapi/mechs/krb5/mech/verify.c
usr/src/uts/common/gssapi/mechs/krb5/mech/wrap_size_limit.c
+usr/src/uts/common/inet/cc.h
+usr/src/uts/common/inet/cc/cc_cubic.c
+usr/src/uts/common/inet/cc/cc_module.h
+usr/src/uts/common/inet/cc/cc_newreno.c
usr/src/uts/common/io/bnx/570x/*
usr/src/uts/common/io/bnx/include/*
usr/src/uts/common/io/bnxe/577xx/common/bnxe_clc.c
diff --git a/exception_lists/hdrchk b/exception_lists/hdrchk
index ba30a7d6bb..9e8194ac8d 100644
--- a/exception_lists/hdrchk
+++ b/exception_lists/hdrchk
@@ -226,6 +226,8 @@ usr/src/uts/common/gssapi/mechs/krb5/include/krb5.h
usr/src/uts/common/gssapi/mechs/krb5/include/old.h
usr/src/uts/common/gssapi/mechs/krb5/include/raw.h
usr/src/uts/common/gssapi/mechs/krb5/include/rsa-md4.h
+usr/src/uts/common/inet/cc.h
+usr/src/uts/common/inet/cc/cc_module.h
usr/src/uts/common/io/axf/ax88172reg.h
usr/src/uts/common/io/bnx/570x/*
usr/src/uts/common/io/bnx/include/*
diff --git a/exception_lists/packaging b/exception_lists/packaging
index 8fb6466e79..1f91b551b4 100644
--- a/exception_lists/packaging
+++ b/exception_lists/packaging
@@ -137,6 +137,7 @@ usr/lib/llib-like.ln
usr/lib/amd64/llib-like.ln i386
usr/lib/sparcv9/llib-like.ln sparc
#
+usr/include/inet/cc.h
usr/include/inet/ip_impl.h
usr/include/inet/ip_ndp.h
usr/include/inet/ip2mac_impl.h
diff --git a/usr/src/lib/libipadm/common/ipadm_prop.c b/usr/src/lib/libipadm/common/ipadm_prop.c
index 0c3a25382f..4fc0dc0851 100644
--- a/usr/src/lib/libipadm/common/ipadm_prop.c
+++ b/usr/src/lib/libipadm/common/ipadm_prop.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
/*
@@ -153,6 +153,9 @@ static const char *ecn_sack_vals[] = {"never", "passive", "active", NULL};
/* Supported TCP protocol properties */
static ipadm_prop_desc_t ipadm_tcp_prop_table[] = {
+ { "congestion_control", NULL, IPADMPROP_CLASS_MODULE, MOD_PROTO_TCP, 0,
+ i_ipadm_set_prop, i_ipadm_get_prop, i_ipadm_get_prop },
+
{ "ecn", NULL, IPADMPROP_CLASS_MODULE, MOD_PROTO_TCP, 0,
i_ipadm_set_ecnsack, i_ipadm_get_ecnsack, i_ipadm_get_ecnsack },
diff --git a/usr/src/man/man1m/ipadm.1m b/usr/src/man/man1m/ipadm.1m
index 0381aa130d..8208657ddc 100644
--- a/usr/src/man/man1m/ipadm.1m
+++ b/usr/src/man/man1m/ipadm.1m
@@ -10,7 +10,7 @@
.\"
.\"
.\" Copyright (c) 2012, Joyent, Inc. All Rights Reserved
-.\" Copyright (c) 2013 by Delphix. All rights reserved.
+.\" Copyright (c) 2013, 2017 by Delphix. All rights reserved.
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright (c) 2016-2017, Chris Fraire <cfraire@me.com>.
.\" Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
@@ -777,6 +777,9 @@ syntax can be used to add/remove values from the current list of values on the
property.
The property name can be one of the following:
.Bl -tag -compact -width "smallest_nonpriv_port"
+.It Cm congestion_control
+The default congestion-control algorithm to be used for new connections
+.Pq TCP .
.It Cm ecn
Explicit congestion notification
.Pq Cm never Ns / Ns Cm passive Ns / Ns Cm active
diff --git a/usr/src/pkg/manifests/system-kernel.mf b/usr/src/pkg/manifests/system-kernel.mf
index 0aea96988d..02186d11de 100644
--- a/usr/src/pkg/manifests/system-kernel.mf
+++ b/usr/src/pkg/manifests/system-kernel.mf
@@ -21,6 +21,7 @@
#
# Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
# Copyright 2013 Saso Kiselkov. All rights reserved.
# Copyright 2015 Nexenta Systems, Inc. All rights reserved.
# Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
@@ -66,6 +67,8 @@ dir path=etc/crypto group=sys
dir path=etc/sock2path.d group=sys
dir path=kernel group=sys
$(i386_ONLY)dir path=kernel/$(ARCH64) group=sys
+dir path=kernel/cc group=sys
+dir path=kernel/cc/$(ARCH64) group=sys
dir path=kernel/crypto group=sys
dir path=kernel/crypto/$(ARCH64) group=sys
dir path=kernel/dacf group=sys
@@ -311,6 +314,9 @@ file path=etc/name_to_sysnum group=sys \
file path=etc/sock2path.d/system%2Fkernel group=sys
file path=etc/system group=sys original_name=SUNWckr:etc/system preserve=true
$(i386_ONLY)file path=kernel/$(ARCH64)/genunix group=sys mode=0755
+file path=kernel/cc/$(ARCH64)/cc_cubic group=sys mode=0755
+file path=kernel/cc/$(ARCH64)/cc_newreno group=sys mode=0755
+file path=kernel/cc/$(ARCH64)/cc_sunreno group=sys mode=0755
file path=kernel/crypto/$(ARCH64)/aes group=sys mode=0755
file path=kernel/crypto/$(ARCH64)/arcfour group=sys mode=0755
file path=kernel/crypto/$(ARCH64)/blowfish group=sys mode=0755
@@ -533,6 +539,7 @@ file path=kernel/misc/$(ARCH64)/bignum group=sys mode=0755
$(i386_ONLY)file path=kernel/misc/$(ARCH64)/bootdev group=sys mode=0755
file path=kernel/misc/$(ARCH64)/busra group=sys mode=0755
file path=kernel/misc/$(ARCH64)/cardbus group=sys mode=0755
+file path=kernel/misc/$(ARCH64)/cc group=sys mode=0755
file path=kernel/misc/$(ARCH64)/cmlb group=sys mode=0755
file path=kernel/misc/$(ARCH64)/consconfig group=sys mode=0755
file path=kernel/misc/$(ARCH64)/ctf group=sys mode=0755
diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ
index 80e4e7d115..c5c32caa19 100644
--- a/usr/src/uts/Makefile.targ
+++ b/usr/src/uts/Makefile.targ
@@ -22,6 +22,7 @@
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright 2014 Garrett D'Amore <garrett@damore.org>
# Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
+# Copyright (c) 2017 by Delphix. All rights reserved.
#
# This Makefile contains the common targets and definitions for
# all kernels. It is to be included in the Makefiles for specific
@@ -177,6 +178,9 @@ $(ROOT_DACF_DIR)/%: $(OBJS_DIR)/% $(ROOT_DACF_DIR) FRC
$(ROOT_BRAND_DIR)/%: $(OBJS_DIR)/% $(ROOT_BRAND_DIR) FRC
$(INS.file)
+$(ROOT_CC_DIR)/%: $(OBJS_DIR)/% $(ROOT_MOD_DIR) $(ROOT_CC_DIR) FRC
+ $(INS.file)
+
$(ROOT_CRYPTO_DIR)/%: $(OBJS_DIR)/% $(ROOT_CRYPTO_DIR) FRC
$(INS.file)
diff --git a/usr/src/uts/Makefile.uts b/usr/src/uts/Makefile.uts
index 5fe9959e65..033d50a146 100644
--- a/usr/src/uts/Makefile.uts
+++ b/usr/src/uts/Makefile.uts
@@ -22,7 +22,7 @@
#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2011 Bayard G. Bell. All rights reserved.
-# Copyright (c) 2011 by Delphix. All rights reserved.
+# Copyright (c) 2011,2017 by Delphix. All rights reserved.
# Copyright (c) 2013 Andrew Stormont. All rights reserved.
# Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
# Copyright (c) 2019, Joyent, Inc.
@@ -430,6 +430,7 @@ ROOT_FONT_DIR_32 = $(ROOT_MOD_DIR)/fonts
ROOT_DACF_DIR_32 = $(ROOT_MOD_DIR)/dacf
ROOT_CRYPTO_DIR_32 = $(ROOT_MOD_DIR)/crypto
ROOT_MAC_DIR_32 = $(ROOT_MOD_DIR)/mac
+ROOT_CC_DIR_32 = $(ROOT_MOD_DIR)/cc
ROOT_KICONV_DIR_32 = $(ROOT_MOD_DIR)/kiconv
ROOT_KERN_DIR_64 = $(ROOT_MOD_DIR)/$(SUBDIR64)
@@ -457,6 +458,7 @@ ROOT_FONT_DIR_64 = $(ROOT_MOD_DIR)/fonts/$(SUBDIR64)
ROOT_DACF_DIR_64 = $(ROOT_MOD_DIR)/dacf/$(SUBDIR64)
ROOT_CRYPTO_DIR_64 = $(ROOT_MOD_DIR)/crypto/$(SUBDIR64)
ROOT_MAC_DIR_64 = $(ROOT_MOD_DIR)/mac/$(SUBDIR64)
+ROOT_CC_DIR_64 = $(ROOT_MOD_DIR)/cc/$(SUBDIR64)
ROOT_KICONV_DIR_64 = $(ROOT_MOD_DIR)/kiconv/$(SUBDIR64)
ROOT_KERN_DIR = $(ROOT_KERN_DIR_$(CLASS))
@@ -484,6 +486,7 @@ ROOT_FONT_DIR = $(ROOT_FONT_DIR_$(CLASS))
ROOT_DACF_DIR = $(ROOT_DACF_DIR_$(CLASS))
ROOT_CRYPTO_DIR = $(ROOT_CRYPTO_DIR_$(CLASS))
ROOT_MAC_DIR = $(ROOT_MAC_DIR_$(CLASS))
+ROOT_CC_DIR = $(ROOT_CC_DIR_$(CLASS))
ROOT_KICONV_DIR = $(ROOT_KICONV_DIR_$(CLASS))
ROOT_FIRMWARE_DIR = $(ROOT_MOD_DIR)/firmware
@@ -502,6 +505,7 @@ ROOT_MOD_DIRS_32 += $(ROOT_EMLXS_FW_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_CPU_DIR_32) $(ROOT_FONT_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_TOD_DIR_32) $(ROOT_DACF_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_CRYPTO_DIR_32) $(ROOT_MAC_DIR_32)
+ROOT_MOD_DIRS_32 += $(ROOT_CC_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_KICONV_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_FIRMWARE_DIR)
@@ -595,7 +599,7 @@ PARALLEL_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \
$(MMU_KMODS) $(DACF_KMODS) $(EXPORT_KMODS) $(IPP_KMODS) \
$(CRYPTO_KMODS) $(PCBE_KMODS) \
$(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \
- $(BRAND_KMODS) $(KICONV_KMODS) \
+ $(BRAND_KMODS) $(KICONV_KMODS) $(CC_KMODS) \
$(SOCKET_KMODS)
KMODS = $(GENUNIX_KMODS) $(PARALLEL_KMODS)
@@ -607,7 +611,7 @@ LINT_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \
$(MACH_KMODS) $(GSS_KMODS) $(DACF_KMODS) $(IPP_KMODS) \
$(CRYPTO_KMODS) $(PCBE_KMODS) \
$(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \
- $(BRAND_KMODS) $(KICONV_KMODS) $(SOCKET_KMODS)
+ $(BRAND_KMODS) $(KICONV_KMODS) $(CC_KMODS) $(SOCKET_KMODS)
#
# Files to be compiled with -xa, to generate basic block execution
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index ebc1b2db90..9a5a48c4c8 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -488,6 +488,14 @@ BLKDEV_OBJS += blkdev.o
CARDBUS_OBJS += cardbus.o cardbus_hp.o cardbus_cfg.o
+CC_OBJS += cc.o
+
+CC_CUBIC_OBJS += cc_cubic.o
+
+CC_NEWRENO_OBJS += cc_newreno.o
+
+CC_SUNRENO_OBJS += cc_sunreno.o
+
CONSKBD_OBJS += conskbd.o
CONSMS_OBJS += consms.o
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index 983c5c359d..a489d8314a 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -25,7 +25,7 @@
# Copyright 2013 Saso Kiselkov. All rights reserved.
# Copyright 2019 Joyent, Inc.
# Copyright 2018 Nexenta Systems, Inc.
-# Copyright (c) 2016 by Delphix. All rights reserved.
+# Copyright (c) 2017 by Delphix. All rights reserved.
#
#
@@ -527,6 +527,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/dlpistub/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/cc/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1962,6 +1966,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/nca/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/dlpistub/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/cc/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/common/inet/Makefile b/usr/src/uts/common/inet/Makefile
index 14ce78a884..5bcbc7532f 100644
--- a/usr/src/uts/common/inet/Makefile
+++ b/usr/src/uts/common/inet/Makefile
@@ -1,4 +1,4 @@
-#
+#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
@@ -23,12 +23,14 @@
# Copyright 2010 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
+# Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+#
# uts/common/inet/Makefile
#
# include global definitions
include ../../../Makefile.master
-HDRS= arp.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \
+HDRS= arp.h cc.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \
ipsecah.h ipsecesp.h ipsec_info.h iptun.h ip6_asp.h ip_if.h ip_ire.h \
ip_multi.h ip_netinfo.h ip_ndp.h ip_rts.h ipsec_impl.h keysock.h \
led.h mi.h mib2.h nd.h optcom.h sadb.h sctp_itf.h snmpcom.h tcp.h \
diff --git a/usr/src/uts/common/inet/cc.h b/usr/src/uts/common/inet/cc.h
new file mode 100644
index 0000000000..170d0e7f8b
--- /dev/null
+++ b/usr/src/uts/common/inet/cc.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2007-2008
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University of Technology, by Lawrence Stewart and
+ * James Healy, made possible in part by a grant from the Cisco University
+ * Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This software was first released in 2007 by James Healy and Lawrence Stewart
+ * whilst working on the NewTCP research project at Swinburne University of
+ * Technology's Centre for Advanced Internet Architectures, Melbourne,
+ * Australia, which was made possible in part by a grant from the Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ * More details are available at:
+ * http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#ifndef _NETINET_CC_H_
+#define _NETINET_CC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <netinet/tcp.h>
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+
+#define CC_ALGO_NAME_MAX 16 /* max congestion control name length */
+
+#define CC_DEFAULT_ALGO_NAME "sunreno"
+
+struct tcp_s;
+struct sctp_s;
+
+/* CC housekeeping functions. */
+extern struct cc_algo *cc_load_algo(const char *name);
+extern int cc_register_algo(struct cc_algo *add_cc);
+extern int cc_deregister_algo(struct cc_algo *remove_cc);
+
+/*
+ * Wrapper around transport structs that contain same-named congestion
+ * control variables. Allows algos to be shared amongst multiple CC aware
+ * transports.
+ *
+ * In theory, this code (from FreeBSD) can be used to support pluggable
+ * congestion control for sctp as well as tcp. However, the support for sctp
+ * in FreeBSD is incomplete, and in practice "type" is ignored. cc_module.h
+ * provides a CCV macro which implementations can use to get a variable out of
+ * the protocol-appropriate structure.
+ *
+ * If FreeBSD eventually does extend support for pluggable congestion control
+ * to sctp, we'll need to make sure we're setting "type" appropriately or use
+ * a definition of CCV that ignores it.
+ */
+struct cc_var {
+ void *cc_data; /* Per-connection private algorithm data. */
+ int bytes_this_ack; /* # bytes acked by the current ACK. */
+ int t_bytes_acked; /* # bytes acked during current RTT */
+ tcp_seq curack; /* Most recent ACK. */
+ uint32_t flags; /* Flags for cc_var (see below) */
+ int type; /* Indicates which ptr is valid in ccvc. */
+ /* Transport structure wrapped by this cc_var; one member per protocol. */
+ union ccv_container {
+ struct tcp_s *tcp; /* TCP transport structure. */
+ struct sctp_s *sctp; /* SCTP transport structure. */
+ } ccvc;
+ uint16_t nsegs; /* # segments coalesced into current chain. */
+};
+
+/*
+ * cc_var flags.
+ *
+ * CCF_ABC_SENTAWND is set when a full congestion window of data has been ACKed
+ * according to the Appropriate Byte Counting spec, defined in RFC 3465.
+ */
+#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */
+#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */
+#define CCF_FASTRECOVERY 0x0004 /* in NewReno Fast Recovery */
+#define CCF_WASFRECOVERY 0x0008 /* was in NewReno Fast Recovery */
+#define CCF_CONGRECOVERY 0x0010 /* congestion recovery mode */
+#define CCF_WASCRECOVERY 0x0020 /* was in congestion recovery */
+/*
+ * In slow-start due to a retransmission timeout. This flag is enabled for the
+ * duration of the slow-start phase.
+ */
+#define CCF_RTO 0x0040 /* in slow-start due to timeout */
+
+/*
+ * Helpers for testing, setting and clearing the recovery bits in a cc_var
+ * flags word. The IN_* forms yield a non-zero value when the bit(s) are set;
+ * the ENTER_* and EXIT_* forms modify their argument in place, so it must be
+ * an lvalue. Arguments and bodies are fully parenthesized so that the macros
+ * expand safely when given compound expressions or used inside a larger
+ * expression.
+ */
+#define	IN_FASTRECOVERY(flags)		((flags) & CCF_FASTRECOVERY)
+#define	ENTER_FASTRECOVERY(flags)	((flags) |= CCF_FASTRECOVERY)
+#define	EXIT_FASTRECOVERY(flags)	((flags) &= ~CCF_FASTRECOVERY)
+
+#define	IN_CONGRECOVERY(flags)		((flags) & CCF_CONGRECOVERY)
+#define	ENTER_CONGRECOVERY(flags)	((flags) |= CCF_CONGRECOVERY)
+#define	EXIT_CONGRECOVERY(flags)	((flags) &= ~CCF_CONGRECOVERY)
+
+#define	IN_RECOVERY(flags)	\
+	((flags) & (CCF_CONGRECOVERY | CCF_FASTRECOVERY))
+#define	ENTER_RECOVERY(flags)	\
+	((flags) |= (CCF_CONGRECOVERY | CCF_FASTRECOVERY))
+#define	EXIT_RECOVERY(flags)	\
+	((flags) &= ~(CCF_CONGRECOVERY | CCF_FASTRECOVERY))
+
+/*
+ * ACK types passed to the ack_received() hook.
+ *
+ * CC_ACK is passed when an ACK acknowledges previously unACKed data.
+ * CC_DUPACK is passed when a duplicate ACK is received. The conditions under
+ * which an ACK is considered a duplicate ACK are defined in RFC 5681.
+ */
+#define CC_ACK 0x0001 /* Regular in sequence ACK. */
+#define CC_DUPACK 0x0002 /* Duplicate ACK. */
+#define CC_PARTIALACK 0x0004 /* Not yet. */
+#define CC_SACK 0x0008 /* Not yet. */
+
+/*
+ * Congestion signal types passed to the cong_signal() hook. The highest order 8
+ * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own
+ * congestion signal types.
+ *
+ * The congestion signals defined here cover the following situations:
+ * CC_ECN: A packet with an Explicit Congestion Notification was received
+ * See RFC 3168.
+ * CC_RTO: A retransmission timeout occurred.
+ * CC_RTO_ERR: An ACK was received for a sequence number after we fired an RTO
+ * for that sequence number
+ * CC_NDUPACK: Trigger fast retransmit based on the assumption that receiving
+ * N duplicate ACKs indicates packet loss rather than reordering. Fast
+ * retransmit is followed by fast recovery. Fast retransmit and recovery
+ * were originally described in RFC 2581 and were updated by RFC3782
+ * (NewReno). In both RFC2581 and RFC3782, N is 3.
+ */
+#define CC_ECN 0x00000001 /* ECN marked packet received. */
+#define CC_RTO 0x00000002 /* RTO fired. */
+#define CC_RTO_ERR 0x00000004 /* RTO fired in error. */
+#define CC_NDUPACK 0x00000008 /* Threshold of dupack's reached. */
+
+#define CC_SIGPRIVMASK 0xFF000000 /* Mask to check if sig is private. */
+
+/*
+ * Structure to hold data and function pointers that together represent a
+ * congestion control algorithm.
+ */
+struct cc_algo {
+ /*
+ * NUL-terminated algorithm name. Registration rejects an empty name or
+ * one that is not terminated within CC_ALGO_NAME_MAX bytes, and lookups
+ * and duplicate checks compare on this field.
+ */
+ char name[CC_ALGO_NAME_MAX];
+
+ /* Init CC state for a new control block. */
+ int (*cb_init)(struct cc_var *ccv);
+
+ /* Cleanup CC state for a terminating control block. */
+ void (*cb_destroy)(struct cc_var *ccv);
+
+ /* Init variables for a newly established connection. */
+ void (*conn_init)(struct cc_var *ccv);
+
+ /* Called on receipt of an ack. */
+ void (*ack_received)(struct cc_var *ccv, uint16_t type);
+
+ /* Called on detection of a congestion signal. */
+ void (*cong_signal)(struct cc_var *ccv, uint32_t type);
+
+ /* Called after exiting congestion recovery. */
+ void (*post_recovery)(struct cc_var *ccv);
+
+ /* Called when data transfer resumes after an idle period. */
+ void (*after_idle)(struct cc_var *ccv);
+
+ /* Linkage on the list of registered algorithms. */
+ STAILQ_ENTRY(cc_algo) entries;
+};
+
+typedef int cc_walk_func_t(void *, struct cc_algo *);
+extern int cc_walk_algos(cc_walk_func_t *, void *);
+
+/* Macro to obtain the CC algo's struct ptr. */
+#define CC_ALGO(tp) ((tp)->tcp_cc_algo)
+
+/* Macro to obtain the CC algo's data ptr. */
+#define CC_DATA(tp) ((tp)->tcp_ccv.cc_data)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NETINET_CC_H_ */
diff --git a/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE b/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE
new file mode 100644
index 0000000000..d2cdf5164e
--- /dev/null
+++ b/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE
@@ -0,0 +1,29 @@
+This software was developed by Lawrence Stewart while studying at the Centre
+for Advanced Internet Architectures, Swinburne University of Technology, made
+possible in part by a grant from the Cisco University Research Program Fund
+at Community Foundation Silicon Valley.
+
+Portions of this software were developed at the Centre for Advanced
+Internet Architectures, Swinburne University of Technology, Melbourne,
+Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
diff --git a/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE.descrip
new file mode 100644
index 0000000000..4740689711
--- /dev/null
+++ b/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE.descrip
@@ -0,0 +1,3 @@
+The congestion control framework, its header, and the CUBIC and NewReno
+congestion control modules came from FreeBSD, and are therefore licensed
+under the 2-Clause BSD License.
diff --git a/usr/src/uts/common/inet/cc/cc.c b/usr/src/uts/common/inet/cc/cc.c
new file mode 100644
index 0000000000..7bb213f74e
--- /dev/null
+++ b/usr/src/uts/common/inet/cc/cc.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2007-2008
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University of Technology, by Lawrence Stewart and
+ * James Healy, made possible in part by a grant from the Cisco University
+ * Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This software was first released in 2007 by James Healy and Lawrence Stewart
+ * whilst working on the NewTCP research project at Swinburne University of
+ * Technology's Centre for Advanced Internet Architectures, Melbourne,
+ * Australia, which was made possible in part by a grant from the Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ * More details are available at:
+ * http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/queue.h>
+#include <inet/cc.h>
+#include <inet/tcp.h>
+#include <sys/sdt.h>
+
+#define CC_KMODDIR "cc"
+
+/*
+ * List of available cc algorithms on the current system. Access is
+ * synchronized using cc_list_lock.
+ */
+static STAILQ_HEAD(cc_head, cc_algo) cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
+static kmutex_t cc_list_lock;
+
+static struct modlmisc cc_modlmisc = {
+ &mod_miscops,
+ "Pluggable Congestion Control Framework"
+};
+
+static struct modlinkage cc_modlinkage = {
+ MODREV_1,
+ &cc_modlmisc,
+ NULL
+};
+
+/*
+ * Module load entry point for the congestion control framework.
+ *
+ * Prepares the (initially empty) algorithm list and the mutex that guards
+ * it, then installs the module. cc_list_lock is initialised explicitly
+ * rather than relying on a zero-filled kmutex_t, and is destroyed again if
+ * mod_install() fails so that a failed load leaves nothing behind.
+ *
+ * Returns 0 on success, or the error reported by mod_install().
+ */
+int
+_init(void)
+{
+	int err;
+
+	STAILQ_INIT(&cc_list);
+	mutex_init(&cc_list_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	err = mod_install(&cc_modlinkage);
+	if (err != 0)
+		mutex_destroy(&cc_list_lock);
+
+	return (err);
+}
+
+/*
+ * Module unload entry point. Unconditionally returns EBUSY, i.e. an unload
+ * request is always refused.
+ * NOTE(review): presumably because registered algorithms may still point
+ * into the list kept here -- confirm.
+ */
+int
+_fini(void)
+{
+ return (EBUSY);
+}
+
+/*
+ * Module information entry point: passes modinfop to mod_info() together
+ * with this module's linkage structure and returns mod_info()'s result.
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&cc_modlinkage, modinfop));
+}
+
+/*
+ * Invoke func(cd, algo) for each registered algorithm, in list order, while
+ * holding cc_list_lock. The walk ends early as soon as a callback returns a
+ * non-zero value.
+ *
+ * Returns the value of the callback that stopped the walk, or 0 if every
+ * callback returned 0 (including when the list is empty).
+ */
+int
+cc_walk_algos(cc_walk_func_t *func, void *cd)
+{
+	struct cc_algo *cur;
+	int rv = 0;
+
+	mutex_enter(&cc_list_lock);
+	cur = STAILQ_FIRST(&cc_list);
+	while (cur != NULL) {
+		rv = func(cd, cur);
+		if (rv != 0)
+			break;
+		cur = STAILQ_NEXT(cur, entries);
+	}
+	mutex_exit(&cc_list_lock);
+
+	return (rv);
+}
+
+/*
+ * Look up a registered algorithm by name and return its set of operations,
+ * or NULL if no registered algorithm carries that name. Only the list of
+ * already-registered algorithms is consulted.
+ *
+ * Names reach this function from userland, so the string is not trusted to
+ * be NUL-terminated: anything that does not terminate within
+ * CC_ALGO_NAME_MAX bytes is rejected up front, and the comparison itself is
+ * bounded by the same limit.
+ */
+struct cc_algo *
+cc_load_algo(const char *name)
+{
+	struct cc_algo *match;
+
+	if (strnlen(name, CC_ALGO_NAME_MAX) >= CC_ALGO_NAME_MAX)
+		return (NULL);
+
+	mutex_enter(&cc_list_lock);
+	/* The cursor is left NULL when the end of the list is reached. */
+	for (match = STAILQ_FIRST(&cc_list); match != NULL;
+	    match = STAILQ_NEXT(match, entries)) {
+		if (strncmp(match->name, name, CC_ALGO_NAME_MAX) == 0)
+			break;
+	}
+	mutex_exit(&cc_list_lock);
+
+	return (match);
+}
+
+/*
+ * Remove an algorithm from the list of registered congestion control
+ * algorithms. The entry is matched by pointer identity, not by name.
+ *
+ * Returns 0 on success, or ENOENT if remove_cc is not currently on the list.
+ */
+int
+cc_deregister_algo(struct cc_algo *remove_cc)
+{
+	struct cc_algo *cur;
+	int err = ENOENT;
+
+	mutex_enter(&cc_list_lock);
+	/*
+	 * The plain iterator is sufficient: the walk stops immediately after
+	 * the removal, so the unlinked entry is never advanced past.
+	 */
+	STAILQ_FOREACH(cur, &cc_list, entries) {
+		if (cur == remove_cc) {
+			STAILQ_REMOVE(&cc_list, cur, cc_algo, entries);
+			err = 0;
+			break;
+		}
+	}
+	mutex_exit(&cc_list_lock);
+
+	return (err);
+}
+
+/*
+ * Append an algorithm to the list of registered congestion control
+ * algorithms.
+ *
+ * The name must be non-empty and NUL-terminated within CC_ALGO_NAME_MAX
+ * bytes, otherwise EINVAL is returned. If an algorithm with the same name is
+ * already registered, the list is left untouched and EEXIST is returned.
+ * Returns 0 once add_cc has been added.
+ */
+int
+cc_register_algo(struct cc_algo *add_cc)
+{
+	struct cc_algo *cur;
+	size_t len = strnlen(add_cc->name, CC_ALGO_NAME_MAX);
+	int rv = 0;
+
+	if (len == 0 || len >= CC_ALGO_NAME_MAX)
+		return (EINVAL);
+
+	mutex_enter(&cc_list_lock);
+
+	/* Check every registered algorithm for a name collision. */
+	for (cur = STAILQ_FIRST(&cc_list); cur != NULL;
+	    cur = STAILQ_NEXT(cur, entries)) {
+		if (strncmp(cur->name, add_cc->name, CC_ALGO_NAME_MAX) == 0)
+			rv = EEXIST;
+	}
+
+	/* Only a uniquely named algorithm is appended. */
+	if (rv == 0)
+		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
+
+	mutex_exit(&cc_list_lock);
+
+	return (rv);
+}
diff --git a/usr/src/uts/common/inet/cc/cc_cubic.c b/usr/src/uts/common/inet/cc/cc_cubic.c
new file mode 100644
index 0000000000..a4b8f29e18
--- /dev/null
+++ b/usr/src/uts/common/inet/cc/cc_cubic.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ *
+ * This software was developed by Lawrence Stewart while studying at the Centre
+ * for Advanced Internet Architectures, Swinburne University of Technology, made
+ * possible in part by a grant from the Cisco University Research Program Fund
+ * at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * An implementation of the CUBIC congestion control algorithm for FreeBSD,
+ * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
+ * Originally released as part of the NewTCP research project at Swinburne
+ * University of Technology's Centre for Advanced Internet Architectures,
+ * Melbourne, Australia, which was made possible in part by a grant from the
+ * Cisco University Research Program Fund at Community Foundation Silicon
+ * Valley. More details are available at:
+ * http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/time.h>
+
+#include <inet/tcp_impl.h>
+#include <inet/cc.h>
+#include <inet/cc/cc_cubic.h>
+#include <inet/cc/cc_module.h>
+
+/* Module description referenced by cc_cubic_modlinkage. */
+static struct modlmisc cc_cubic_modlmisc = {
+ &mod_miscops,
+ "Cubic Congestion Control"
+};
+
+/* Linkage passed to mod_install() in _init() and mod_info() in _info(). */
+static struct modlinkage cc_cubic_modlinkage = {
+ MODREV_1,
+ &cc_cubic_modlmisc,
+ NULL
+};
+
+/*
+ * cubic uses the NewReno implementation of after_idle and uses NewReno's
+ * ack_received callback during slow start.  The pointer is filled in by
+ * _init() via cc_load_algo("newreno").
+ */
+static struct cc_algo *newreno_cc_algo;
+
+/* Callbacks and private helpers defined later in this file. */
+static void cubic_ack_received(struct cc_var *ccv, uint16_t type);
+static void cubic_cb_destroy(struct cc_var *ccv);
+static int cubic_cb_init(struct cc_var *ccv);
+static void cubic_cong_signal(struct cc_var *ccv, uint32_t type);
+static void cubic_conn_init(struct cc_var *ccv);
+static void cubic_post_recovery(struct cc_var *ccv);
+static void cubic_record_rtt(struct cc_var *ccv);
+static void cubic_ssthresh_update(struct cc_var *ccv);
+
+/*
+ * Private CUBIC state.  One instance is allocated by cubic_cb_init() and
+ * stored in ccv->cc_data; cubic_cb_destroy() frees it.
+ */
+struct cubic {
+ /* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
+ int64_t K;
+ /* Sum of RTT samples across an epoch in nanoseconds. */
+ hrtime_t sum_rtt_nsecs;
+ /* cwnd at the most recent congestion event. */
+ uint32_t max_cwnd;
+ /* cwnd at the previous congestion event. */
+ uint32_t prev_max_cwnd;
+ /* Number of congestion events. */
+ uint32_t num_cong_events;
+ /* Minimum observed rtt in nanoseconds (TCPTV_SRTTBASE until sampled). */
+ hrtime_t min_rtt_nsecs;
+ /* Mean observed rtt between congestion epochs, in nanoseconds. */
+ hrtime_t mean_rtt_nsecs;
+ /* ACKs since last congestion event. */
+ int epoch_ack_count;
+ /* Time of last congestion event in nanoseconds. */
+ hrtime_t t_last_cong;
+};
+
+/*
+ * Operations vector registered under the name "cubic".  The after_idle
+ * member is not set here; _init() copies it from the NewReno algorithm.
+ */
+struct cc_algo cubic_cc_algo = {
+ .name = "cubic",
+ .ack_received = cubic_ack_received,
+ .cb_destroy = cubic_cb_destroy,
+ .cb_init = cubic_cb_init,
+ .cong_signal = cubic_cong_signal,
+ .conn_init = cubic_conn_init,
+ .post_recovery = cubic_post_recovery,
+};
+
+/*
+ * Module load entry point.
+ *
+ * Looks up the NewReno algorithm (CUBIC borrows its after_idle and, during
+ * slow start, its ack_received), registers "cubic" with the congestion
+ * control framework and installs the module.
+ *
+ * Returns 0 on success, EINVAL if NewReno is not registered, or the error
+ * from cc_register_algo() / mod_install().
+ */
+int
+_init(void)
+{
+	int err;
+
+	if ((newreno_cc_algo = cc_load_algo("newreno")) == NULL)
+		return (EINVAL);
+
+	/*
+	 * Complete the operations vector before it is published: once
+	 * cc_register_algo() succeeds, cc_load_algo() can hand the structure
+	 * to other callers, so after_idle must already be in place.
+	 */
+	cubic_cc_algo.after_idle = newreno_cc_algo->after_idle;
+
+	if ((err = cc_register_algo(&cubic_cc_algo)) != 0)
+		return (err);
+
+	/* Undo the registration if the module itself cannot be installed. */
+	if ((err = mod_install(&cc_cubic_modlinkage)) != 0)
+		(void) cc_deregister_algo(&cubic_cc_algo);
+
+	return (err);
+}
+
+/*
+ * Module unload entry point.  Unloading is not supported: this always
+ * returns EBUSY.
+ */
+int
+_fini(void)
+{
+ /* XXX Not unloadable for now */
+ return (EBUSY);
+}
+
+/*
+ * Module information entry point; forwards to mod_info() with this module's
+ * linkage and returns its result.
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&cc_cubic_modlinkage, modinfop));
+}
+
+/*
+ * ACK callback.
+ *
+ * An RTT sample is recorded on every call.  cwnd is only adjusted when the
+ * condition described below holds.  In slow start, or before a minimum RTT
+ * has been recorded, the work is delegated to NewReno's ack_received;
+ * otherwise cwnd follows the larger of the TCP-friendly and CUBIC window
+ * estimates.
+ *
+ * ccv:  congestion control state for the connection
+ * type: ACK type; only CC_ACK leads to a cwnd change
+ */
+static void
+cubic_ack_received(struct cc_var *ccv, uint16_t type)
+{
+ struct cubic *cubic_data;
+ uint32_t w_tf, w_cubic_next;
+ hrtime_t nsecs_since_cong;
+
+ cubic_data = ccv->cc_data;
+ cubic_record_rtt(ccv);
+
+ /*
+ * Regular ACK and we're not in cong/fast recovery and we're cwnd
+ * limited and we're either not doing ABC or are slow starting or are
+ * doing ABC and we've sent a cwnd's worth of bytes.
+ */
+ if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
+ (ccv->flags & CCF_CWND_LIMITED) && (!CC_ABC(ccv) ||
+ CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
+ (CC_ABC(ccv) && (ccv->flags & CCF_ABC_SENTAWND)))) {
+ /* Use the logic in NewReno ack_received() for slow start. */
+ if (CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) ||
+ cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)
+ newreno_cc_algo->ack_received(ccv, type);
+ else {
+ nsecs_since_cong = gethrtime() -
+ cubic_data->t_last_cong;
+
+ /*
+ * The mean RTT is used to best reflect the equations in
+ * the I-D. Using min_rtt in the tf_cwnd calculation
+ * causes w_tf to grow much faster than it should if the
+ * RTT is dominated by network buffering rather than
+ * propagation delay.
+ */
+ w_tf = tf_cwnd(nsecs_since_cong,
+ cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
+ CCV(ccv, tcp_mss));
+
+ /* CUBIC window projected one mean RTT into the future. */
+ w_cubic_next = cubic_cwnd(nsecs_since_cong +
+ cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd,
+ CCV(ccv, tcp_mss), cubic_data->K);
+
+ ccv->flags &= ~CCF_ABC_SENTAWND;
+
+ if (w_cubic_next < w_tf) {
+ /*
+ * TCP-friendly region, follow tf
+ * cwnd growth.
+ */
+ CCV(ccv, tcp_cwnd) = w_tf;
+ } else if (CCV(ccv, tcp_cwnd) < w_cubic_next) {
+ /*
+ * Concave or convex region, follow CUBIC
+ * cwnd growth.
+ */
+ if (CC_ABC(ccv))
+ CCV(ccv, tcp_cwnd) = w_cubic_next;
+ else
+ CCV(ccv, tcp_cwnd) += ((w_cubic_next -
+ CCV(ccv, tcp_cwnd)) *
+ CCV(ccv, tcp_mss)) /
+ CCV(ccv, tcp_cwnd);
+ }
+
+ /*
+ * If we're not in slow start and we're probing for a
+ * new cwnd limit at the start of a connection
+ * (happens when hostcache has a relevant entry),
+ * keep updating our current estimate of the
+ * max_cwnd.
+ */
+ if (cubic_data->num_cong_events == 0 &&
+ cubic_data->max_cwnd < CCV(ccv, tcp_cwnd))
+ cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
+ }
+ }
+}
+
+/*
+ * Release the per-connection CUBIC state, if any is attached.
+ */
+static void
+cubic_cb_destroy(struct cc_var *ccv)
+{
+	/* Nothing to free when no state is attached. */
+	if (ccv->cc_data == NULL)
+		return;
+
+	kmem_free(ccv->cc_data, sizeof (struct cubic));
+}
+
+/*
+ * Allocate and initialise the per-connection CUBIC state and attach it to
+ * ccv->cc_data.
+ *
+ * Returns 0 on success or ENOMEM if the (non-sleeping) allocation fails.
+ */
+static int
+cubic_cb_init(struct cc_var *ccv)
+{
+	struct cubic *cubic_data;
+
+	/*
+	 * The allocation must be zeroed: K, sum_rtt_nsecs, max_cwnd,
+	 * prev_max_cwnd, num_cong_events and epoch_ack_count are read or
+	 * incremented by the other callbacks before anything assigns them.
+	 */
+	cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP);
+
+	if (cubic_data == NULL)
+		return (ENOMEM);
+
+	/* Init some key variables with sensible defaults. */
+	cubic_data->t_last_cong = gethrtime();
+	cubic_data->min_rtt_nsecs = TCPTV_SRTTBASE;
+	/* Non-zero because tf_cwnd() divides by the mean RTT. */
+	cubic_data->mean_rtt_nsecs = 1;
+
+	ccv->cc_data = cubic_data;
+
+	return (0);
+}
+
+/*
+ * Perform any necessary tasks before we enter congestion recovery.
+ *
+ * ccv:  congestion control state for the connection
+ * type: congestion signal; CC_NDUPACK, CC_ECN and CC_RTO are handled and any
+ *       other value is ignored
+ *
+ * cwin and mss are sampled on entry, so max_cwnd records the cwnd from before
+ * this function modifies it.
+ */
+static void
+cubic_cong_signal(struct cc_var *ccv, uint32_t type)
+{
+ struct cubic *cubic_data;
+ uint32_t cwin;
+ uint32_t mss;
+
+ cubic_data = ccv->cc_data;
+ cwin = CCV(ccv, tcp_cwnd);
+ mss = CCV(ccv, tcp_mss);
+
+ switch (type) {
+ case CC_NDUPACK:
+ /*
+ * t_last_cong is not updated in this case; cubic_post_recovery()
+ * sets it.
+ */
+ if (!IN_FASTRECOVERY(ccv->flags)) {
+ if (!IN_CONGRECOVERY(ccv->flags)) {
+ cubic_ssthresh_update(ccv);
+ cubic_data->num_cong_events++;
+ cubic_data->prev_max_cwnd =
+ cubic_data->max_cwnd;
+ cubic_data->max_cwnd = cwin;
+ CCV(ccv, tcp_cwnd) =
+ CCV(ccv, tcp_cwnd_ssthresh);
+ }
+ ENTER_RECOVERY(ccv->flags);
+ }
+ break;
+
+ case CC_ECN:
+ if (!IN_CONGRECOVERY(ccv->flags)) {
+ cubic_ssthresh_update(ccv);
+ cubic_data->num_cong_events++;
+ cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
+ cubic_data->max_cwnd = cwin;
+ cubic_data->t_last_cong = gethrtime();
+ CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
+ ENTER_CONGRECOVERY(ccv->flags);
+ }
+ break;
+
+ case CC_RTO:
+ /*
+ * Grab the current time and record it so we know when the
+ * most recent congestion event was. Only record it when the
+ * timeout has fired more than once, as there is a reasonable
+ * chance the first one is a false alarm and may not indicate
+ * congestion.
+ */
+ if (CCV(ccv, tcp_timer_backoff) >= 2) {
+ cubic_data->num_cong_events++;
+ cubic_data->t_last_cong = gethrtime();
+ cubic_ssthresh_update(ccv);
+ cubic_data->max_cwnd = cwin;
+ /* Restart from a single segment. */
+ CCV(ccv, tcp_cwnd) = mss;
+ }
+ break;
+ }
+}
+
+/*
+ * Connection-initialisation callback: seed max_cwnd from the connection's
+ * current cwnd.
+ */
+static void
+cubic_conn_init(struct cc_var *ccv)
+{
+	struct cubic *state = ccv->cc_data;
+
+	/*
+	 * max_cwnd needs a sane starting value; otherwise bad things happen
+	 * when entries from the TCP hostcache get used.
+	 */
+	state->max_cwnd = CCV(ccv, tcp_cwnd);
+}
+
+/*
+ * Perform any necessary tasks before we exit congestion recovery.
+ *
+ * Applies the fast convergence reduction to max_cwnd, resets cwnd when
+ * leaving fast recovery, starts a new congestion epoch (timestamp, mean RTT,
+ * sample counters) and recomputes K for the new max_cwnd.
+ */
+static void
+cubic_post_recovery(struct cc_var *ccv)
+{
+	struct cubic *cubic_data;
+
+	cubic_data = ccv->cc_data;
+
+	/*
+	 * Fast convergence heuristic.  The product is formed in 64 bits:
+	 * max_cwnd is 32 bits wide and max_cwnd * CUBIC_FC_FACTOR would wrap
+	 * for windows larger than roughly 18 MB.
+	 */
+	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) {
+		cubic_data->max_cwnd = (uint32_t)(((uint64_t)
+		    cubic_data->max_cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT);
+	}
+
+	if (IN_FASTRECOVERY(ccv->flags)) {
+		/*
+		 * Update cwnd based on beta and adjusted max_cwnd, again
+		 * widening the multiplication to avoid 32-bit wrap-around.
+		 */
+		CCV(ccv, tcp_cwnd) = max(1, (uint32_t)(((uint64_t)CUBIC_BETA *
+		    cubic_data->max_cwnd) >> CUBIC_SHIFT));
+	}
+	cubic_data->t_last_cong = gethrtime();
+
+	/*
+	 * Calculate the average RTT between congestion epochs.  The second
+	 * test keeps the quotient at 1 or more, so mean_rtt_nsecs is never
+	 * set to zero here.
+	 */
+	if (cubic_data->epoch_ack_count > 0 &&
+	    cubic_data->sum_rtt_nsecs >= cubic_data->epoch_ack_count) {
+		cubic_data->mean_rtt_nsecs =
+		    (cubic_data->sum_rtt_nsecs / cubic_data->epoch_ack_count);
+	}
+
+	cubic_data->epoch_ack_count = 0;
+	cubic_data->sum_rtt_nsecs = 0;
+	/* cubic_k() takes the window in packets, not bytes. */
+	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));
+}
+
+/*
+ * Record the min RTT and sum samples for the epoch average RTT calculation.
+ *
+ * Nothing is recorded until the connection has taken at least
+ * CUBIC_MIN_RTT_SAMPLES RTT updates.
+ */
+static void
+cubic_record_rtt(struct cc_var *ccv)
+{
+	struct cubic *cubic_data;
+	/*
+	 * Kept as hrtime_t like the fields it feeds: a nanosecond count
+	 * stored in an int would overflow for an RTT above about 2.1 seconds.
+	 * NOTE(review): assumes tcp_rtt_sa is itself a 64-bit quantity;
+	 * confirm against tcp.h.
+	 */
+	hrtime_t t_srtt_nsecs;
+
+	/* Ignore srtt until a min number of samples have been taken. */
+	if (CCV(ccv, tcp_rtt_update) >= CUBIC_MIN_RTT_SAMPLES) {
+		cubic_data = ccv->cc_data;
+		/* tcp_rtt_sa is 8 * smoothed RTT in nanoseconds */
+		t_srtt_nsecs = CCV(ccv, tcp_rtt_sa) >> 3;
+
+		/*
+		 * Record the current SRTT as our minrtt if it's the smallest
+		 * we've seen or minrtt is currently equal to its initialized
+		 * value.
+		 *
+		 * XXXLAS: Should there be some hysteresis for minrtt?
+		 */
+		if (t_srtt_nsecs < cubic_data->min_rtt_nsecs ||
+		    cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE) {
+			/*
+			 * Clamp to at least 1 so a recorded minimum can never
+			 * equal the TCPTV_SRTTBASE "unset" value.
+			 */
+			cubic_data->min_rtt_nsecs =
+			    (t_srtt_nsecs > 1) ? t_srtt_nsecs : 1;
+
+			/*
+			 * If the connection is within its first congestion
+			 * epoch, ensure we prime mean_rtt_nsecs with a
+			 * reasonable value until the epoch average RTT is
+			 * calculated in cubic_post_recovery().
+			 */
+			if (cubic_data->min_rtt_nsecs >
+			    cubic_data->mean_rtt_nsecs)
+				cubic_data->mean_rtt_nsecs =
+				    cubic_data->min_rtt_nsecs;
+		}
+
+		/* Sum samples for epoch average RTT calculation. */
+		cubic_data->sum_rtt_nsecs += t_srtt_nsecs;
+		cubic_data->epoch_ack_count++;
+	}
+}
+
+/*
+ * Update the ssthresh in the event of congestion.
+ *
+ * Callers in this file invoke this before incrementing num_cong_events, so
+ * the first congestion event is seen here with a count of zero.
+ */
+static void
+cubic_ssthresh_update(struct cc_var *ccv)
+{
+	struct cubic *cubic_data;
+
+	cubic_data = ccv->cc_data;
+
+	/*
+	 * On the first congestion event, set ssthresh to cwnd * 0.5, on
+	 * subsequent congestion events, set it to cwnd * beta.
+	 *
+	 * The beta product is formed in 64 bits; in 32 bits
+	 * cwnd * CUBIC_BETA wraps once cwnd exceeds roughly 21 MB, which
+	 * would yield a far too small ssthresh.
+	 */
+	if (cubic_data->num_cong_events == 0)
+		CCV(ccv, tcp_cwnd_ssthresh) = CCV(ccv, tcp_cwnd) >> 1;
+	else
+		CCV(ccv, tcp_cwnd_ssthresh) = (uint32_t)
+		    (((uint64_t)CCV(ccv, tcp_cwnd) * CUBIC_BETA) >>
+		    CUBIC_SHIFT);
+}
diff --git a/usr/src/uts/common/inet/cc/cc_cubic.h b/usr/src/uts/common/inet/cc/cc_cubic.h
new file mode 100644
index 0000000000..c87751d257
--- /dev/null
+++ b/usr/src/uts/common/inet/cc/cc_cubic.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ *
+ * This software was developed by Lawrence Stewart while studying at the Centre
+ * for Advanced Internet Architectures, Swinburne University of Technology, made
+ * possible in part by a grant from the Cisco University Research Program Fund
+ * at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_CC_CUBIC_H_
+#define _NETINET_CC_CUBIC_H_
+
+/* Number of bits of precision for fixed point math calcs. */
+#define CUBIC_SHIFT 8
+
+/*
+ * 4 * CUBIC_SHIFT: the shift cubic_cwnd() applies to drop the scaling left
+ * by cubing a fixed point value and multiplying by CUBIC_C_FACTOR.
+ */
+#define CUBIC_SHIFT_4 32
+
+/* 0.5 << CUBIC_SHIFT. */
+#define RENO_BETA 128
+
+/* ~0.8 << CUBIC_SHIFT. */
+#define CUBIC_BETA 204
+
+/* ~0.2 << CUBIC_SHIFT. */
+#define ONE_SUB_CUBIC_BETA 51
+
+/* 3 * ONE_SUB_CUBIC_BETA. */
+#define THREE_X_PT2 153
+
+/* (2 << CUBIC_SHIFT) - ONE_SUB_CUBIC_BETA. */
+#define TWO_SUB_PT2 461
+
+/* ~0.4 << CUBIC_SHIFT. */
+#define CUBIC_C_FACTOR 102
+
+/* CUBIC fast convergence factor: ~0.9 << CUBIC_SHIFT. */
+#define CUBIC_FC_FACTOR 230
+
+/* Don't trust s_rtt until this many rtt samples have been taken. */
+#define CUBIC_MIN_RTT_SAMPLES 8
+
+/* Userland only bits. */
+#ifndef _KERNEL
+
+extern int hz;
+
+/*
+ * Implementation based on the formulae found in the CUBIC Internet Draft
+ * "draft-rhee-tcpm-cubic-02".
+ *
+ * Note BETA used in cc_cubic is equal to (1-beta) in the I-D
+ */
+
+/*
+ * Floating point reference for K: cbrt(wmax_pkts * 0.2 / C) with C = 0.4,
+ * scaled by 2^CUBIC_SHIFT.  The double result is narrowed to float on return.
+ */
+static __inline float
+theoretical_cubic_k(double wmax_pkts)
+{
+ double C;
+
+ C = 0.4;
+
+ return (pow((wmax_pkts * 0.2) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT));
+}
+
+/*
+ * Floating point reference for the CUBIC window in bytes:
+ * smss * (wmax_pkts + C * (t - K)^3), where t is ticks_since_cong / hz and
+ * K comes from theoretical_cubic_k() with its 2^CUBIC_SHIFT scaling removed.
+ */
+static __inline uint32_t
+theoretical_cubic_cwnd(int ticks_since_cong, uint32_t wmax, uint32_t smss)
+{
+ double C, wmax_pkts;
+
+ C = 0.4;
+ wmax_pkts = wmax / (double)smss;
+
+ return (smss * (wmax_pkts +
+ (C * pow(ticks_since_cong / (double)hz -
+ theoretical_cubic_k(wmax_pkts) / pow(2, CUBIC_SHIFT), 3.0))));
+}
+
+/*
+ * Floating point reference for a Reno-style window: half of wmax plus one
+ * smss for every rtt_ticks elapsed since the congestion event.
+ */
+static __inline uint32_t
+theoretical_reno_cwnd(int ticks_since_cong, int rtt_ticks, uint32_t wmax,
+ uint32_t smss)
+{
+
+ return ((wmax * 0.5) + ((ticks_since_cong / (float)rtt_ticks) * smss));
+}
+
+/*
+ * Floating point reference for the "TCP friendly" window:
+ * 0.8 * wmax + (3 * 0.2) / (2 - 0.2) * (elapsed RTTs) * smss.
+ */
+static __inline uint32_t
+theoretical_tf_cwnd(int ticks_since_cong, int rtt_ticks, unsigned long wmax,
+ uint32_t smss)
+{
+
+ return ((wmax * 0.8) + ((3 * 0.2) / (2 - 0.2) *
+ (ticks_since_cong / (float)rtt_ticks) * smss));
+}
+
+#endif /* !_KERNEL */
+
+/*
+ * Compute the CUBIC K value used in the cwnd calculation, using an
+ * implementation of eqn 2 in the I-D. The method used
+ * here is adapted from Apple Computer Technical Report #KT-32.
+ *
+ * wmax_pkts is the window at the last congestion event, in packets.  The
+ * result is K in fixed point form with CUBIC_SHIFT bits of precision.
+ */
+static __inline int64_t
+cubic_k(uint32_t wmax_pkts)
+{
+	int64_t s, K;
+	uint16_t p;
+
+	K = s = 0;
+	p = 0;
+
+	/*
+	 * (wmax * beta)/C with CUBIC_SHIFT worth of precision.  wmax_pkts is
+	 * widened first; in 32 bits the shifted product wraps once wmax_pkts
+	 * exceeds about 329000 packets.
+	 */
+	s = (((int64_t)wmax_pkts * ONE_SUB_CUBIC_BETA) << CUBIC_SHIFT) /
+	    CUBIC_C_FACTOR;
+
+	/* Rebase s to be between 1 and 1/8 with a shift of CUBIC_SHIFT. */
+	while (s >= 256) {
+		s >>= 3;
+		p++;
+	}
+
+	/*
+	 * Some magic constants taken from the Apple TR with appropriate
+	 * shifts: 275 == 1.072302 << CUBIC_SHIFT, 98 == 0.3812513 <<
+	 * CUBIC_SHIFT, 120 == 0.46946116 << CUBIC_SHIFT.
+	 */
+	K = (((s * 275) >> CUBIC_SHIFT) + 98) -
+	    (((s * s * 120) >> CUBIC_SHIFT) >> CUBIC_SHIFT);
+
+	/* Multiply by 2^p to undo the rebasing of s from above. */
+	return (K <<= p);
+}
+
+/*
+ * Compute the new cwnd value using an implementation of eqn 1 from the I-D.
+ * Thanks to Kip Macy for help debugging this function.
+ *
+ * nsecs_since_cong: time since the last congestion event, in nanoseconds
+ * wmax:             cwnd at the last congestion event
+ * smss:             sender maximum segment size
+ * K:                value from cubic_k(), CUBIC_SHIFT bits of precision
+ *
+ * The 64-bit intermediate result is truncated to 32 bits on return.
+ *
+ * XXXLAS: Characterise bounds for overflow.
+ */
+static __inline uint32_t
+cubic_cwnd(hrtime_t nsecs_since_cong, uint32_t wmax, uint32_t smss, int64_t K)
+{
+ int64_t t, cwnd;
+
+ /*
+ * Convert nsecs_since_cong to milliseconds, with CUBIC_SHIFT worth
+ * of precision.
+ */
+ t = NSEC2MSEC(nsecs_since_cong << CUBIC_SHIFT);
+
+ /*
+ * K is the time period in seconds that it will take to reach wmax. The
+ * value is kept in fixed point form with CUBIC_SHIFT worth of
+ * precision.
+ *
+ * For comparison with t, we convert K to milliseconds, and then convert
+ * the result back to seconds.
+ *
+ * cwnd = t - K, with CUBIC_SHIFT worth of precision.
+ */
+ cwnd = (t - K * MILLISEC) / MILLISEC;
+
+ /* cwnd = (t - K)^3, with CUBIC_SHIFT^3 worth of precision. */
+ cwnd *= (cwnd * cwnd);
+
+ /*
+ * C(t - K)^3 + wmax
+ * The down shift by CUBIC_SHIFT_4 is because cwnd has 4 lots of
+ * CUBIC_SHIFT included in the value. 3 from the cubing of cwnd above,
+ * and an extra from multiplying through by CUBIC_C_FACTOR.
+ */
+ cwnd = ((cwnd * CUBIC_C_FACTOR * smss) >> CUBIC_SHIFT_4) + wmax;
+
+ return ((uint32_t)cwnd);
+}
+
+/*
+ * Compute an approximation of the "TCP friendly" cwnd some number of
+ * nanoseconds after a congestion event that is designed to yield the same
+ * average cwnd as NewReno while using CUBIC's beta of 0.8. RTT should be the
+ * average RTT estimate for the path measured over the previous congestion
+ * epoch and wmax is the value of cwnd at the last congestion event.
+ *
+ * rtt_nsecs is used as a divisor and must be non-zero.
+ */
+static __inline uint32_t
+tf_cwnd(hrtime_t nsecs_since_cong, hrtime_t rtt_nsecs, uint32_t wmax,
+    uint32_t smss)
+{
+	int64_t base, growth;
+
+	/*
+	 * wmax * beta with CUBIC_SHIFT bits of precision.  wmax is widened
+	 * first; the 32-bit product wraps once wmax exceeds roughly 21 MB.
+	 */
+	base = (int64_t)wmax * CUBIC_BETA;
+
+	/*
+	 * Linear growth term of equation 4 of the I-D.
+	 * NOTE(review): the shifted product can itself exceed 64 bits for a
+	 * very long epoch (on the order of 160 seconds with a 1460-byte
+	 * smss); this is left as in the original formulation.
+	 */
+	growth = ((THREE_X_PT2 * nsecs_since_cong * smss) << CUBIC_SHIFT) /
+	    TWO_SUB_PT2 / rtt_nsecs;
+
+	return ((uint32_t)((base + growth) >> CUBIC_SHIFT));
+}
+
+#endif /* _NETINET_CC_CUBIC_H_ */
diff --git a/usr/src/uts/common/inet/cc/cc_module.h b/usr/src/uts/common/inet/cc/cc_module.h
new file mode 100644
index 0000000000..d0d6c83c36
--- /dev/null
+++ b/usr/src/uts/common/inet/cc/cc_module.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ *
+ * This software was developed by Lawrence Stewart while studying at the Centre
+ * for Advanced Internet Architectures, Swinburne University of Technology, made
+ * possible in part by a grant from the Cisco University Research Program Fund
+ * at Community Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This software was first released in 2009 by Lawrence Stewart as part of the
+ * NewTCP research project at Swinburne University of Technology's Centre for
+ * Advanced Internet Architectures, Melbourne, Australia, which was made
+ * possible in part by a grant from the Cisco University Research Program Fund
+ * at Community Foundation Silicon Valley. More details are available at:
+ * http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#ifndef _NETINET_CC_MODULE_H_
+#define _NETINET_CC_MODULE_H_
+
+/* Field `what` of the TCP connection reached through ccv->ccvc.tcp. */
+#define CCV(ccv, what) (ccv)->ccvc.tcp->what
+/* Field `what` of that connection's tcp_tcps structure. */
+#define CCSV(ccv, what) (ccv)->ccvc.tcp->tcp_tcps->what
+/* The TCP connection pointer itself. */
+#define CCV_PROTO(ccv) (ccv)->ccvc.tcp
+
+/* The tcps_abc and tcps_abc_l_var settings of the connection's tcp_tcps. */
+#define CC_ABC(ccv) (ccv)->ccvc.tcp->tcp_tcps->tcps_abc
+#define CC_ABC_L_VAR(ccv) (ccv)->ccvc.tcp->tcp_tcps->tcps_abc_l_var
+
+/* Value of an RTT field for which no sample has been recorded yet. */
+#define TCPTV_SRTTBASE 0
+
+#endif /* _NETINET_CC_MODULE_H_ */
diff --git a/usr/src/uts/common/inet/cc/cc_newreno.c b/usr/src/uts/common/inet/cc/cc_newreno.c
new file mode 100644
index 0000000000..ceb76d8643
--- /dev/null
+++ b/usr/src/uts/common/inet/cc/cc_newreno.c
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California.
+ * Copyright (c) 2007-2008,2010
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University of Technology, by Lawrence Stewart, James
+ * Healy and David Hayes, made possible in part by a grant from the Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This software was first released in 2007 by James Healy and Lawrence Stewart
+ * whilst working on the NewTCP research project at Swinburne University of
+ * Technology's Centre for Advanced Internet Architectures, Melbourne,
+ * Australia, which was made possible in part by a grant from the Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ * More details are available at:
+ * http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#include <sys/errno.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/cc.h>
+#include <inet/cc/cc_module.h>
+
+/* Callbacks defined later in this file. */
+static void newreno_ack_received(struct cc_var *ccv, uint16_t type);
+static void newreno_after_idle(struct cc_var *ccv);
+static void newreno_cong_signal(struct cc_var *ccv, uint32_t type);
+static void newreno_post_recovery(struct cc_var *ccv);
+
+/* Module description referenced by cc_newreno_modlinkage. */
+static struct modlmisc cc_newreno_modlmisc = {
+ &mod_miscops,
+ "New Reno Congestion Control"
+};
+
+/* Linkage passed to mod_install() in _init() and mod_info() in _info(). */
+static struct modlinkage cc_newreno_modlinkage = {
+ MODREV_1,
+ &cc_newreno_modlmisc,
+ NULL
+};
+
+/* Operations vector registered under the name "newreno". */
+struct cc_algo newreno_cc_algo = {
+ .name = "newreno",
+ .ack_received = newreno_ack_received,
+ .after_idle = newreno_after_idle,
+ .cong_signal = newreno_cong_signal,
+ .post_recovery = newreno_post_recovery,
+};
+
+/*
+ * Module load entry point: register the "newreno" algorithm, then install
+ * the module.  A failed install rolls the registration back.  Returns 0 on
+ * success or the error from whichever step failed.
+ */
+int
+_init(void)
+{
+	int rv;
+
+	rv = cc_register_algo(&newreno_cc_algo);
+	if (rv != 0)
+		return (rv);
+
+	rv = mod_install(&cc_newreno_modlinkage);
+	if (rv != 0)
+		(void) cc_deregister_algo(&newreno_cc_algo);
+
+	return (rv);
+}
+
+/*
+ * Module unload entry point.  Unloading is not supported: this always
+ * returns EBUSY.
+ */
+int
+_fini(void)
+{
+ /* XXX Not unloadable for now */
+ return (EBUSY);
+}
+
+/*
+ * Module information entry point; forwards to mod_info() with this module's
+ * linkage and returns its result.
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&cc_newreno_modlinkage, modinfop));
+}
+
+/*
+ * ACK callback: grow cwnd for a regular ACK received outside recovery while
+ * the connection is cwnd limited.  Any other call leaves cwnd untouched.
+ *
+ * ccv:  congestion control state for the connection
+ * type: ACK type; only CC_ACK is acted upon
+ */
+static void
+newreno_ack_received(struct cc_var *ccv, uint16_t type)
+{
+ if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
+ (ccv->flags & CCF_CWND_LIMITED)) {
+ uint_t cw = CCV(ccv, tcp_cwnd);
+ uint_t incr = CCV(ccv, tcp_mss);
+
+ /*
+ * Regular in-order ACK, open the congestion window.
+ * Method depends on which congestion control state we're
+ * in (slow start or cong avoid) and if ABC (RFC 3465) is
+ * enabled.
+ *
+ * slow start: cwnd <= ssthresh
+ * cong avoid: cwnd > ssthresh
+ *
+ * slow start and ABC (RFC 3465):
+ * Grow cwnd exponentially by the amount of data
+ * ACKed capping the max increment per ACK to
+ * (abc_l_var * maxseg) bytes.
+ *
+ * slow start without ABC (RFC 5681):
+ * Grow cwnd exponentially by maxseg per ACK.
+ *
+ * cong avoid and ABC (RFC 3465):
+ * Grow cwnd linearly by maxseg per RTT for each
+ * cwnd worth of ACKed data.
+ *
+ * cong avoid without ABC (RFC 5681):
+ * Grow cwnd linearly by approximately maxseg per RTT using
+ * maxseg^2 / cwnd per ACK as the increment.
+ * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
+ * avoid capping cwnd.
+ */
+ if (cw > CCV(ccv, tcp_cwnd_ssthresh)) {
+ if (CC_ABC(ccv)) {
+ if (ccv->flags & CCF_ABC_SENTAWND)
+ ccv->flags &= ~CCF_ABC_SENTAWND;
+ else
+ incr = 0;
+ } else
+ incr = max((incr * incr / cw), 1);
+ } else if (CC_ABC(ccv)) {
+ /*
+ * In slow-start with ABC enabled and no RTO in sight?
+ * (Must not use abc_l_var > 1 if slow starting after
+ * an RTO.)
+ */
+ if (ccv->flags & CCF_RTO) {
+ incr = min(ccv->bytes_this_ack,
+ CCV(ccv, tcp_mss));
+ } else {
+ incr = min(ccv->bytes_this_ack,
+ CC_ABC_L_VAR(ccv) * CCV(ccv, tcp_mss));
+ }
+
+ }
+ /* ABC is on by default, so incr equals 0 frequently. */
+ if (incr > 0)
+ CCV(ccv, tcp_cwnd) = min(cw + incr,
+ TCP_MAXWIN << CCV(ccv, tcp_snd_ws));
+ }
+}
+
+/*
+ * After-idle callback: clamp cwnd to the restart window.  cwnd is never
+ * increased here; the result is the smaller of the restart window and the
+ * current cwnd.
+ */
+static void
+newreno_after_idle(struct cc_var *ccv)
+{
+ int rw;
+
+ /*
+ * If we've been idle for more than one retransmit timeout the old
+ * congestion window is no longer current and we have to reduce it to
+ * the restart window before we can transmit again.
+ *
+ * The restart window is the initial window or the last CWND, whichever
+ * is smaller.
+ *
+ * This is done to prevent us from flooding the path with a full CWND at
+ * wirespeed, overloading router and switch buffers along the way.
+ *
+ * See RFC5681 Section 4.1. "Restarting Idle Connections".
+ */
+ if (CCV(ccv, tcp_init_cwnd) != 0) {
+ /*
+ * The TCP_INIT_CWND socket option was used to override the
+ * default.
+ */
+ rw = CCV(ccv, tcp_init_cwnd) * CCV(ccv, tcp_mss);
+ } else if (CCSV(ccv, tcps_slow_start_initial) != 0) {
+ /* The _slow_start_initial tunable was explicitly set. */
+ rw = min(TCP_MAX_INIT_CWND, CCSV(ccv, tcps_slow_start_initial))
+ * CCV(ccv, tcp_mss);
+ } else {
+ /* Do RFC 3390: min(4 * mss, max(2 * mss, 4380 bytes)). */
+ rw = min(4 * CCV(ccv, tcp_mss),
+ max(2 * CCV(ccv, tcp_mss), 4380));
+ }
+
+ CCV(ccv, tcp_cwnd) = min(rw, CCV(ccv, tcp_cwnd));
+}
+
+/*
+ * Perform any necessary tasks before we enter congestion recovery.
+ *
+ * ccv:  congestion control state for the connection
+ * type: congestion signal; CC_NDUPACK, CC_ECN and CC_RTO are handled and any
+ *       other value is ignored
+ */
+static void
+newreno_cong_signal(struct cc_var *ccv, uint32_t type)
+{
+ uint32_t cwin, ssthresh_on_loss;
+ uint32_t mss;
+
+ cwin = CCV(ccv, tcp_cwnd);
+ mss = CCV(ccv, tcp_mss);
+ /*
+ * Half of the outstanding data (snxt - suna), rounded down to whole
+ * segments, with a floor of two segments.
+ */
+ ssthresh_on_loss =
+ max((CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna)) / 2 / mss, 2)
+ * mss;
+
+ /* Catch algos which mistakenly leak private signal types. */
+ ASSERT((type & CC_SIGPRIVMASK) == 0);
+
+ /* Half of cwnd in whole segments, again at least two segments. */
+ cwin = max(cwin / 2 / mss, 2) * mss;
+
+ switch (type) {
+ case CC_NDUPACK:
+ if (!IN_FASTRECOVERY(ccv->flags)) {
+ if (!IN_CONGRECOVERY(ccv->flags)) {
+ CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
+ CCV(ccv, tcp_cwnd) = cwin;
+ }
+ ENTER_RECOVERY(ccv->flags);
+ }
+ break;
+ case CC_ECN:
+ if (!IN_CONGRECOVERY(ccv->flags)) {
+ CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
+ CCV(ccv, tcp_cwnd) = cwin;
+ ENTER_CONGRECOVERY(ccv->flags);
+ }
+ break;
+ case CC_RTO:
+ /* Restart from a single segment. */
+ CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
+ CCV(ccv, tcp_cwnd) = mss;
+ break;
+ }
+}
+
+/*
+ * Called just before congestion recovery ends.  When leaving fast recovery,
+ * cwnd is pulled back down to ssthresh if it is above it; in every other
+ * situation cwnd is left alone.
+ */
+static void
+newreno_post_recovery(struct cc_var *ccv)
+{
+	/* Fast recovery concludes once this function returns. */
+	if (IN_FASTRECOVERY(ccv->flags) &&
+	    CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh))
+		CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
+}
diff --git a/usr/src/uts/common/inet/cc/cc_sunreno.c b/usr/src/uts/common/inet/cc/cc_sunreno.c
new file mode 100644
index 0000000000..0a7a05206f
--- /dev/null
+++ b/usr/src/uts/common/inet/cc/cc_sunreno.c
@@ -0,0 +1,222 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * The TCP congestion control algorithm extracted from the pre-framework
+ * implementation of TCP congestion control.
+ */
+
+#include <sys/errno.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/cc.h>
+#include <inet/cc/cc_module.h>
+
+/* Hooks implemented below and published through sunreno_cc_algo. */
+static void sunreno_ack_received(struct cc_var *ccv, uint16_t type);
+static void sunreno_after_idle(struct cc_var *ccv);
+static void sunreno_cong_signal(struct cc_var *ccv, uint32_t type);
+static void sunreno_post_recovery(struct cc_var *ccv);
+
+/* Name stored in sunreno_cc_algo.name. */
+#define CC_SUNRENO_ALGO_NAME "sunreno"
+
+/* Loadable-module description passed to mod_install() via the linkage. */
+static struct modlmisc cc_sunreno_modlmisc = {
+ &mod_miscops,
+ "SUNReno Congestion Control"
+};
+
+/* Module linkage used by _init() and _info(). */
+static struct modlinkage cc_sunreno_modlinkage = {
+ MODREV_1,
+ &cc_sunreno_modlmisc,
+ NULL
+};
+
+/*
+ * Algorithm descriptor registered with cc_register_algo() in _init().  Only
+ * the four hooks listed here are provided; the remaining members are left
+ * at their zero-initialized defaults.
+ */
+struct cc_algo sunreno_cc_algo = {
+ .name = CC_SUNRENO_ALGO_NAME,
+ .ack_received = sunreno_ack_received,
+ .after_idle = sunreno_after_idle,
+ .cong_signal = sunreno_cong_signal,
+ .post_recovery = sunreno_post_recovery,
+};
+
+/*
+ * Module load entry point: register the algorithm with the CC framework,
+ * then install the module.  If installation fails the registration is
+ * undone so that no descriptor is left behind.
+ */
+int
+_init(void)
+{
+	int rc;
+
+	rc = cc_register_algo(&sunreno_cc_algo);
+	if (rc != 0)
+		return (rc);
+
+	rc = mod_install(&cc_sunreno_modlinkage);
+	if (rc != 0)
+		(void) cc_deregister_algo(&sunreno_cc_algo);
+
+	return (rc);
+}
+
+/*
+ * Module unload entry point.  Always reports EBUSY.
+ * NOTE(review): presumably this keeps the module loaded because connections
+ * may still point at sunreno_cc_algo -- confirm against the cc framework.
+ */
+int
+_fini(void)
+{
+ return (EBUSY);
+}
+
+/*
+ * Module information entry point: hands the request to mod_info() using
+ * this module's linkage and returns its result.
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&cc_sunreno_modlinkage, modinfop));
+}
+
+/*
+ * Grow the congestion window in response to an ACK.  Only CC_ACK events
+ * received outside of recovery change the window.
+ */
+static void
+sunreno_ack_received(struct cc_var *ccv, uint16_t type)
+{
+	uint32_t incr;
+	uint32_t cur_cwnd;
+
+	if (type != CC_ACK || IN_RECOVERY(ccv->flags))
+		return;
+
+	cur_cwnd = CCV(ccv, tcp_cwnd);
+	incr = CCV(ccv, tcp_mss);
+
+	if (cur_cwnd >= CCV(ccv, tcp_cwnd_ssthresh)) {
+		/*
+		 * At or above ssthresh, never grow tcp_cwnd by less than one
+		 * MSS: with a partial increase, tcp_wput_data() may send out
+		 * tinygrams in order to preserve mblk boundaries.
+		 *
+		 * tcp_cwnd_cnt is loaded with the new tcp_cwnd and reduced by
+		 * one MSS on each subsequent ACK, so tcp_cwnd gains one MSS
+		 * per RTT.
+		 */
+		if (CCV(ccv, tcp_cwnd_cnt) > 0) {
+			/* Still counting down: no growth on this ACK. */
+			CCV(ccv, tcp_cwnd_cnt) -= incr;
+			incr = 0;
+		} else {
+			CCV(ccv, tcp_cwnd_cnt) = cur_cwnd + incr;
+		}
+	}
+
+	CCV(ccv, tcp_cwnd) = MIN(cur_cwnd + incr, CCV(ccv, tcp_cwnd_max));
+}
+
+/*
+ * Reset the congestion window after the connection has been idle, using
+ * TCP_SET_INIT_CWND() with the tcps_slow_start_after_idle tunable.  When
+ * SACK blocks are pending, the space their option would occupy is taken
+ * out of the MSS that is passed along.
+ */
+static void
+sunreno_after_idle(struct cc_var *ccv)
+{
+	int mss = CCV(ccv, tcp_mss);
+
+	if (CCV(ccv, tcp_snd_sack_ok) && CCV(ccv, tcp_num_sack_blk) > 0) {
+		int32_t sack_blks;
+		int32_t sack_opt_len;
+
+		sack_blks = MIN(CCV(ccv, tcp_max_sack_blk),
+		    CCV(ccv, tcp_num_sack_blk));
+		/* SACK blocks plus two NOPs and the option header. */
+		sack_opt_len = sack_blks * sizeof (sack_blk_t) +
+		    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
+		mss -= sack_opt_len;
+	}
+
+	TCP_SET_INIT_CWND(CCV_PROTO(ccv), mss,
+	    CCSV(ccv, tcps_slow_start_after_idle));
+}
+
+/*
+ * React to a congestion signal (CC_NDUPACK, CC_ECN or CC_RTO) by adjusting
+ * ssthresh and cwnd before the connection enters congestion recovery.
+ */
+static void
+sunreno_cong_signal(struct cc_var *ccv, uint32_t type)
+{
+	int half_pkts;
+	int seg;
+
+	/* Catch algos which mistakenly leak private signal types. */
+	ASSERT((type & CC_SIGPRIVMASK) == 0);
+
+	seg = CCV(ccv, tcp_mss);
+	/* Half of the unacknowledged data, in whole segments. */
+	half_pkts = ((CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna)) >> 1) / seg;
+
+	switch (type) {
+	case CC_NDUPACK:
+		if (IN_FASTRECOVERY(ccv->flags))
+			break;
+		if (!IN_CONGRECOVERY(ccv->flags)) {
+			CCV(ccv, tcp_cwnd_ssthresh) = MAX(half_pkts, 2) * seg;
+			/* Inflate cwnd by the duplicate ACKs seen so far. */
+			CCV(ccv, tcp_cwnd) =
+			    (half_pkts + CCV(ccv, tcp_dupack_cnt)) * seg;
+		}
+		ENTER_RECOVERY(ccv->flags);
+		break;
+	case CC_ECN:
+		if (IN_CONGRECOVERY(ccv->flags) || CCV(ccv, tcp_cwr))
+			break;
+		CCV(ccv, tcp_cwnd_ssthresh) = MAX(half_pkts, 2) * seg;
+		CCV(ccv, tcp_cwnd) = half_pkts * seg;
+		if (CCV(ccv, tcp_cwnd) == 0) {
+			/*
+			 * With tcp_cwnd_cnt cleared, the next ACK handled
+			 * by sunreno_ack_received() grows tcp_cwnd by one
+			 * MSS.
+			 */
+			CCV(ccv, tcp_cwnd_cnt) = 0;
+		}
+		ENTER_CONGRECOVERY(ccv->flags);
+		break;
+	case CC_RTO:
+		/*
+		 * A retransmission timeout sends us back to slow start:
+		 * ssthresh becomes half of the current effective window,
+		 * cwnd becomes one MSS and tcp_cwnd_cnt is reset.
+		 *
+		 * If ssthresh was already reduced because of ECN it is not
+		 * reduced again, unless that reduction is a full window of
+		 * data in the past (tcp_cwr is clear by then) or the timeout
+		 * is for a segment that was itself a retransmission.
+		 */
+		if (!CCV(ccv, tcp_cwr) || CCV(ccv, tcp_rexmit)) {
+			/* On a backed-off timer, halve ssthresh instead. */
+			if (CCV(ccv, tcp_timer_backoff) != 0)
+				half_pkts = CCV(ccv, tcp_cwnd_ssthresh) / 2 /
+				    seg;
+			CCV(ccv, tcp_cwnd_ssthresh) = MAX(half_pkts, 2) * seg;
+		}
+		CCV(ccv, tcp_cwnd) = seg;
+		CCV(ccv, tcp_cwnd_cnt) = 0;
+		break;
+	}
+}
+
+/*
+ * Called as congestion recovery is about to end.
+ */
+static void
+sunreno_post_recovery(struct cc_var *ccv)
+{
+	/*
+	 * When leaving fast recovery, deflate the congestion window back to
+	 * ssthresh as per RFC 5681 section 3.2.
+	 */
+	if (IN_FASTRECOVERY(ccv->flags) &&
+	    CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh)) {
+		CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
+	}
+
+	/* The per-RTT growth counter always starts over after recovery. */
+	CCV(ccv, tcp_cwnd_cnt) = 0;
+}
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index c81331dc9f..5090f88a97 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -95,6 +95,7 @@
#include <netinet/igmp.h>
#include <netinet/ip_mroute.h>
#include <inet/ipp_common.h>
+#include <inet/cc.h>
#include <net/pfkeyv2.h>
#include <inet/sadb.h>
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index 7e3910e894..5058412c32 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -22,7 +22,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, Joyent, Inc. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -46,6 +46,7 @@ extern "C" {
#include <inet/mib2.h>
#include <inet/tcp_stack.h>
#include <inet/tcp_sack.h>
+#include <inet/cc.h>
/* TCP states */
#define TCPS_CLOSED -6
@@ -152,6 +153,9 @@ typedef struct tcp_s {
struct conn_s *tcp_connp; /* back pointer to conn_t */
tcp_stack_t *tcp_tcps; /* back pointer to tcp_stack_t */
+ struct cc_algo *tcp_cc_algo; /* congestion control algorithm */
+ struct cc_var tcp_ccv; /* congestion control specific vars */
+
int32_t tcp_state;
int32_t tcp_rcv_ws; /* My window scale power */
int32_t tcp_snd_ws; /* Sender's window scale power */
@@ -503,10 +507,10 @@ typedef struct tcp_s {
#endif
extern void tcp_conn_reclaim(void *);
-extern void tcp_free(tcp_t *tcp);
+extern void tcp_free(tcp_t *tcp);
extern void tcp_ddi_g_init(void);
extern void tcp_ddi_g_destroy(void);
-extern void *tcp_get_conn(void *arg, tcp_stack_t *);
+extern conn_t *tcp_get_conn(void *arg, tcp_stack_t *);
extern mblk_t *tcp_snmp_get(queue_t *, mblk_t *, boolean_t);
extern int tcp_snmp_set(queue_t *, int, int, uchar_t *, int len);
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index d7458c8eee..bfa08ada8c 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -23,7 +23,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, Joyent Inc. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -74,6 +74,7 @@
#include <inet/ipsec_impl.h>
#include <inet/common.h>
+#include <inet/cc.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
@@ -1409,6 +1410,10 @@ tcp_free(tcp_t *tcp)
*/
tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
+ /* Allow the CC algorithm to clean up after itself. */
+ if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL)
+ tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+
/*
* If this is a non-STREAM socket still holding on to an upper
* handle, release it. As a result of fallback we might also see
@@ -1455,7 +1460,7 @@ tcp_free(tcp_t *tcp)
* collector will free up the freelist is the connection ends up sitting
* there for too long.
*/
-void *
+conn_t *
tcp_get_conn(void *arg, tcp_stack_t *tcps)
{
tcp_t *tcp = NULL;
@@ -1494,7 +1499,7 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps)
connp->conn_recv = tcp_input_data;
ASSERT(connp->conn_recvicmp == tcp_icmp_input);
ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
- return ((void *)connp);
+ return (connp);
}
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
/*
@@ -1529,7 +1534,7 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps)
connp->conn_ixa->ixa_notify = tcp_notify;
connp->conn_ixa->ixa_notify_cookie = tcp;
- return ((void *)connp);
+ return (connp);
}
/*
@@ -2298,6 +2303,11 @@ tcp_reinit_values(tcp_t *tcp)
ASSERT(tcp->tcp_listen_cnt == NULL);
ASSERT(tcp->tcp_reass_tid == 0);
+ /* Allow the CC algorithm to clean up after itself. */
+ if (tcp->tcp_cc_algo->cb_destroy != NULL)
+ tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
+ tcp->tcp_cc_algo = NULL;
+
#undef DONTCARE
#undef PRESERVE
}
@@ -2318,7 +2328,12 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
(connp->conn_ipversion == IPV4_VERSION ||
connp->conn_ipversion == IPV6_VERSION)));
+ tcp->tcp_ccv.type = IPPROTO_TCP;
+ tcp->tcp_ccv.ccvc.tcp = tcp;
+
if (parent == NULL) {
+ tcp->tcp_cc_algo = tcps->tcps_default_cc_algo;
+
tcp->tcp_naglim = tcps->tcps_naglim_def;
tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
@@ -2346,6 +2361,8 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
*/
} else {
/* Inherit various TCP parameters from the parent. */
+ tcp->tcp_cc_algo = parent->tcp_cc_algo;
+
tcp->tcp_naglim = parent->tcp_naglim;
tcp->tcp_rto_initial = parent->tcp_rto_initial;
@@ -2372,6 +2389,9 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
}
+ if (tcp->tcp_cc_algo->cb_init != NULL)
+ VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0);
+
/*
* Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
* will be close to tcp_rexmit_interval_initial. By doing this, we
@@ -2616,7 +2636,7 @@ tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket,
}
sqp = IP_SQUEUE_GET((uint_t)gethrtime());
- connp = (conn_t *)tcp_get_conn(sqp, tcps);
+ connp = tcp_get_conn(sqp, tcps);
/*
* Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
* so we drop it by one.
@@ -3807,6 +3827,9 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
offsetof(tcp_listener_t, tl_link));
+ tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
+ VERIFY3P(tcps->tcps_default_cc_algo, !=, NULL);
+
return (tcps);
}
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index dd50c3f6ad..f7ea79da15 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -170,6 +170,133 @@ static void tcp_set_rto(tcp_t *, hrtime_t);
static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
/*
+ * CC wrapper hook functions
+ */
+/*
+ * Update the generic per-connection congestion control state for an
+ * incoming ACK and then give the active algorithm a chance to adjust cwnd.
+ *
+ * seg_ack is the acknowledgment number of the segment, bytes_acked the
+ * amount of data it newly acknowledged, and type the kind of ACK (for
+ * example CC_ACK or CC_DUPACK).
+ */
+static void
+cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
+    uint16_t type)
+{
+	struct cc_var *ccv = &tcp->tcp_ccv;
+	uint32_t cwnd_before = tcp->tcp_cwnd;
+
+	ccv->bytes_this_ack = bytes_acked;
+
+	/* Record whether cwnd, rather than the peer's window, limits us. */
+	if (tcp->tcp_cwnd > tcp->tcp_swnd)
+		ccv->flags &= ~CCF_CWND_LIMITED;
+	else
+		ccv->flags |= CCF_CWND_LIMITED;
+
+	if (type == CC_ACK) {
+		if (tcp->tcp_cwnd <= tcp->tcp_cwnd_ssthresh) {
+			/* Not above ssthresh: restart byte counting. */
+			ccv->flags &= ~CCF_ABC_SENTAWND;
+			ccv->t_bytes_acked = 0;
+		} else {
+			ccv->flags &= ~CCF_RTO;
+
+			/*
+			 * Count the bytes acknowledged, limited per ACK to
+			 * the tcps_abc_l_var tunable times the MSS, and note
+			 * when a full window's worth has been acknowledged.
+			 */
+			ccv->t_bytes_acked += min(ccv->bytes_this_ack,
+			    tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
+			if (ccv->t_bytes_acked >= tcp->tcp_cwnd) {
+				ccv->t_bytes_acked -= tcp->tcp_cwnd;
+				ccv->flags |= CCF_ABC_SENTAWND;
+			}
+		}
+	}
+
+	if (CC_ALGO(tcp)->ack_received != NULL) {
+		/*
+		 * The FreeBSD code where this originated had a comment "Find
+		 * a way to live without this" in several places where curack
+		 * got set. If they eventually dump curack from the cc
+		 * variables, we'll need to adapt our code.
+		 */
+		ccv->curack = seg_ack;
+		CC_ALGO(tcp)->ack_received(ccv, type);
+	}
+
+	DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp,
+	    uint32_t, cwnd_before, uint32_t, tcp->tcp_cwnd);
+}
+
+/*
+ * Deliver a congestion signal (CC_NDUPACK, CC_ECN or CC_RTO) to the
+ * connection's congestion control algorithm, taking care of the protocol
+ * state that is common to every algorithm.
+ *
+ * seg_ack is recorded in the cc variables as the current ACK before the
+ * algorithm's cong_signal hook runs.
+ *
+ * The ECN "congestion window reduced" bookkeeping (tcp_cwr and friends) is
+ * applied only after the algorithm's hook has returned.  A hook may consult
+ * tcp_cwr to find out whether the window was already reduced within the
+ * current window of data (sunreno does so for CC_ECN and CC_RTO); setting it
+ * beforehand would make every signal on an ECN-capable connection look like
+ * a repeat and suppress the reduction.
+ */
+void
+cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
+{
+	uint32_t old_cwnd = tcp->tcp_cwnd;
+	uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
+	boolean_t start_cwr = B_FALSE;
+
+	switch (type) {
+	case CC_NDUPACK:
+		if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
+			tcp->tcp_rexmit_max = tcp->tcp_snxt;
+			start_cwr = B_TRUE;
+		}
+		break;
+	case CC_ECN:
+		if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
+			tcp->tcp_rexmit_max = tcp->tcp_snxt;
+			start_cwr = B_TRUE;
+		}
+		break;
+	case CC_RTO:
+		tcp->tcp_ccv.flags |= CCF_RTO;
+		tcp->tcp_dupack_cnt = 0;
+		tcp->tcp_ccv.t_bytes_acked = 0;
+		/*
+		 * Give up on fast recovery and congestion recovery if we were
+		 * attempting either.
+		 */
+		EXIT_RECOVERY(tcp->tcp_ccv.flags);
+		if (CC_ALGO(tcp)->cong_signal == NULL) {
+			/*
+			 * RFC5681 Section 3.1
+			 * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
+			 */
+			tcp->tcp_cwnd_ssthresh = max(
+			    (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
+			    2) * tcp->tcp_mss;
+			tcp->tcp_cwnd = tcp->tcp_mss;
+		}
+		start_cwr = B_TRUE;
+		break;
+	}
+
+	if (CC_ALGO(tcp)->cong_signal != NULL) {
+		tcp->tcp_ccv.curack = seg_ack;
+		CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
+	}
+
+	/*
+	 * Now that the algorithm has seen the pre-signal value of tcp_cwr,
+	 * mark the start of a new CWR episode covering the data currently in
+	 * flight.
+	 */
+	if (start_cwr && tcp->tcp_ecn_ok) {
+		tcp->tcp_cwr = B_TRUE;
+		tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+		tcp->tcp_ecn_cwr_sent = B_FALSE;
+	}
+
+	DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
+	    uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
+	    uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
+}
+
+/*
+ * Let the congestion control algorithm finish up as recovery ends, then
+ * restart the count of acknowledged bytes.  seg_ack is recorded as the
+ * current ACK for the algorithm's benefit.
+ */
+static void
+cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
+{
+	struct cc_var *ccv = &tcp->tcp_ccv;
+	uint32_t cwnd_before = tcp->tcp_cwnd;
+
+	if (CC_ALGO(tcp)->post_recovery != NULL) {
+		ccv->curack = seg_ack;
+		CC_ALGO(tcp)->post_recovery(ccv);
+	}
+	ccv->t_bytes_acked = 0;
+
+	DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
+	    uint32_t, cwnd_before, uint32_t, tcp->tcp_cwnd);
+}
+
+/*
* Set the MSS associated with a particular tcp based on its current value,
* and a new one passed in. Observe minimums and maximums, and reset other
* state variables that we want to view as multiples of MSS.
@@ -548,6 +675,9 @@ tcp_process_options(tcp_t *tcp, tcpha_t *tcpha)
* updated properly.
*/
TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
+
+ if (tcp->tcp_cc_algo->conn_init != NULL)
+ tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv);
}
/*
@@ -1405,7 +1535,7 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
ASSERT(ira->ira_sqp != NULL);
new_sqp = ira->ira_sqp;
- econnp = (conn_t *)tcp_get_conn(arg2, tcps);
+ econnp = tcp_get_conn(arg2, tcps);
if (econnp == NULL)
goto error2;
@@ -2324,8 +2454,6 @@ tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
ip_pkt_t ipp;
boolean_t ofo_seg = B_FALSE; /* Out of order segment */
uint32_t cwnd;
- uint32_t add;
- int npkt;
int mss;
conn_t *connp = (conn_t *)arg;
squeue_t *sqp = (squeue_t *)arg2;
@@ -2601,6 +2729,9 @@ tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
* draft-floyd-incr-init-win-01.txt,
* Increasing TCP's Initial Window.
*/
+ DTRACE_PROBE3(cwnd__retransmitted__syn,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = tcp->tcp_mss;
}
@@ -3823,6 +3954,9 @@ process_ack:
tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
tcp->tcp_rexmit_max = tcp->tcp_snxt;
tcp->tcp_ms_we_have_waited = 0;
+ DTRACE_PROBE3(cwnd__retransmitted__syn,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = mss;
}
@@ -3866,33 +4000,22 @@ process_ack:
*/
if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
tcp->tcp_cwr = B_FALSE;
- if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
- if (!tcp->tcp_cwr) {
- npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss;
- tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss;
- tcp->tcp_cwnd = npkt * mss;
- /*
- * If the cwnd is 0, use the timer to clock out
- * new segments. This is required by the ECN spec.
- */
- if (npkt == 0) {
- TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- /*
- * This makes sure that when the ACK comes
- * back, we will increase tcp_cwnd by 1 MSS.
- */
- tcp->tcp_cwnd_cnt = 0;
- }
- tcp->tcp_cwr = B_TRUE;
- /*
- * This marks the end of the current window of in
- * flight data. That is why we don't use
- * tcp_suna + tcp_swnd. Only data in flight can
- * provide ECN info.
- */
- tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
- tcp->tcp_ecn_cwr_sent = B_FALSE;
- }
+ if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) {
+ cc_cong_signal(tcp, seg_ack, CC_ECN);
+ /*
+ * If the cwnd is 0, use the timer to clock out
+ * new segments. This is required by the ECN spec.
+ */
+ if (tcp->tcp_cwnd == 0)
+ TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
+ tcp->tcp_cwr = B_TRUE;
+ /*
+ * This marks the end of the current window of in
+ * flight data. That is why we don't use
+ * tcp_suna + tcp_swnd. Only data in flight can
+ * provide ECN info.
+ */
+ tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
}
mp1 = tcp->tcp_xmit_head;
@@ -3914,6 +4037,8 @@ process_ack:
/* Do Limited Transmit */
if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
tcps->tcps_dupack_fast_retransmit) {
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
/*
* RFC 3042
*
@@ -3960,12 +4085,10 @@ process_ack:
* dropped (due to congestion.)
*/
if (!tcp->tcp_cwr) {
- npkt = ((tcp->tcp_snxt -
- tcp->tcp_suna) >> 1) / mss;
- tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
- mss;
- tcp->tcp_cwnd = (npkt +
- tcp->tcp_dupack_cnt) * mss;
+ cc_cong_signal(tcp, seg_ack,
+ CC_NDUPACK);
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
}
if (tcp->tcp_ecn_ok) {
tcp->tcp_cwr = B_TRUE;
@@ -4027,6 +4150,8 @@ process_ack:
} /* tcp_snd_sack_ok */
} else {
+ cc_ack_received(tcp, seg_ack,
+ bytes_acked, CC_DUPACK);
/*
* Here we perform congestion
* avoidance, but NOT slow start.
@@ -4048,6 +4173,10 @@ process_ack:
cwnd = tcp->tcp_cwnd + mss;
if (cwnd > tcp->tcp_cwnd_max)
cwnd = tcp->tcp_cwnd_max;
+ DTRACE_PROBE3(cwnd__fast__recovery,
+ tcp_t *, tcp,
+ uint32_t, tcp->tcp_cwnd,
+ uint32_t, cwnd);
tcp->tcp_cwnd = cwnd;
if (tcp->tcp_unsent > 0)
flags |= TH_XMIT_NEEDED;
@@ -4180,15 +4309,10 @@ process_ack:
ASSERT(tcp->tcp_rexmit == B_FALSE);
if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
tcp->tcp_dupack_cnt = 0;
- /*
- * Restore the orig tcp_cwnd_ssthresh after
- * fast retransmit phase.
- */
- if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
- tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
- }
+
+ cc_post_recovery(tcp, seg_ack);
+
tcp->tcp_rexmit_max = seg_ack;
- tcp->tcp_cwnd_cnt = 0;
/*
* Remove all notsack info to avoid confusion with
@@ -4217,8 +4341,12 @@ process_ack:
* aggressive behaviour in sending new
* segments.
*/
- tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
+ cwnd = tcp->tcp_cwnd_ssthresh +
tcps->tcps_dupack_fast_retransmit * mss;
+ DTRACE_PROBE3(cwnd__fast__retransmit__part__ack,
+ tcp_t *, tcp, uint32_t, tcp->tcp_cwnd,
+ uint32_t, cwnd);
+ tcp->tcp_cwnd = cwnd;
tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
flags |= TH_REXMIT_NEEDED;
}
@@ -4279,28 +4407,10 @@ process_ack:
* usual.
*/
if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
- cwnd = tcp->tcp_cwnd;
- add = mss;
-
- if (cwnd >= tcp->tcp_cwnd_ssthresh) {
- /*
- * This is to prevent an increase of less than 1 MSS of
- * tcp_cwnd. With partial increase, tcp_wput_data()
- * may send out tinygrams in order to preserve mblk
- * boundaries.
- *
- * By initializing tcp_cwnd_cnt to new tcp_cwnd and
- * decrementing it by 1 MSS for every ACKs, tcp_cwnd is
- * increased by 1 MSS for every RTTs.
- */
- if (tcp->tcp_cwnd_cnt <= 0) {
- tcp->tcp_cwnd_cnt = cwnd + add;
- } else {
- tcp->tcp_cwnd_cnt -= add;
- add = 0;
- }
+ if (IN_RECOVERY(tcp->tcp_ccv.flags)) {
+ EXIT_RECOVERY(tcp->tcp_ccv.flags);
}
- tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
+ cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK);
}
/* See if the latest urgent data has been acknowledged */
@@ -5634,6 +5744,10 @@ noticmpv4:
npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
tcp->tcp_mss;
tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
+
+ DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp,
+ uint32_t, tcp->tcp_cwnd,
+ uint32_t, tcp->tcp_mss);
tcp->tcp_cwnd = tcp->tcp_mss;
tcp->tcp_cwnd_cnt = 0;
}
diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c
index f54ab3fb33..ae9efe863d 100644
--- a/usr/src/uts/common/inet/tcp/tcp_output.c
+++ b/usr/src/uts/common/inet/tcp/tcp_output.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
* Copyright 2019 Joyent, Inc.
*/
@@ -81,6 +81,18 @@ static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
*/
static int tcp_tx_pull_len = 16;
+/*
+ * Invoke the congestion control algorithm's after_idle hook, if it has one,
+ * and fire a probe reporting the congestion window before and after.
+ */
+static void
+cc_after_idle(tcp_t *tcp)
+{
+	struct cc_var *ccv = &tcp->tcp_ccv;
+	uint32_t cwnd_before = tcp->tcp_cwnd;
+
+	if (CC_ALGO(tcp)->after_idle != NULL) {
+		CC_ALGO(tcp)->after_idle(ccv);
+	}
+
+	DTRACE_PROBE3(cwnd__cc__after__idle, tcp_t *, tcp,
+	    uint32_t, cwnd_before, uint32_t, tcp->tcp_cwnd);
+}
+
int
tcp_wput(queue_t *q, mblk_t *mp)
{
@@ -219,7 +231,6 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
int32_t total_hdr_len;
int32_t tcp_hdr_len;
int rc;
- tcp_stack_t *tcps = tcp->tcp_tcps;
conn_t *connp = tcp->tcp_connp;
clock_t now = LBOLT_FASTPATH;
@@ -374,7 +385,7 @@ data_null:
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
(TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
- TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
+ cc_after_idle(tcp);
}
if (tcpstate == TCPS_SYN_RCVD) {
/*
@@ -1195,7 +1206,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
now = LBOLT_FASTPATH;
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
(TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
- TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
+ cc_after_idle(tcp);
}
usable = tcp->tcp_swnd; /* tcp window size */
diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c
index 81cf5c57a5..804160f628 100644
--- a/usr/src/uts/common/inet/tcp/tcp_timers.c
+++ b/usr/src/uts/common/inet/tcp/tcp_timers.c
@@ -23,7 +23,7 @@
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2011 Joyent, Inc. All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -784,36 +784,7 @@ tcp_timer(void *arg)
SL_TRACE, "tcp_timer: zero win");
}
} else {
- /*
- * After retransmission, we need to do
- * slow start. Set the ssthresh to one
- * half of current effective window and
- * cwnd to one MSS. Also reset
- * tcp_cwnd_cnt.
- *
- * Note that if tcp_ssthresh is reduced because
- * of ECN, do not reduce it again unless it is
- * already one window of data away (tcp_cwr
- * should then be cleared) or this is a
- * timeout for a retransmitted segment.
- */
- uint32_t npkt;
-
- if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
- npkt = ((tcp->tcp_timer_backoff ?
- tcp->tcp_cwnd_ssthresh :
- tcp->tcp_snxt -
- tcp->tcp_suna) >> 1) / tcp->tcp_mss;
- tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
- tcp->tcp_mss;
- }
- tcp->tcp_cwnd = tcp->tcp_mss;
- tcp->tcp_cwnd_cnt = 0;
- if (tcp->tcp_ecn_ok) {
- tcp->tcp_cwr = B_TRUE;
- tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
- tcp->tcp_ecn_cwr_sent = B_FALSE;
- }
+ cc_cong_signal(tcp, NULL, CC_RTO);
}
break;
}
diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c
index f4d6c71914..6348e02ae6 100644
--- a/usr/src/uts/common/inet/tcp/tcp_tunables.c
+++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c
@@ -22,12 +22,13 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2016 Joyent, Inc.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
#include <inet/ip.h>
#include <inet/tcp_impl.h>
+#include <inet/cc.h>
#include <sys/multidata.h>
#include <sys/sunddi.h>
@@ -38,6 +39,12 @@
/* Max of the above */
#define TCP_MSS_MAX TCP_MSS_MAX_IPV4
+/*
+ * State carried across tcp_copy_ccname() callbacks while the list of
+ * algorithm names is assembled:
+ *   ccn_buf      destination buffer
+ *   ccn_bufsize  size of ccn_buf in bytes
+ *   ccn_bytes    running total of the lengths reported by snprintf(); it can
+ *                exceed ccn_bufsize once the buffer is full
+ */
+typedef struct {
+ char *ccn_buf;
+ uint_t ccn_bufsize;
+ uint_t ccn_bytes;
+} tcp_copy_ccname_t;
+
/*
* Set the RFC 1948 pass phrase
*/
@@ -239,6 +246,65 @@ tcp_largest_anon_set(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
return (0);
}
+/*
+ * Property setter for "congestion_control": make the named algorithm the
+ * stack's default.  When MOD_PROP_DEFAULT is requested, CC_DEFAULT_ALGO_NAME
+ * is used in place of pval.  Returns EINVAL if cc_load_algo() cannot supply
+ * the algorithm, 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+tcp_set_cc_algorithm(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
+    const char *ifname, const void *pval, uint_t flags)
+{
+	tcp_stack_t *tcps = stack->netstack_tcp;
+	struct cc_algo *algo;
+	char *algo_name;
+
+	if (flags & MOD_PROP_DEFAULT)
+		algo_name = CC_DEFAULT_ALGO_NAME;
+	else
+		algo_name = (char *)pval;
+
+	algo = cc_load_algo(algo_name);
+	if (algo == NULL)
+		return (EINVAL);
+
+	tcps->tcps_default_cc_algo = algo;
+	return (0);
+}
+
+/*
+ * cc_walk_algos() callback: append one algorithm name to the comma
+ * separated list described by the tcp_copy_ccname_t passed in data.
+ * Returns ENOBUFS once the accumulated length no longer fits in the
+ * buffer, 0 otherwise.
+ */
+static int
+tcp_copy_ccname(void *data, struct cc_algo *algo)
+{
+	tcp_copy_ccname_t *cursor = data;
+	char *delim;
+	size_t room;
+
+	/* Every name after the first is preceded by a comma. */
+	if (cursor->ccn_bytes > 0)
+		delim = ",";
+	else
+		delim = "";
+
+	/* Space left in the buffer; zero once it has been filled. */
+	room = (cursor->ccn_bytes < cursor->ccn_bufsize) ?
+	    cursor->ccn_bufsize - cursor->ccn_bytes : 0;
+
+	cursor->ccn_bytes += snprintf(cursor->ccn_buf + cursor->ccn_bytes,
+	    room, "%s%s", delim, algo->name);
+
+	if (cursor->ccn_bytes >= cursor->ccn_bufsize)
+		return (ENOBUFS);
+	return (0);
+}
+
+/*
+ * Property getter for "congestion_control".  Depending on flags, pval
+ * receives the list of possible algorithms, the property permissions, the
+ * default algorithm name, or the name of the stack's current default.
+ * Returns ENOBUFS when the formatted value does not fit in psize bytes.
+ */
+/* ARGSUSED */
+static int
+tcp_get_cc_algorithm(netstack_t *stack, mod_prop_info_t *pinfo,
+    const char *ifname, void *pval, uint_t psize, uint_t flags)
+{
+	size_t len;
+
+	/* The list of possible values is built by walking the algorithms. */
+	if (flags & MOD_PROP_POSSIBLE) {
+		tcp_copy_ccname_t cd = { pval, psize, 0 };
+		return (cc_walk_algos(tcp_copy_ccname, &cd));
+	}
+
+	if (flags & MOD_PROP_PERM) {
+		len = snprintf(pval, psize, "%u", MOD_PROP_PERM_RW);
+	} else if (flags & MOD_PROP_DEFAULT) {
+		len = snprintf(pval, psize, "%s", CC_DEFAULT_ALGO_NAME);
+	} else {
+		tcp_stack_t *tcps = stack->netstack_tcp;
+
+		len = snprintf(pval, psize, "%s",
+		    tcps->tcps_default_cc_algo->name);
+	}
+
+	return (len >= psize ? ENOBUFS : 0);
+}
+
/*
* All of these are alterable, within the min/max values given, at run time.
*
@@ -527,6 +593,17 @@ mod_prop_info_t tcp_propinfo_tbl[] = {
{1, ISS_INCR, ISS_INCR},
{ISS_INCR} },
+ { "congestion_control", MOD_PROTO_TCP,
+ tcp_set_cc_algorithm, tcp_get_cc_algorithm, {0}, {0} },
+
+ /* RFC 3465 - TCP Congestion Control with Appropriate Byte Counting */
+ { "_abc", MOD_PROTO_TCP,
+ mod_set_boolean, mod_get_boolean, {B_TRUE}, {B_TRUE} },
+
+ /* "L" value from RFC 3465 */
+ { "_abc_l_var", MOD_PROTO_TCP,
+ mod_set_uint32, mod_get_uint32, {1, UINT32_MAX, 2}, {2} },
+
{ "?", MOD_PROTO_TCP, NULL, mod_get_allprop, {0}, {0} },
{ NULL, 0, NULL, NULL, {0}, {0} }
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index b110a60fab..5669592cff 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -22,7 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2019 Joyent, Inc.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/
#ifndef _INET_TCP_IMPL_H
@@ -562,6 +562,8 @@ extern uint32_t tcp_early_abort;
#define tcps_dev_flow_ctl tcps_propinfo_tbl[58].prop_cur_bval
#define tcps_reass_timeout tcps_propinfo_tbl[59].prop_cur_uval
#define tcps_iss_incr tcps_propinfo_tbl[65].prop_cur_uval
+#define tcps_abc tcps_propinfo_tbl[67].prop_cur_bval
+#define tcps_abc_l_var tcps_propinfo_tbl[68].prop_cur_uval
/*
@@ -733,6 +735,7 @@ extern mblk_t *tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *,
/*
* Input related functions in tcp_input.c.
*/
+extern void cc_cong_signal(tcp_t *, uint32_t, uint32_t);
extern void tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void tcp_input_data(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void tcp_input_listener_unbound(void *, mblk_t *, void *,
diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h
index e46ebe08da..9bde97617f 100644
--- a/usr/src/uts/common/inet/tcp_stack.h
+++ b/usr/src/uts/common/inet/tcp_stack.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
*/
#ifndef _INET_TCP_STACK_H
@@ -114,6 +115,8 @@ struct tcp_stack {
kmutex_t tcps_listener_conf_lock;
list_t tcps_listener_conf;
+ struct cc_algo *tcps_default_cc_algo;
+
/*
* Per CPU stats
*
diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel
index dabde6b98f..ca1cce10d9 100644
--- a/usr/src/uts/intel/Makefile.intel
+++ b/usr/src/uts/intel/Makefile.intel
@@ -21,6 +21,7 @@
#
# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2013 Andrew Stormont. All rights reserved.
+# Copyright (c) 2014 by Delphix. All rights reserved.
# Copyright 2019 Joyent, Inc.
# Copyright 2016 Garrett D'Amore <garrett@damore.org>
# Copyright 2018 Nexenta Systems, Inc.
@@ -174,6 +175,12 @@ $(IF_DEBUG_OBJ)clock.o := DEBUG_DEFS += -DKSLICE=1
ALL_DEFS = $(DEBUG_DEFS) $(OPTION_DEFS)
#
+# TCP congestion control modules (/kernel/cc)
+#
+MISC_KMODS += cc
+CC_KMODS += cc_newreno cc_cubic cc_sunreno
+
+#
# The kernels modules which are "implementation architecture"
# specific for this machine are enumerated below. Note that most
# of these modules must exist (in one form or another) for each
diff --git a/usr/src/uts/intel/cc/Makefile b/usr/src/uts/intel/cc/Makefile
new file mode 100644
index 0000000000..27a74f2c95
--- /dev/null
+++ b/usr/src/uts/intel/cc/Makefile
@@ -0,0 +1,69 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = cc
+OBJECTS = $(CC_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(CC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/cc_cubic/Makefile b/usr/src/uts/intel/cc_cubic/Makefile
new file mode 100644
index 0000000000..a4edef5f46
--- /dev/null
+++ b/usr/src/uts/intel/cc_cubic/Makefile
@@ -0,0 +1,73 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = cc_cubic
+OBJECTS = $(CC_CUBIC_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(CC_CUBIC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy -N misc/cc -N cc/cc_newreno
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/cc_newreno/Makefile b/usr/src/uts/intel/cc_newreno/Makefile
new file mode 100644
index 0000000000..aaa47dcd05
--- /dev/null
+++ b/usr/src/uts/intel/cc_newreno/Makefile
@@ -0,0 +1,73 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = cc_newreno
+OBJECTS = $(CC_NEWRENO_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(CC_NEWRENO_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy -N misc/cc
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/cc_sunreno/Makefile b/usr/src/uts/intel/cc_sunreno/Makefile
new file mode 100644
index 0000000000..90463e9268
--- /dev/null
+++ b/usr/src/uts/intel/cc_sunreno/Makefile
@@ -0,0 +1,73 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = cc_sunreno
+OBJECTS = $(CC_SUNRENO_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(CC_SUNRENO_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy -N misc/cc
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/ip/Makefile b/usr/src/uts/intel/ip/Makefile
index 009a644393..9cc6e5499f 100644
--- a/usr/src/uts/intel/ip/Makefile
+++ b/usr/src/uts/intel/ip/Makefile
@@ -22,7 +22,7 @@
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# Copyright (c) 2018, Joyent, Inc.
+# Copyright 2019 Joyent, Inc.
#
# This makefile drives the production of the ip driver
# kernel module.
@@ -53,7 +53,7 @@ include $(UTSBASE)/intel/Makefile.intel
#
# Define targets
#
-ALL_TARGET = $(BINARY) $(SRC_CONFILE)
+ALL_TARGET = $(BINARY) $(SRC_CONFFILE)
LINT_TARGET = $(MODULE).lint
INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
@@ -91,6 +91,12 @@ INC_PATH += -I$(UTSBASE)/common/io/bpf
LDFLAGS += -dy -Nmisc/md5 -Ncrypto/swrand -Nmisc/hook -Nmisc/neti
#
+# Depends on the congestion control framework for TCP connections.
+# We make several different algorithms available by default.
+#
+LDFLAGS += -N misc/cc -N cc/cc_sunreno -N cc/cc_newreno -N cc/cc_cubic
+
+#
# For now, disable these lint checks; maintainers should endeavor
# to investigate and remove these for maximum lint coverage.
# Please do not carry these forward to new Makefiles.
diff --git a/usr/src/uts/sparc/Makefile.sparc b/usr/src/uts/sparc/Makefile.sparc
index a37de46509..c5e3581c46 100644
--- a/usr/src/uts/sparc/Makefile.sparc
+++ b/usr/src/uts/sparc/Makefile.sparc
@@ -22,6 +22,7 @@
#
# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2013 Andrew Stormont. All rights reserved.
+# Copyright (c) 2015, 2017 by Delphix. All rights reserved.
# Copyright 2019 Joyent, Inc.
# Copyright 2016 Gary Mills
# Copyright 2016 Nexenta Systems, Inc.
@@ -203,6 +204,13 @@ $(IF_DEBUG_OBJ)clock.o := DEBUG_DEFS += -DKSLICE=1
# files.
#
ALL_DEFS = $(MACHINE_DEFS) $(DEBUG_DEFS) $(OPTION_DEFS)
+
+#
+# TCP congestion control modules (/kernel/cc)
+#
+MISC_KMODS += cc
+CC_KMODS += cc_newreno cc_cubic cc_sunreno
+
#
#
# The kernels modules which are "implementation architecture"
diff --git a/usr/src/uts/sparc/cc/Makefile b/usr/src/uts/sparc/cc/Makefile
new file mode 100644
index 0000000000..928a085458
--- /dev/null
+++ b/usr/src/uts/sparc/cc/Makefile
@@ -0,0 +1,69 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = cc
+OBJECTS = $(CC_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(CC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
diff --git a/usr/src/uts/sparc/cc_cubic/Makefile b/usr/src/uts/sparc/cc_cubic/Makefile
new file mode 100644
index 0000000000..ae7926a614
--- /dev/null
+++ b/usr/src/uts/sparc/cc_cubic/Makefile
@@ -0,0 +1,73 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = cc_cubic
+OBJECTS = $(CC_CUBIC_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(CC_CUBIC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy -N misc/cc -N cc/cc_newreno
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
diff --git a/usr/src/uts/sparc/cc_newreno/Makefile b/usr/src/uts/sparc/cc_newreno/Makefile
new file mode 100644
index 0000000000..6159e48c72
--- /dev/null
+++ b/usr/src/uts/sparc/cc_newreno/Makefile
@@ -0,0 +1,73 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = cc_newreno
+OBJECTS = $(CC_NEWRENO_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(CC_NEWRENO_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy -N misc/cc
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
diff --git a/usr/src/uts/sparc/cc_sunreno/Makefile b/usr/src/uts/sparc/cc_sunreno/Makefile
new file mode 100644
index 0000000000..912019834f
--- /dev/null
+++ b/usr/src/uts/sparc/cc_sunreno/Makefile
@@ -0,0 +1,73 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = cc_sunreno
+OBJECTS = $(CC_SUNRENO_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(CC_SUNRENO_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy -N misc/cc
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
diff --git a/usr/src/uts/sparc/ip/Makefile b/usr/src/uts/sparc/ip/Makefile
index a6f693aec2..4b6ac89108 100644
--- a/usr/src/uts/sparc/ip/Makefile
+++ b/usr/src/uts/sparc/ip/Makefile
@@ -22,6 +22,7 @@
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
+# Copyright 2019 Joyent, Inc.
#
# This makefile drives the production of the ip driver
# kernel module.
@@ -66,6 +67,19 @@ CFLAGS += -xinline=tcp_set_ws_value
INC_PATH += -I$(UTSBASE)/common/io/bpf
#
+# Depends on md5 and swrand (for SCTP). SCTP needs to depend on
+# swrand as it needs random numbers early on during boot before
+# kCF subsystem can load swrand.
+#
+LDFLAGS += -dy -Nmisc/md5 -Ncrypto/swrand -Nmisc/hook -Nmisc/neti
+
+#
+# Depends on the congestion control framework for TCP connections.
+# We make several different algorithms available by default.
+#
+LDFLAGS += -N misc/cc -N cc/cc_sunreno -N cc/cc_newreno -N cc/cc_cubic
+
+#
# For now, disable these lint checks; maintainers should endeavor
# to investigate and remove these for maximum lint coverage.
# Please do not carry these forward to new Makefiles.
@@ -86,13 +100,6 @@ CERRWARN += $(CNOWARN_UNINIT)
CERRWARN += -_gcc=-Wno-type-limits
#
-# Depends on md5 and swrand (for SCTP). SCTP needs to depend on
-# swrand as it needs random numbers early on during boot before
-# kCF subsystem can load swrand.
-#
-LDFLAGS += -dy -Nmisc/md5 -Ncrypto/swrand -Nmisc/hook -Nmisc/neti
-
-#
# Default build targets.
#
.KEEP_STATE: