diff options
author | Sebastien Roy <seb@delphix.com> | 2017-08-01 13:21:40 -0400 |
---|---|---|
committer | Robert Mustacchi <rm@joyent.com> | 2019-08-23 18:42:52 +0000 |
commit | 45a4b79d042e642c2ed7090ec290469ccf8fc563 (patch) | |
tree | 3a2b9b0104d34bf6063ec1875142e69c1bc7a296 | |
parent | 867a2ce85cd3f659cb7bc187ba93a095fe1df597 (diff) | |
download | illumos-joyent-45a4b79d042e642c2ed7090ec290469ccf8fc563.tar.gz |
11553 Want pluggable TCP congestion control algorithms
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Robert Mustacchi <robert.mustacchi@joyent.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
41 files changed, 2626 insertions, 128 deletions
diff --git a/exception_lists/cstyle b/exception_lists/cstyle index 06801920d4..3d2fc67caa 100644 --- a/exception_lists/cstyle +++ b/exception_lists/cstyle @@ -808,6 +808,10 @@ usr/src/uts/common/gssapi/mechs/krb5/mech/util_validate.c usr/src/uts/common/gssapi/mechs/krb5/mech/val_cred.c usr/src/uts/common/gssapi/mechs/krb5/mech/verify.c usr/src/uts/common/gssapi/mechs/krb5/mech/wrap_size_limit.c +usr/src/uts/common/inet/cc.h +usr/src/uts/common/inet/cc/cc_cubic.c +usr/src/uts/common/inet/cc/cc_module.h +usr/src/uts/common/inet/cc/cc_newreno.c usr/src/uts/common/io/bnx/570x/* usr/src/uts/common/io/bnx/include/* usr/src/uts/common/io/bnxe/577xx/common/bnxe_clc.c diff --git a/exception_lists/hdrchk b/exception_lists/hdrchk index ba30a7d6bb..9e8194ac8d 100644 --- a/exception_lists/hdrchk +++ b/exception_lists/hdrchk @@ -226,6 +226,8 @@ usr/src/uts/common/gssapi/mechs/krb5/include/krb5.h usr/src/uts/common/gssapi/mechs/krb5/include/old.h usr/src/uts/common/gssapi/mechs/krb5/include/raw.h usr/src/uts/common/gssapi/mechs/krb5/include/rsa-md4.h +usr/src/uts/common/inet/cc.h +usr/src/uts/common/inet/cc/cc_module.h usr/src/uts/common/io/axf/ax88172reg.h usr/src/uts/common/io/bnx/570x/* usr/src/uts/common/io/bnx/include/* diff --git a/exception_lists/packaging b/exception_lists/packaging index 8fb6466e79..1f91b551b4 100644 --- a/exception_lists/packaging +++ b/exception_lists/packaging @@ -137,6 +137,7 @@ usr/lib/llib-like.ln usr/lib/amd64/llib-like.ln i386 usr/lib/sparcv9/llib-like.ln sparc # +usr/include/inet/cc.h usr/include/inet/ip_impl.h usr/include/inet/ip_ndp.h usr/include/inet/ip2mac_impl.h diff --git a/usr/src/lib/libipadm/common/ipadm_prop.c b/usr/src/lib/libipadm/common/ipadm_prop.c index 0c3a25382f..4fc0dc0851 100644 --- a/usr/src/lib/libipadm/common/ipadm_prop.c +++ b/usr/src/lib/libipadm/common/ipadm_prop.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. 
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ /* @@ -153,6 +153,9 @@ static const char *ecn_sack_vals[] = {"never", "passive", "active", NULL}; /* Supported TCP protocol properties */ static ipadm_prop_desc_t ipadm_tcp_prop_table[] = { + { "congestion_control", NULL, IPADMPROP_CLASS_MODULE, MOD_PROTO_TCP, 0, + i_ipadm_set_prop, i_ipadm_get_prop, i_ipadm_get_prop }, + { "ecn", NULL, IPADMPROP_CLASS_MODULE, MOD_PROTO_TCP, 0, i_ipadm_set_ecnsack, i_ipadm_get_ecnsack, i_ipadm_get_ecnsack }, diff --git a/usr/src/man/man1m/ipadm.1m b/usr/src/man/man1m/ipadm.1m index 0381aa130d..8208657ddc 100644 --- a/usr/src/man/man1m/ipadm.1m +++ b/usr/src/man/man1m/ipadm.1m @@ -10,7 +10,7 @@ .\" .\" .\" Copyright (c) 2012, Joyent, Inc. All Rights Reserved -.\" Copyright (c) 2013 by Delphix. All rights reserved. +.\" Copyright (c) 2013, 2017 by Delphix. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright (c) 2016-2017, Chris Fraire <cfraire@me.com>. .\" Copyright 2019 OmniOS Community Edition (OmniOSce) Association. @@ -777,6 +777,9 @@ syntax can be used to add/remove values from the current list of values on the property. The property name can be one of the following: .Bl -tag -compact -width "smallest_nonpriv_port" +.It Cm congestion_control +The default congestion-control algorithm to be used for new connections +.Pq TCP . .It Cm ecn Explicit congestion control .Pq Cm never Ns / Ns Cm passive Ns / Ns Cm active diff --git a/usr/src/pkg/manifests/system-kernel.mf b/usr/src/pkg/manifests/system-kernel.mf index 0aea96988d..02186d11de 100644 --- a/usr/src/pkg/manifests/system-kernel.mf +++ b/usr/src/pkg/manifests/system-kernel.mf @@ -21,6 +21,7 @@ # # Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2014, 2017 by Delphix. All rights reserved. # Copyright 2013 Saso Kiselkov. All rights reserved. # Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
# Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> @@ -66,6 +67,8 @@ dir path=etc/crypto group=sys dir path=etc/sock2path.d group=sys dir path=kernel group=sys $(i386_ONLY)dir path=kernel/$(ARCH64) group=sys +dir path=kernel/cc group=sys +dir path=kernel/cc/$(ARCH64) group=sys dir path=kernel/crypto group=sys dir path=kernel/crypto/$(ARCH64) group=sys dir path=kernel/dacf group=sys @@ -311,6 +314,9 @@ file path=etc/name_to_sysnum group=sys \ file path=etc/sock2path.d/system%2Fkernel group=sys file path=etc/system group=sys original_name=SUNWckr:etc/system preserve=true $(i386_ONLY)file path=kernel/$(ARCH64)/genunix group=sys mode=0755 +file path=kernel/cc/$(ARCH64)/cc_cubic group=sys mode=0755 +file path=kernel/cc/$(ARCH64)/cc_newreno group=sys mode=0755 +file path=kernel/cc/$(ARCH64)/cc_sunreno group=sys mode=0755 file path=kernel/crypto/$(ARCH64)/aes group=sys mode=0755 file path=kernel/crypto/$(ARCH64)/arcfour group=sys mode=0755 file path=kernel/crypto/$(ARCH64)/blowfish group=sys mode=0755 @@ -533,6 +539,7 @@ file path=kernel/misc/$(ARCH64)/bignum group=sys mode=0755 $(i386_ONLY)file path=kernel/misc/$(ARCH64)/bootdev group=sys mode=0755 file path=kernel/misc/$(ARCH64)/busra group=sys mode=0755 file path=kernel/misc/$(ARCH64)/cardbus group=sys mode=0755 +file path=kernel/misc/$(ARCH64)/cc group=sys mode=0755 file path=kernel/misc/$(ARCH64)/cmlb group=sys mode=0755 file path=kernel/misc/$(ARCH64)/consconfig group=sys mode=0755 file path=kernel/misc/$(ARCH64)/ctf group=sys mode=0755 diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ index 80e4e7d115..c5c32caa19 100644 --- a/usr/src/uts/Makefile.targ +++ b/usr/src/uts/Makefile.targ @@ -22,6 +22,7 @@ # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2014 Garrett D'Amore <garrett@damore.org> # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> +# Copyright (c) 2017 by Delphix. All rights reserved. 
# # This Makefiles contains the common targets and definitions for # all kernels. It is to be included in the Makefiles for specific @@ -177,6 +178,9 @@ $(ROOT_DACF_DIR)/%: $(OBJS_DIR)/% $(ROOT_DACF_DIR) FRC $(ROOT_BRAND_DIR)/%: $(OBJS_DIR)/% $(ROOT_BRAND_DIR) FRC $(INS.file) +$(ROOT_CC_DIR)/%: $(OBJS_DIR)/% $(ROOT_MOD_DIR) $(ROOT_CC_DIR) FRC + $(INS.file) + $(ROOT_CRYPTO_DIR)/%: $(OBJS_DIR)/% $(ROOT_CRYPTO_DIR) FRC $(INS.file) diff --git a/usr/src/uts/Makefile.uts b/usr/src/uts/Makefile.uts index 5fe9959e65..033d50a146 100644 --- a/usr/src/uts/Makefile.uts +++ b/usr/src/uts/Makefile.uts @@ -22,7 +22,7 @@ # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2011 Bayard G. Bell. All rights reserved. -# Copyright (c) 2011 by Delphix. All rights reserved. +# Copyright (c) 2011,2017 by Delphix. All rights reserved. # Copyright (c) 2013 Andrew Stormont. All rights reserved. # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright (c) 2019, Joyent, Inc. 
@@ -430,6 +430,7 @@ ROOT_FONT_DIR_32 = $(ROOT_MOD_DIR)/fonts ROOT_DACF_DIR_32 = $(ROOT_MOD_DIR)/dacf ROOT_CRYPTO_DIR_32 = $(ROOT_MOD_DIR)/crypto ROOT_MAC_DIR_32 = $(ROOT_MOD_DIR)/mac +ROOT_CC_DIR_32 = $(ROOT_MOD_DIR)/cc ROOT_KICONV_DIR_32 = $(ROOT_MOD_DIR)/kiconv ROOT_KERN_DIR_64 = $(ROOT_MOD_DIR)/$(SUBDIR64) @@ -457,6 +458,7 @@ ROOT_FONT_DIR_64 = $(ROOT_MOD_DIR)/fonts/$(SUBDIR64) ROOT_DACF_DIR_64 = $(ROOT_MOD_DIR)/dacf/$(SUBDIR64) ROOT_CRYPTO_DIR_64 = $(ROOT_MOD_DIR)/crypto/$(SUBDIR64) ROOT_MAC_DIR_64 = $(ROOT_MOD_DIR)/mac/$(SUBDIR64) +ROOT_CC_DIR_64 = $(ROOT_MOD_DIR)/cc/$(SUBDIR64) ROOT_KICONV_DIR_64 = $(ROOT_MOD_DIR)/kiconv/$(SUBDIR64) ROOT_KERN_DIR = $(ROOT_KERN_DIR_$(CLASS)) @@ -484,6 +486,7 @@ ROOT_FONT_DIR = $(ROOT_FONT_DIR_$(CLASS)) ROOT_DACF_DIR = $(ROOT_DACF_DIR_$(CLASS)) ROOT_CRYPTO_DIR = $(ROOT_CRYPTO_DIR_$(CLASS)) ROOT_MAC_DIR = $(ROOT_MAC_DIR_$(CLASS)) +ROOT_CC_DIR = $(ROOT_CC_DIR_$(CLASS)) ROOT_KICONV_DIR = $(ROOT_KICONV_DIR_$(CLASS)) ROOT_FIRMWARE_DIR = $(ROOT_MOD_DIR)/firmware @@ -502,6 +505,7 @@ ROOT_MOD_DIRS_32 += $(ROOT_EMLXS_FW_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_CPU_DIR_32) $(ROOT_FONT_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_TOD_DIR_32) $(ROOT_DACF_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_CRYPTO_DIR_32) $(ROOT_MAC_DIR_32) +ROOT_MOD_DIRS_32 += $(ROOT_CC_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_KICONV_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_FIRMWARE_DIR) @@ -595,7 +599,7 @@ PARALLEL_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \ $(MMU_KMODS) $(DACF_KMODS) $(EXPORT_KMODS) $(IPP_KMODS) \ $(CRYPTO_KMODS) $(PCBE_KMODS) \ $(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \ - $(BRAND_KMODS) $(KICONV_KMODS) \ + $(BRAND_KMODS) $(KICONV_KMODS) $(CC_KMODS) \ $(SOCKET_KMODS) KMODS = $(GENUNIX_KMODS) $(PARALLEL_KMODS) @@ -607,7 +611,7 @@ LINT_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \ $(MACH_KMODS) $(GSS_KMODS) $(DACF_KMODS) $(IPP_KMODS) \ $(CRYPTO_KMODS) $(PCBE_KMODS) \ $(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \ - 
$(BRAND_KMODS) $(KICONV_KMODS) $(SOCKET_KMODS) + $(BRAND_KMODS) $(KICONV_KMODS) $(CC_KMODS) $(SOCKET_KMODS) # # Files to be compiled with -xa, to generate basic block execution diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index ebc1b2db90..9a5a48c4c8 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -488,6 +488,14 @@ BLKDEV_OBJS += blkdev.o CARDBUS_OBJS += cardbus.o cardbus_hp.o cardbus_cfg.o +CC_OBJS += cc.o + +CC_CUBIC_OBJS += cc_cubic.o + +CC_NEWRENO_OBJS += cc_newreno.o + +CC_SUNRENO_OBJS += cc_sunreno.o + CONSKBD_OBJS += conskbd.o CONSMS_OBJS += consms.o diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 983c5c359d..a489d8314a 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -25,7 +25,7 @@ # Copyright 2013 Saso Kiselkov. All rights reserved. # Copyright 2019 Joyent, Inc. # Copyright 2018 Nexenta Systems, Inc. -# Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2017 by Delphix. All rights reserved. 
# # @@ -527,6 +527,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/dlpistub/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/cc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1962,6 +1966,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/nca/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/dlpistub/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/cc/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/inet/Makefile b/usr/src/uts/common/inet/Makefile index 14ce78a884..5bcbc7532f 100644 --- a/usr/src/uts/common/inet/Makefile +++ b/usr/src/uts/common/inet/Makefile @@ -1,4 +1,4 @@ -# +# # CDDL HEADER START # # The contents of this file are subject to the terms of the @@ -23,12 +23,14 @@ # Copyright 2010 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # +# Copyright (c) 2014, 2017 by Delphix. All rights reserved. +# # uts/common/inet/Makefile # # include global definitions include ../../../Makefile.master -HDRS= arp.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \ +HDRS= arp.h cc.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \ ipsecah.h ipsecesp.h ipsec_info.h iptun.h ip6_asp.h ip_if.h ip_ire.h \ ip_multi.h ip_netinfo.h ip_ndp.h ip_rts.h ipsec_impl.h keysock.h \ led.h mi.h mib2.h nd.h optcom.h sadb.h sctp_itf.h snmpcom.h tcp.h \ diff --git a/usr/src/uts/common/inet/cc.h b/usr/src/uts/common/inet/cc.h new file mode 100644 index 0000000000..170d0e7f8b --- /dev/null +++ b/usr/src/uts/common/inet/cc.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * Copyright (c) 2017 by Delphix. 
All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Lawrence Stewart and + * James Healy, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University of + * Technology's Centre for Advanced Internet Architectures, Melbourne, + * Australia, which was made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. + * More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#ifndef _NETINET_CC_H_ +#define _NETINET_CC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <netinet/tcp.h> +#include <sys/queue.h> +#include <sys/rwlock.h> + +#define CC_ALGO_NAME_MAX 16 /* max congestion control name length */ + +#define CC_DEFAULT_ALGO_NAME "sunreno" + +struct tcp_s; +struct sctp_s; + +/* CC housekeeping functions. */ +extern struct cc_algo *cc_load_algo(const char *name); +extern int cc_register_algo(struct cc_algo *add_cc); +extern int cc_deregister_algo(struct cc_algo *remove_cc); + +/* + * Wrapper around transport structs that contain same-named congestion + * control variables. Allows algos to be shared amongst multiple CC aware + * transports. + * + * In theory, this code (from FreeBSD) can be used to support pluggable + * congestion control for sctp as well as tcp. However, the support for sctp + * in FreeBSD is incomplete, and in practice "type" is ignored. cc_module.h + * provides a CCV macro which implementations can use to get a variable out of + * the protocol-appropriate structure. + * + * If FreeBSD eventually does extend support for pluggable congestion control + * to sctp, we'll need to make sure we're setting "type" appropriately or use + * a definition of CCV that ignores it. + */ +struct cc_var { + void *cc_data; /* Per-connection private algorithm data. */ + int bytes_this_ack; /* # bytes acked by the current ACK. */ + int t_bytes_acked; /* # bytes acked during current RTT */ + tcp_seq curack; /* Most recent ACK. 
*/ + uint32_t flags; /* Flags for cc_var (see below) */ + int type; /* Indicates which ptr is valid in ccvc. */ + union ccv_container { + struct tcp_s *tcp; + struct sctp_s *sctp; + } ccvc; + uint16_t nsegs; /* # segments coalesced into current chain. */ +}; + +/* + * cc_var flags. + * + * CCF_ABC_SENTAWND is set when a full congestion window of data has been ACKed + * according to the Appropriate Byte Counting spec, defined in RFC 3465. + */ +#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */ +#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */ +#define CCF_FASTRECOVERY 0x0004 /* in NewReno Fast Recovery */ +#define CCF_WASFRECOVERY 0x0008 /* was in NewReno Fast Recovery */ +#define CCF_CONGRECOVERY 0x0010 /* congestion recovery mode */ +#define CCF_WASCRECOVERY 0x0020 /* was in congestion recovery */ +/* + * In slow-start due to a retransmission timeout. This flag is enabled for the + * duration of the slow-start phase. + */ +#define CCF_RTO 0x0040 /* in slow-start due to timeout */ + +/* + * Helpers to test, set and clear the recovery bits in a cc_var flags word. + * The ENTER_ and EXIT_ forms modify their argument, so it must be an lvalue. + * The argument and each expansion are fully parenthesized so the macros + * behave as single expressions wherever they are used. + */ +#define IN_FASTRECOVERY(flags) ((flags) & CCF_FASTRECOVERY) +#define ENTER_FASTRECOVERY(flags) ((flags) |= CCF_FASTRECOVERY) +#define EXIT_FASTRECOVERY(flags) ((flags) &= ~CCF_FASTRECOVERY) + +#define IN_CONGRECOVERY(flags) ((flags) & CCF_CONGRECOVERY) +#define ENTER_CONGRECOVERY(flags) ((flags) |= CCF_CONGRECOVERY) +#define EXIT_CONGRECOVERY(flags) ((flags) &= ~CCF_CONGRECOVERY) + +#define IN_RECOVERY(flags) ((flags) & (CCF_CONGRECOVERY | CCF_FASTRECOVERY)) +#define ENTER_RECOVERY(flags) ((flags) |= (CCF_CONGRECOVERY | CCF_FASTRECOVERY)) +#define EXIT_RECOVERY(flags) ((flags) &= ~(CCF_CONGRECOVERY | CCF_FASTRECOVERY)) + +/* + * ACK types passed to the ack_received() hook. + * + * CC_ACK is passed when an ACK acknowledges previously unACKed data. + * CC_DUPACK is passed when a duplicate ACK is received. The conditions under + * which an ACK is considered a duplicate ACK are defined in RFC 5681. + */ +#define CC_ACK 0x0001 /* Regular in sequence ACK. 
*/ +#define CC_DUPACK 0x0002 /* Duplicate ACK. */ +#define CC_PARTIALACK 0x0004 /* Not yet. */ +#define CC_SACK 0x0008 /* Not yet. */ + +/* + * Congestion signal types passed to the cong_signal() hook. The highest order 8 + * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own + * congestion signal types. + * + * The congestion signals defined here cover the following situations: + * CC_ECN: A packet with an Explicit Congestion Notification was received + * See RFC 3168. + * CC_RTO: A retransmission timeout occurred. + * CC_RTO_ERR: An ACK was received for a sequence number after we fired an RTO + * for that sequence number + * CC_NDUPACK: Trigger fast retransmit based on the assumption that receiving + * N duplicate ACKs indicates packet loss rather than reordering. Fast + * retransmit is followed by fast recovery. Fast retransmit and recovery + * were originally described in RFC 2581 and were updated by RFC3782 + * (NewReno). In both RFC2581 and RFC3782, N is 3. + */ +#define CC_ECN 0x00000001 /* ECN marked packet received. */ +#define CC_RTO 0x00000002 /* RTO fired. */ +#define CC_RTO_ERR 0x00000004 /* RTO fired in error. */ +#define CC_NDUPACK 0x00000008 /* Threshold of dupack's reached. */ + +#define CC_SIGPRIVMASK 0xFF000000 /* Mask to check if sig is private. */ + +/* + * Structure to hold data and function pointers that together represent a + * congestion control algorithm. + */ +struct cc_algo { + char name[CC_ALGO_NAME_MAX]; + + /* Init CC state for a new control block. */ + int (*cb_init)(struct cc_var *ccv); + + /* Cleanup CC state for a terminating control block. */ + void (*cb_destroy)(struct cc_var *ccv); + + /* Init variables for a newly established connection. */ + void (*conn_init)(struct cc_var *ccv); + + /* Called on receipt of an ack. */ + void (*ack_received)(struct cc_var *ccv, uint16_t type); + + /* Called on detection of a congestion signal. 
*/ + void (*cong_signal)(struct cc_var *ccv, uint32_t type); + + /* Called after exiting congestion recovery. */ + void (*post_recovery)(struct cc_var *ccv); + + /* Called when data transfer resumes after an idle period. */ + void (*after_idle)(struct cc_var *ccv); + + STAILQ_ENTRY(cc_algo) entries; +}; + +typedef int cc_walk_func_t(void *, struct cc_algo *); +extern int cc_walk_algos(cc_walk_func_t *, void *); + +/* Macro to obtain the CC algo's struct ptr. */ +#define CC_ALGO(tp) ((tp)->tcp_cc_algo) + +/* Macro to obtain the CC algo's data ptr. */ +#define CC_DATA(tp) ((tp)->tcp_ccv.cc_data) + +#ifdef __cplusplus +} +#endif + +#endif /* _NETINET_CC_H_ */ diff --git a/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE b/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE new file mode 100644 index 0000000000..d2cdf5164e --- /dev/null +++ b/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE @@ -0,0 +1,29 @@ +This software was developed by Lawrence Stewart while studying at the Centre +for Advanced Internet Architectures, Swinburne University of Technology, made +possible in part by a grant from the Cisco University Research Program Fund +at Community Foundation Silicon Valley. + +Portions of this software were developed at the Centre for Advanced +Internet Architectures, Swinburne University of Technology, Melbourne, +Australia by David Hayes under sponsorship from the FreeBSD Foundation. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. diff --git a/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..4740689711 --- /dev/null +++ b/usr/src/uts/common/inet/cc/THIRDPARTYLICENSE.descrip @@ -0,0 +1,3 @@ +The congestion control framework, its header, and the CUBIC and NewReno +congestion control modules came from FreeBSD, and are therefore licensed +under the 2-Clause BSD License. diff --git a/usr/src/uts/common/inet/cc/cc.c b/usr/src/uts/common/inet/cc/cc.c new file mode 100644 index 0000000000..7bb213f74e --- /dev/null +++ b/usr/src/uts/common/inet/cc/cc.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * Copyright (c) 2017 by Delphix. All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Lawrence Stewart and + * James Healy, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. 
+ * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University of + * Technology's Centre for Advanced Internet Architectures, Melbourne, + * Australia, which was made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. 
+ * More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/queue.h> +#include <inet/cc.h> +#include <inet/tcp.h> +#include <sys/sdt.h> + +#define CC_KMODDIR "cc" + +/* + * List of available cc algorithms on the current system. Access is + * synchronized using cc_list_lock. + */ +static STAILQ_HEAD(cc_head, cc_algo) cc_list = STAILQ_HEAD_INITIALIZER(cc_list); +static kmutex_t cc_list_lock; + +static struct modlmisc cc_modlmisc = { + &mod_miscops, + "Pluggable Congestion Control Framework" +}; + +static struct modlinkage cc_modlinkage = { + MODREV_1, + &cc_modlmisc, + NULL +}; + +/* + * Initialise CC subsystem on system boot. + */ +int +_init(void) +{ + STAILQ_INIT(&cc_list); + + return (mod_install(&cc_modlinkage)); +} + +int +_fini(void) +{ + return (EBUSY); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&cc_modlinkage, modinfop)); +} + +int +cc_walk_algos(cc_walk_func_t *func, void *cd) +{ + struct cc_algo *algo; + int ret = 0; + + mutex_enter(&cc_list_lock); + STAILQ_FOREACH(algo, &cc_list, entries) { + if ((ret = func(cd, algo)) != 0) { + break; + } + } + mutex_exit(&cc_list_lock); + + return (ret); +} + +/* + * Search for an algorithm of a given name, and return the corresponding set of + * operations. If there is no algorithm with the given name present, then this + * function returns NULL. + * + * Since this function is passed names from userland, it needs to be paranoid + * about the string, in case it's missing a terminating NUL character. 
+ */ +struct cc_algo * +cc_load_algo(const char *name) +{ + struct cc_algo *algo; + boolean_t found = B_FALSE; + + if (strnlen(name, CC_ALGO_NAME_MAX) >= CC_ALGO_NAME_MAX) { + return (NULL); + } + + mutex_enter(&cc_list_lock); + STAILQ_FOREACH(algo, &cc_list, entries) { + if (strncmp(algo->name, name, CC_ALGO_NAME_MAX) == 0) { + found = B_TRUE; + break; + } + } + mutex_exit(&cc_list_lock); + + return (found ? algo : NULL); +} + +/* + * Returns 0 on success, ENOENT if the algorithm is not registered. + */ +int +cc_deregister_algo(struct cc_algo *remove_cc) +{ + struct cc_algo *funcs, *tmpfuncs; + int err = ENOENT; + + mutex_enter(&cc_list_lock); + STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) { + if (funcs == remove_cc) { + STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries); + err = 0; + break; + } + } + mutex_exit(&cc_list_lock); + return (err); +} + +/* + * Returns 0 on success, non-zero on failure. + */ +int +cc_register_algo(struct cc_algo *add_cc) +{ + struct cc_algo *funcs; + size_t nlen; + int err = 0; + + nlen = strnlen(add_cc->name, CC_ALGO_NAME_MAX); + if (nlen == 0 || nlen >= CC_ALGO_NAME_MAX) { + return (EINVAL); + } + + /* + * Iterate over list of registered CC algorithms and make sure + * we're not trying to add a duplicate. + */ + mutex_enter(&cc_list_lock); + STAILQ_FOREACH(funcs, &cc_list, entries) { + if (strncmp(funcs->name, add_cc->name, CC_ALGO_NAME_MAX) == 0) + err = EEXIST; + } + + if (err == 0) + STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); + + mutex_exit(&cc_list_lock); + + return (err); +} diff --git a/usr/src/uts/common/inet/cc/cc_cubic.c b/usr/src/uts/common/inet/cc/cc_cubic.c new file mode 100644 index 0000000000..a4b8f29e18 --- /dev/null +++ b/usr/src/uts/common/inet/cc/cc_cubic.c @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org> + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * Copyright (c) 2017 by Delphix. All rights reserved. 
+ * + * This software was developed by Lawrence Stewart while studying at the Centre + * for Advanced Internet Architectures, Swinburne University of Technology, made + * possible in part by a grant from the Cisco University Research Program Fund + * at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * An implementation of the CUBIC congestion control algorithm for FreeBSD, + * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha. 
+ * Originally released as part of the NewTCP research project at Swinburne + * University of Technology's Centre for Advanced Internet Architectures, + * Melbourne, Australia, which was made possible in part by a grant from the + * Cisco University Research Program Fund at Community Foundation Silicon + * Valley. More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/time.h> + +#include <inet/tcp_impl.h> +#include <inet/cc.h> +#include <inet/cc/cc_cubic.h> +#include <inet/cc/cc_module.h> + +static struct modlmisc cc_cubic_modlmisc = { + &mod_miscops, + "Cubic Congestion Control" +}; + +static struct modlinkage cc_cubic_modlinkage = { + MODREV_1, + &cc_cubic_modlmisc, + NULL +}; + +/* + * cubic uses the NewReno implementation of after_idle and uses NewReno's + * ack_received callback during slow start. + */ +static struct cc_algo *newreno_cc_algo; + +static void cubic_ack_received(struct cc_var *ccv, uint16_t type); +static void cubic_cb_destroy(struct cc_var *ccv); +static int cubic_cb_init(struct cc_var *ccv); +static void cubic_cong_signal(struct cc_var *ccv, uint32_t type); +static void cubic_conn_init(struct cc_var *ccv); +static void cubic_post_recovery(struct cc_var *ccv); +static void cubic_record_rtt(struct cc_var *ccv); +static void cubic_ssthresh_update(struct cc_var *ccv); + +struct cubic { + /* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */ + int64_t K; + /* Sum of RTT samples across an epoch in nanoseconds. */ + hrtime_t sum_rtt_nsecs; + /* cwnd at the most recent congestion event. */ + uint32_t max_cwnd; + /* cwnd at the previous congestion event. */ + uint32_t prev_max_cwnd; + /* Number of congestion events. */ + uint32_t num_cong_events; + /* Minimum observed rtt in nanoseconds. 
*/ + hrtime_t min_rtt_nsecs; + /* Mean observed rtt between congestion epochs. */ + hrtime_t mean_rtt_nsecs; + /* ACKs since last congestion event. */ + int epoch_ack_count; + /* Time of last congestion event in nanoseconds. */ + hrtime_t t_last_cong; +}; + +struct cc_algo cubic_cc_algo = { + .name = "cubic", + .ack_received = cubic_ack_received, + .cb_destroy = cubic_cb_destroy, + .cb_init = cubic_cb_init, + .cong_signal = cubic_cong_signal, + .conn_init = cubic_conn_init, + .post_recovery = cubic_post_recovery, +}; + +int +_init(void) +{ + int err; + + if ((newreno_cc_algo = cc_load_algo("newreno")) == NULL) + return (EINVAL); + + if ((err = cc_register_algo(&cubic_cc_algo)) == 0) { + if ((err = mod_install(&cc_cubic_modlinkage)) != 0) + (void) cc_deregister_algo(&cubic_cc_algo); + } + cubic_cc_algo.after_idle = newreno_cc_algo->after_idle; + return (err); +} + +int +_fini(void) +{ + /* XXX Not unloadable for now */ + return (EBUSY); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&cc_cubic_modlinkage, modinfop)); +} + +static void +cubic_ack_received(struct cc_var *ccv, uint16_t type) +{ + struct cubic *cubic_data; + uint32_t w_tf, w_cubic_next; + hrtime_t nsecs_since_cong; + + cubic_data = ccv->cc_data; + cubic_record_rtt(ccv); + + /* + * Regular ACK and we're not in cong/fast recovery and we're cwnd + * limited and we're either not doing ABC or are slow starting or are + * doing ABC and we've sent a cwnd's worth of bytes. + */ + if (type == CC_ACK && !IN_RECOVERY(ccv->flags) && + (ccv->flags & CCF_CWND_LIMITED) && (!CC_ABC(ccv) || + CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) || + (CC_ABC(ccv) && (ccv->flags & CCF_ABC_SENTAWND)))) { + /* Use the logic in NewReno ack_received() for slow start. 
*/ + if (CCV(ccv, tcp_cwnd) <= CCV(ccv, tcp_cwnd_ssthresh) || + cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE) + newreno_cc_algo->ack_received(ccv, type); + else { + nsecs_since_cong = gethrtime() - + cubic_data->t_last_cong; + + /* + * The mean RTT is used to best reflect the equations in + * the I-D. Using min_rtt in the tf_cwnd calculation + * causes w_tf to grow much faster than it should if the + * RTT is dominated by network buffering rather than + * propagation delay. + */ + w_tf = tf_cwnd(nsecs_since_cong, + cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd, + CCV(ccv, tcp_mss)); + + w_cubic_next = cubic_cwnd(nsecs_since_cong + + cubic_data->mean_rtt_nsecs, cubic_data->max_cwnd, + CCV(ccv, tcp_mss), cubic_data->K); + + ccv->flags &= ~CCF_ABC_SENTAWND; + + if (w_cubic_next < w_tf) { + /* + * TCP-friendly region, follow tf + * cwnd growth. + */ + CCV(ccv, tcp_cwnd) = w_tf; + } else if (CCV(ccv, tcp_cwnd) < w_cubic_next) { + /* + * Concave or convex region, follow CUBIC + * cwnd growth. + */ + if (CC_ABC(ccv)) + CCV(ccv, tcp_cwnd) = w_cubic_next; + else + CCV(ccv, tcp_cwnd) += ((w_cubic_next - + CCV(ccv, tcp_cwnd)) * + CCV(ccv, tcp_mss)) / + CCV(ccv, tcp_cwnd); + } + + /* + * If we're not in slow start and we're probing for a + * new cwnd limit at the start of a connection + * (happens when hostcache has a relevant entry), + * keep updating our current estimate of the + * max_cwnd. + */ + if (cubic_data->num_cong_events == 0 && + cubic_data->max_cwnd < CCV(ccv, tcp_cwnd)) + cubic_data->max_cwnd = CCV(ccv, tcp_cwnd); + } + } +} + +static void +cubic_cb_destroy(struct cc_var *ccv) +{ + + if (ccv->cc_data != NULL) + kmem_free(ccv->cc_data, sizeof (struct cubic)); +} + +static int +cubic_cb_init(struct cc_var *ccv) +{ + struct cubic *cubic_data; + + cubic_data = kmem_alloc(sizeof (struct cubic), KM_NOSLEEP); + + if (cubic_data == NULL) + return (ENOMEM); + + /* Init some key variables with sensible defaults. 
*/ + cubic_data->t_last_cong = gethrtime(); + cubic_data->min_rtt_nsecs = TCPTV_SRTTBASE; + cubic_data->mean_rtt_nsecs = 1; + + ccv->cc_data = cubic_data; + + return (0); +} + +/* + * Perform any necessary tasks before we enter congestion recovery. + */ +static void +cubic_cong_signal(struct cc_var *ccv, uint32_t type) +{ + struct cubic *cubic_data; + uint32_t cwin; + uint32_t mss; + + cubic_data = ccv->cc_data; + cwin = CCV(ccv, tcp_cwnd); + mss = CCV(ccv, tcp_mss); + + switch (type) { + case CC_NDUPACK: + if (!IN_FASTRECOVERY(ccv->flags)) { + if (!IN_CONGRECOVERY(ccv->flags)) { + cubic_ssthresh_update(ccv); + cubic_data->num_cong_events++; + cubic_data->prev_max_cwnd = + cubic_data->max_cwnd; + cubic_data->max_cwnd = cwin; + CCV(ccv, tcp_cwnd) = + CCV(ccv, tcp_cwnd_ssthresh); + } + ENTER_RECOVERY(ccv->flags); + } + break; + + case CC_ECN: + if (!IN_CONGRECOVERY(ccv->flags)) { + cubic_ssthresh_update(ccv); + cubic_data->num_cong_events++; + cubic_data->prev_max_cwnd = cubic_data->max_cwnd; + cubic_data->max_cwnd = cwin; + cubic_data->t_last_cong = gethrtime(); + CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh); + ENTER_CONGRECOVERY(ccv->flags); + } + break; + + case CC_RTO: + /* + * Grab the current time and record it so we know when the + * most recent congestion event was. Only record it when the + * timeout has fired more than once, as there is a reasonable + * chance the first one is a false alarm and may not indicate + * congestion. + */ + if (CCV(ccv, tcp_timer_backoff) >= 2) { + cubic_data->num_cong_events++; + cubic_data->t_last_cong = gethrtime(); + cubic_ssthresh_update(ccv); + cubic_data->max_cwnd = cwin; + CCV(ccv, tcp_cwnd) = mss; + } + break; + } +} + +static void +cubic_conn_init(struct cc_var *ccv) +{ + struct cubic *cubic_data; + + cubic_data = ccv->cc_data; + + /* + * Ensure we have a sane initial value for max_cwnd recorded. Without + * this here bad things happen when entries from the TCP hostcache + * get used. 
+ */ + cubic_data->max_cwnd = CCV(ccv, tcp_cwnd); +} + +/* + * Perform any necessary tasks before we exit congestion recovery. + */ +static void +cubic_post_recovery(struct cc_var *ccv) +{ + struct cubic *cubic_data; + + cubic_data = ccv->cc_data; + + /* Fast convergence heuristic. */ + if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) { + cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR) + >> CUBIC_SHIFT; + } + + if (IN_FASTRECOVERY(ccv->flags)) { + /* Update cwnd based on beta and adjusted max_cwnd. */ + CCV(ccv, tcp_cwnd) = max(1, ((CUBIC_BETA * + cubic_data->max_cwnd) >> CUBIC_SHIFT)); + } + cubic_data->t_last_cong = gethrtime(); + + /* Calculate the average RTT between congestion epochs. */ + if (cubic_data->epoch_ack_count > 0 && + cubic_data->sum_rtt_nsecs >= cubic_data->epoch_ack_count) { + cubic_data->mean_rtt_nsecs = + (cubic_data->sum_rtt_nsecs / cubic_data->epoch_ack_count); + } + + cubic_data->epoch_ack_count = 0; + cubic_data->sum_rtt_nsecs = 0; + cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss)); +} + +/* + * Record the min RTT and sum samples for the epoch average RTT calculation. + */ +static void +cubic_record_rtt(struct cc_var *ccv) +{ + struct cubic *cubic_data; + int t_srtt_nsecs; + + /* Ignore srtt until a min number of samples have been taken. */ + if (CCV(ccv, tcp_rtt_update) >= CUBIC_MIN_RTT_SAMPLES) { + cubic_data = ccv->cc_data; + /* tcp_rtt_sa is 8 * smoothed RTT in nanoseconds */ + t_srtt_nsecs = CCV(ccv, tcp_rtt_sa) >> 3; + + /* + * Record the current SRTT as our minrtt if it's the smallest + * we've seen or minrtt is currently equal to its initialized + * value. + * + * XXXLAS: Should there be some hysteresis for minrtt? 
+ */ + if ((t_srtt_nsecs < cubic_data->min_rtt_nsecs || + cubic_data->min_rtt_nsecs == TCPTV_SRTTBASE)) { + cubic_data->min_rtt_nsecs = max(1, t_srtt_nsecs); + + /* + * If the connection is within its first congestion + * epoch, ensure we prime mean_rtt_nsecs with a + * reasonable value until the epoch average RTT is + * calculated in cubic_post_recovery(). + */ + if (cubic_data->min_rtt_nsecs > + cubic_data->mean_rtt_nsecs) + cubic_data->mean_rtt_nsecs = + cubic_data->min_rtt_nsecs; + } + + /* Sum samples for epoch average RTT calculation. */ + cubic_data->sum_rtt_nsecs += t_srtt_nsecs; + cubic_data->epoch_ack_count++; + } +} + +/* + * Update the ssthresh in the event of congestion. + */ +static void +cubic_ssthresh_update(struct cc_var *ccv) +{ + struct cubic *cubic_data; + + cubic_data = ccv->cc_data; + + /* + * On the first congestion event, set ssthresh to cwnd * 0.5, on + * subsequent congestion events, set it to cwnd * beta. + */ + if (cubic_data->num_cong_events == 0) + CCV(ccv, tcp_cwnd_ssthresh) = CCV(ccv, tcp_cwnd) >> 1; + else + CCV(ccv, tcp_cwnd_ssthresh) = + (CCV(ccv, tcp_cwnd) * CUBIC_BETA) >> CUBIC_SHIFT; +} diff --git a/usr/src/uts/common/inet/cc/cc_cubic.h b/usr/src/uts/common/inet/cc/cc_cubic.h new file mode 100644 index 0000000000..c87751d257 --- /dev/null +++ b/usr/src/uts/common/inet/cc/cc_cubic.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org> + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * Copyright (c) 2017 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. + * + * This software was developed by Lawrence Stewart while studying at the Centre + * for Advanced Internet Architectures, Swinburne University of Technology, made + * possible in part by a grant from the Cisco University Research Program Fund + * at Community Foundation Silicon Valley. 
+ * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_CC_CUBIC_H_ +#define _NETINET_CC_CUBIC_H_ + +/* Number of bits of precision for fixed point math calcs. */ +#define CUBIC_SHIFT 8 + +#define CUBIC_SHIFT_4 32 + +/* 0.5 << CUBIC_SHIFT. */ +#define RENO_BETA 128 + +/* ~0.8 << CUBIC_SHIFT. */ +#define CUBIC_BETA 204 + +/* ~0.2 << CUBIC_SHIFT. */ +#define ONE_SUB_CUBIC_BETA 51 + +/* 3 * ONE_SUB_CUBIC_BETA. */ +#define THREE_X_PT2 153 + +/* (2 << CUBIC_SHIFT) - ONE_SUB_CUBIC_BETA. 
*/ +#define TWO_SUB_PT2 461 + +/* ~0.4 << CUBIC_SHIFT. */ +#define CUBIC_C_FACTOR 102 + +/* CUBIC fast convergence factor: ~0.9 << CUBIC_SHIFT. */ +#define CUBIC_FC_FACTOR 230 + +/* Don't trust s_rtt until this many rtt samples have been taken. */ +#define CUBIC_MIN_RTT_SAMPLES 8 + +/* Userland only bits. */ +#ifndef _KERNEL + +extern int hz; + +/* + * Implementation based on the formulae found in the CUBIC Internet Draft + * "draft-rhee-tcpm-cubic-02". + * + * Note BETA used in cc_cubic is equal to (1-beta) in the I-D + */ + +static __inline float +theoretical_cubic_k(double wmax_pkts) +{ + double C; + + C = 0.4; + + return (pow((wmax_pkts * 0.2) / C, (1.0 / 3.0)) * pow(2, CUBIC_SHIFT)); +} + +static __inline uint32_t +theoretical_cubic_cwnd(int ticks_since_cong, uint32_t wmax, uint32_t smss) +{ + double C, wmax_pkts; + + C = 0.4; + wmax_pkts = wmax / (double)smss; + + return (smss * (wmax_pkts + + (C * pow(ticks_since_cong / (double)hz - + theoretical_cubic_k(wmax_pkts) / pow(2, CUBIC_SHIFT), 3.0)))); +} + +static __inline uint32_t +theoretical_reno_cwnd(int ticks_since_cong, int rtt_ticks, uint32_t wmax, + uint32_t smss) +{ + + return ((wmax * 0.5) + ((ticks_since_cong / (float)rtt_ticks) * smss)); +} + +static __inline uint32_t +theoretical_tf_cwnd(int ticks_since_cong, int rtt_ticks, unsigned long wmax, + uint32_t smss) +{ + + return ((wmax * 0.8) + ((3 * 0.2) / (2 - 0.2) * + (ticks_since_cong / (float)rtt_ticks) * smss)); +} + +#endif /* !_KERNEL */ + +/* + * Compute the CUBIC K value used in the cwnd calculation, using an + * implementation of eqn 2 in the I-D. The method used + * here is adapted from Apple Computer Technical Report #KT-32. + */ +static __inline int64_t +cubic_k(uint32_t wmax_pkts) +{ + int64_t s, K; + uint16_t p; + + K = s = 0; + p = 0; + + /* (wmax * beta)/C with CUBIC_SHIFT worth of precision. */ + s = ((wmax_pkts * ONE_SUB_CUBIC_BETA) << CUBIC_SHIFT) / CUBIC_C_FACTOR; + + /* Rebase s to be between 1 and 1/8 with a shift of CUBIC_SHIFT. 
*/ + while (s >= 256) { + s >>= 3; + p++; + } + + /* + * Some magic constants taken from the Apple TR with appropriate + * shifts: 275 == 1.072302 << CUBIC_SHIFT, 98 == 0.3812513 << + * CUBIC_SHIFT, 120 == 0.46946116 << CUBIC_SHIFT. + */ + K = (((s * 275) >> CUBIC_SHIFT) + 98) - + (((s * s * 120) >> CUBIC_SHIFT) >> CUBIC_SHIFT); + + /* Multiply by 2^p to undo the rebasing of s from above. */ + return (K <<= p); +} + +/* + * Compute the new cwnd value using an implementation of eqn 1 from the I-D. + * Thanks to Kip Macy for help debugging this function. + * + * XXXLAS: Characterise bounds for overflow. + */ +static __inline uint32_t +cubic_cwnd(hrtime_t nsecs_since_cong, uint32_t wmax, uint32_t smss, int64_t K) +{ + int64_t t, cwnd; + + /* + * Convert nsecs_since_cong to milliseconds, with CUBIC_SHIFT worth + * of precision. + */ + t = NSEC2MSEC(nsecs_since_cong << CUBIC_SHIFT); + + /* + * K is the time period in seconds that it will take to reach wmax. The + * value is kept in fixed point form with CUBIC_SHIFT worth of + * precision. + * + * For comparison with t, we convert K to milliseconds, and then convert + * the result back to seconds. + * + * cwnd = t - K, with CUBIC_SHIFT worth of precision. + */ + cwnd = (t - K * MILLISEC) / MILLISEC; + + /* cwnd = (t - K)^3, with CUBIC_SHIFT^3 worth of precision. */ + cwnd *= (cwnd * cwnd); + + /* + * C(t - K)^3 + wmax + * The down shift by CUBIC_SHIFT_4 is because cwnd has 4 lots of + * CUBIC_SHIFT included in the value. 3 from the cubing of cwnd above, + * and an extra from multiplying through by CUBIC_C_FACTOR. + */ + cwnd = ((cwnd * CUBIC_C_FACTOR * smss) >> CUBIC_SHIFT_4) + wmax; + + return ((uint32_t)cwnd); +} + +/* + * Compute an approximation of the "TCP friendly" cwnd some number of + * nanoseconds after a congestion event that is designed to yield the same + * average cwnd as NewReno while using CUBIC's beta of 0.8. 
RTT should be the + * average RTT estimate for the path measured over the previous congestion + * epoch and wmax is the value of cwnd at the last congestion event. + */ +static __inline uint32_t +tf_cwnd(hrtime_t nsecs_since_cong, hrtime_t rtt_nsecs, uint32_t wmax, + uint32_t smss) +{ + + /* Equation 4 of I-D. */ + return (((wmax * CUBIC_BETA) + (((THREE_X_PT2 * nsecs_since_cong * + smss) << CUBIC_SHIFT) / TWO_SUB_PT2 / rtt_nsecs)) >> CUBIC_SHIFT); +} + +#endif /* _NETINET_CC_CUBIC_H_ */ diff --git a/usr/src/uts/common/inet/cc/cc_module.h b/usr/src/uts/common/inet/cc/cc_module.h new file mode 100644 index 0000000000..d0d6c83c36 --- /dev/null +++ b/usr/src/uts/common/inet/cc/cc_module.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> + * All rights reserved. + * Copyright (c) 2017 by Delphix. All rights reserved. + * + * This software was developed by Lawrence Stewart while studying at the Centre + * for Advanced Internet Architectures, Swinburne University of Technology, made + * possible in part by a grant from the Cisco University Research Program Fund + * at Community Foundation Silicon Valley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This software was first released in 2009 by Lawrence Stewart as part of the + * NewTCP research project at Swinburne University of Technology's Centre for + * Advanced Internet Architectures, Melbourne, Australia, which was made + * possible in part by a grant from the Cisco University Research Program Fund + * at Community Foundation Silicon Valley. More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#ifndef _NETINET_CC_MODULE_H_ +#define _NETINET_CC_MODULE_H_ + +#define CCV(ccv, what) (ccv)->ccvc.tcp->what +#define CCSV(ccv, what) (ccv)->ccvc.tcp->tcp_tcps->what +#define CCV_PROTO(ccv) (ccv)->ccvc.tcp + +#define CC_ABC(ccv) (ccv)->ccvc.tcp->tcp_tcps->tcps_abc +#define CC_ABC_L_VAR(ccv) (ccv)->ccvc.tcp->tcp_tcps->tcps_abc_l_var + +#define TCPTV_SRTTBASE 0 + +#endif /* _NETINET_CC_MODULE_H_ */ diff --git a/usr/src/uts/common/inet/cc/cc_newreno.c b/usr/src/uts/common/inet/cc/cc_newreno.c new file mode 100644 index 0000000000..ceb76d8643 --- /dev/null +++ b/usr/src/uts/common/inet/cc/cc_newreno.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. + * Copyright (c) 2007-2008,2010 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org> + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. 
+ * Copyright (c) 2017 by Delphix. All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Lawrence Stewart, James + * Healy and David Hayes, made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+/*
+ * This software was first released in 2007 by James Healy and Lawrence Stewart
+ * whilst working on the NewTCP research project at Swinburne University of
+ * Technology's Centre for Advanced Internet Architectures, Melbourne,
+ * Australia, which was made possible in part by a grant from the Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ * More details are available at:
+ *   http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#include <sys/errno.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/cc.h>
+#include <inet/cc/cc_module.h>
+
+static void	newreno_ack_received(struct cc_var *ccv, uint16_t type);
+static void	newreno_after_idle(struct cc_var *ccv);
+static void	newreno_cong_signal(struct cc_var *ccv, uint32_t type);
+static void	newreno_post_recovery(struct cc_var *ccv);
+
+static struct modlmisc cc_newreno_modlmisc = {
+	&mod_miscops,
+	"New Reno Congestion Control"
+};
+
+static struct modlinkage cc_newreno_modlinkage = {
+	MODREV_1,
+	&cc_newreno_modlmisc,
+	NULL
+};
+
+struct cc_algo newreno_cc_algo = {
+	.name		= "newreno",
+	.ack_received	= newreno_ack_received,
+	.after_idle	= newreno_after_idle,
+	.cong_signal	= newreno_cong_signal,
+	.post_recovery	= newreno_post_recovery,
+};
+
+int
+_init(void)
+{
+	int err;
+
+	if ((err = cc_register_algo(&newreno_cc_algo)) == 0) {
+		if ((err = mod_install(&cc_newreno_modlinkage)) != 0)
+			(void) cc_deregister_algo(&newreno_cc_algo);
+	}
+	return (err);
+}
+
+int
+_fini(void)
+{
+	/* XXX Not unloadable for now */
+	return (EBUSY);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&cc_newreno_modlinkage, modinfop));
+}
+
+static void
+newreno_ack_received(struct cc_var *ccv, uint16_t type)
+{
+	if (type == CC_ACK && !IN_RECOVERY(ccv->flags) &&
+	    (ccv->flags & CCF_CWND_LIMITED)) {
+		uint_t cw = CCV(ccv, tcp_cwnd);
+		uint_t incr = CCV(ccv, tcp_mss);
+
+		/*
+		 * Regular in-order ACK, open the congestion window.
+		 * Method depends on which congestion control state we're
+		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
+		 * enabled.
+		 *
+		 * slow start: cwnd <= ssthresh
+		 * cong avoid: cwnd > ssthresh
+		 *
+		 * slow start and ABC (RFC 3465):
+		 *   Grow cwnd exponentially by the amount of data
+		 *   ACKed capping the max increment per ACK to
+		 *   (abc_l_var * maxseg) bytes.
+		 *
+		 * slow start without ABC (RFC 5681):
+		 *   Grow cwnd exponentially by maxseg per ACK.
+		 *
+		 * cong avoid and ABC (RFC 3465):
+		 *   Grow cwnd linearly by maxseg per RTT for each
+		 *   cwnd worth of ACKed data.
+		 *
+		 * cong avoid without ABC (RFC 5681):
+		 *   Grow cwnd linearly by approximately maxseg per RTT using
+		 *   maxseg^2 / cwnd per ACK as the increment.
+		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
+		 *   avoid capping cwnd.
+		 */
+		if (cw > CCV(ccv, tcp_cwnd_ssthresh)) {
+			if (CC_ABC(ccv)) {
+				if (ccv->flags & CCF_ABC_SENTAWND)
+					ccv->flags &= ~CCF_ABC_SENTAWND;
+				else
+					incr = 0;
+			} else
+				incr = max((incr * incr / cw), 1);
+		} else if (CC_ABC(ccv)) {
+			/*
+			 * In slow-start with ABC enabled and no RTO in sight?
+			 * (Must not use abc_l_var > 1 if slow starting after
+			 * an RTO.)
+			 */
+			if (ccv->flags & CCF_RTO) {
+				incr = min(ccv->bytes_this_ack,
+				    CCV(ccv, tcp_mss));
+			} else {
+				incr = min(ccv->bytes_this_ack,
+				    CC_ABC_L_VAR(ccv) * CCV(ccv, tcp_mss));
+			}
+
+		}
+		/* ABC is on by default, so incr equals 0 frequently. */
+		if (incr > 0)
+			CCV(ccv, tcp_cwnd) = min(cw + incr,
+			    TCP_MAXWIN << CCV(ccv, tcp_snd_ws));
+	}
+}
+
+static void
+newreno_after_idle(struct cc_var *ccv)
+{
+	int rw;
+
+	/*
+	 * If we've been idle for more than one retransmit timeout the old
+	 * congestion window is no longer current and we have to reduce it to
+	 * the restart window before we can transmit again.
+	 *
+	 * The restart window is the initial window or the last CWND, whichever
+	 * is smaller.
+	 *
+	 * This is done to prevent us from flooding the path with a full CWND at
+	 * wirespeed, overloading router and switch buffers along the way.
+	 *
+	 * See RFC5681 Section 4.1. "Restarting Idle Connections".
+	 */
+	if (CCV(ccv, tcp_init_cwnd) != 0) {
+		/*
+		 * The TCP_INIT_CWND socket option was used to override the
+		 * default.
+		 */
+		rw = CCV(ccv, tcp_init_cwnd) * CCV(ccv, tcp_mss);
+	} else if (CCSV(ccv, tcps_slow_start_initial) != 0) {
+		/* The _slow_start_initial tunable was explicitly set. */
+		rw = min(TCP_MAX_INIT_CWND, CCSV(ccv, tcps_slow_start_initial))
+		    * CCV(ccv, tcp_mss);
+	} else {
+		/* Do RFC 3390 */
+		rw = min(4 * CCV(ccv, tcp_mss),
+		    max(2 * CCV(ccv, tcp_mss), 4380));
+	}
+
+	CCV(ccv, tcp_cwnd) = min(rw, CCV(ccv, tcp_cwnd));
+}
+
+/*
+ * Perform any necessary tasks before we enter congestion recovery.
+ *
+ * The new ssthresh is half of the outstanding data (tcp_snxt - tcp_suna) and
+ * the reduced window is half of the current cwnd; both are rounded down to a
+ * whole number of segments and are never less than two segments.  On CC_RTO
+ * the window is set to a single segment instead.
+ */
+static void
+newreno_cong_signal(struct cc_var *ccv, uint32_t type)
+{
+	uint32_t cwin, ssthresh_on_loss;
+	uint32_t mss;
+
+	cwin = CCV(ccv, tcp_cwnd);
+	mss = CCV(ccv, tcp_mss);
+	ssthresh_on_loss =
+	    max((CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna)) / 2 / mss, 2)
+	    * mss;
+
+	/* Catch algos which mistakenly leak private signal types. */
+	ASSERT((type & CC_SIGPRIVMASK) == 0);
+
+	cwin = max(cwin / 2 / mss, 2) * mss;
+
+	switch (type) {
+	case CC_NDUPACK:
+		if (!IN_FASTRECOVERY(ccv->flags)) {
+			if (!IN_CONGRECOVERY(ccv->flags)) {
+				CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
+				CCV(ccv, tcp_cwnd) = cwin;
+			}
+			ENTER_RECOVERY(ccv->flags);
+		}
+		break;
+	case CC_ECN:
+		if (!IN_CONGRECOVERY(ccv->flags)) {
+			CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
+			CCV(ccv, tcp_cwnd) = cwin;
+			ENTER_CONGRECOVERY(ccv->flags);
+		}
+		break;
+	case CC_RTO:
+		CCV(ccv, tcp_cwnd_ssthresh) = ssthresh_on_loss;
+		CCV(ccv, tcp_cwnd) = mss;
+		break;
+	}
+}
+
+/*
+ * Perform any necessary tasks before we exit congestion recovery.
+ *
+ * When leaving fast recovery, cwnd is clamped so that it does not exceed
+ * ssthresh.
+ */
+static void
+newreno_post_recovery(struct cc_var *ccv)
+{
+	if (IN_FASTRECOVERY(ccv->flags)) {
+		/*
+		 * Fast recovery will conclude after returning from this
+		 * function.
+		 */
+		if (CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh)) {
+			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
+		}
+	}
+}
diff --git a/usr/src/uts/common/inet/cc/cc_sunreno.c b/usr/src/uts/common/inet/cc/cc_sunreno.c
new file mode 100644
index 0000000000..0a7a05206f
--- /dev/null
+++ b/usr/src/uts/common/inet/cc/cc_sunreno.c
@@ -0,0 +1,222 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * The TCP congestion control algorithm extracted from the pre-framework
+ * implementation of TCP congestion control.
+ */
+
+#include <sys/errno.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/cc.h>
+#include <inet/cc/cc_module.h>
+
+static void	sunreno_ack_received(struct cc_var *ccv, uint16_t type);
+static void	sunreno_after_idle(struct cc_var *ccv);
+static void	sunreno_cong_signal(struct cc_var *ccv, uint32_t type);
+static void	sunreno_post_recovery(struct cc_var *ccv);
+
+#define	CC_SUNRENO_ALGO_NAME "sunreno"
+
+static struct modlmisc cc_sunreno_modlmisc = {
+	&mod_miscops,
+	"SUNReno Congestion Control"
+};
+
+static struct modlinkage cc_sunreno_modlinkage = {
+	MODREV_1,
+	&cc_sunreno_modlmisc,
+	NULL
+};
+
+struct cc_algo sunreno_cc_algo = {
+	.name		= CC_SUNRENO_ALGO_NAME,
+	.ack_received	= sunreno_ack_received,
+	.after_idle	= sunreno_after_idle,
+	.cong_signal	= sunreno_cong_signal,
+	.post_recovery	= sunreno_post_recovery,
+};
+
+int
+_init(void)
+{
+	int err;
+
+	if ((err = cc_register_algo(&sunreno_cc_algo)) == 0) {
+		if ((err = mod_install(&cc_sunreno_modlinkage)) != 0)
+			(void) cc_deregister_algo(&sunreno_cc_algo);
+	}
+	return (err);
+}
+
+int
+_fini(void)
+{
+	return (EBUSY);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&cc_sunreno_modlinkage, modinfop));
+}
+
+static void
+sunreno_ack_received(struct cc_var *ccv, uint16_t type)
+{
+	uint32_t add;
+	uint32_t cwnd;
+	int mss;
+
+	if (type == CC_ACK && !IN_RECOVERY(ccv->flags)) {
+		mss = CCV(ccv, tcp_mss);
+		cwnd = CCV(ccv, tcp_cwnd);
+		add = mss;
+
+		if (cwnd >= CCV(ccv, tcp_cwnd_ssthresh)) {
+			/*
+			 * This is to prevent an increase of less than 1 MSS of
+			 * tcp_cwnd.  With partial increase, tcp_wput_data()
+			 * may send out tinygrams in order to preserve mblk
+			 * boundaries.
+			 *
+			 * By initializing tcp_cwnd_cnt to the new tcp_cwnd and
+			 * decrementing it by 1 MSS for every ACK, tcp_cwnd is
+			 * increased by 1 MSS once per RTT.
+			 */
+			if (CCV(ccv, tcp_cwnd_cnt) <= 0) {
+				CCV(ccv, tcp_cwnd_cnt) = cwnd + add;
+			} else {
+				CCV(ccv, tcp_cwnd_cnt) -= add;
+				add = 0;
+			}
+		}
+		CCV(ccv, tcp_cwnd) = MIN(cwnd + add, CCV(ccv, tcp_cwnd_max));
+	}
+}
+
+static void
+sunreno_after_idle(struct cc_var *ccv)
+{
+	int32_t num_sack_blk = 0;
+	int mss;
+
+	if (CCV(ccv, tcp_snd_sack_ok) && CCV(ccv, tcp_num_sack_blk) > 0) {
+		int32_t opt_len;
+
+		num_sack_blk = MIN(CCV(ccv, tcp_max_sack_blk),
+		    CCV(ccv, tcp_num_sack_blk));
+		opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
+		    2 + TCPOPT_HEADER_LEN;
+		mss = CCV(ccv, tcp_mss) - opt_len;
+	} else {
+		mss = CCV(ccv, tcp_mss);
+	}
+
+	TCP_SET_INIT_CWND(CCV_PROTO(ccv), mss,
+	    CCSV(ccv, tcps_slow_start_after_idle));
+}
+
+/*
+ * Perform any necessary tasks before we enter congestion recovery.
+ *
+ * npkt is half of the outstanding data (tcp_snxt - tcp_suna) expressed in
+ * whole segments; ssthresh is never set below two segments.
+ */
+static void
+sunreno_cong_signal(struct cc_var *ccv, uint32_t type)
+{
+	int npkt;
+	int mss;
+
+	/* Catch algos which mistakenly leak private signal types. */
+	ASSERT((type & CC_SIGPRIVMASK) == 0);
+
+	mss = CCV(ccv, tcp_mss);
+	npkt = ((CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna)) >> 1) / mss;
+
+	switch (type) {
+	case CC_NDUPACK:
+		if (!IN_FASTRECOVERY(ccv->flags)) {
+			if (!IN_CONGRECOVERY(ccv->flags)) {
+				CCV(ccv, tcp_cwnd_ssthresh) = MAX(npkt, 2) *
+				    mss;
+				CCV(ccv, tcp_cwnd) = (npkt +
+				    CCV(ccv, tcp_dupack_cnt)) * mss;
+			}
+			ENTER_RECOVERY(ccv->flags);
+		}
+		break;
+	case CC_ECN:
+		if (!IN_CONGRECOVERY(ccv->flags) && !CCV(ccv, tcp_cwr)) {
+			CCV(ccv, tcp_cwnd_ssthresh) = MAX(npkt, 2) * mss;
+			CCV(ccv, tcp_cwnd) = npkt * mss;
+			if (CCV(ccv, tcp_cwnd) == 0) {
+				/*
+				 * This makes sure that when the ACK comes
+				 * back, we will increase tcp_cwnd by 1 MSS.
+				 */
+				CCV(ccv, tcp_cwnd_cnt) = 0;
+			}
+			ENTER_CONGRECOVERY(ccv->flags);
+		}
+		break;
+	case CC_RTO:
+		/*
+		 * After retransmission, we need to do slow start.  Set the
+		 * ssthresh to one half of current effective window and cwnd to
+		 * one MSS.  Also reset tcp_cwnd_cnt.
+		 *
+		 * Note that if tcp_ssthresh is reduced because of ECN, do not
+		 * reduce it again unless it is already one window of data away
+		 * (tcp_cwr should then be cleared) or this is a timeout for a
+		 * retransmitted segment.
+		 */
+		if (!CCV(ccv, tcp_cwr) || CCV(ccv, tcp_rexmit)) {
+			if (CCV(ccv, tcp_timer_backoff) != 0)
+				npkt = CCV(ccv, tcp_cwnd_ssthresh) / 2 / mss;
+			CCV(ccv, tcp_cwnd_ssthresh) = MAX(npkt, 2) * mss;
+		}
+		CCV(ccv, tcp_cwnd) = mss;
+		CCV(ccv, tcp_cwnd_cnt) = 0;
+		break;
+	}
+}
+
+/*
+ * Perform any necessary tasks before we exit congestion recovery.
+ */
+static void
+sunreno_post_recovery(struct cc_var *ccv)
+{
+	/*
+	 * Restore the congestion window back to ssthresh as per RFC 5681
+	 * section 3.2.  tcp_cwnd_cnt is reset whether or not the connection
+	 * was in fast recovery.
+	 */
+	if (IN_FASTRECOVERY(ccv->flags)) {
+		if (CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh)) {
+			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
+		}
+	}
+	CCV(ccv, tcp_cwnd_cnt) = 0;
+}
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index c81331dc9f..5090f88a97 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -95,6 +95,7 @@
 #include <netinet/igmp.h>
 #include <netinet/ip_mroute.h>
 #include <inet/ipp_common.h>
+#include <inet/cc.h>
 #include <net/pfkeyv2.h>
 #include <inet/sadb.h>
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index 7e3910e894..5058412c32 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -22,7 +22,7 @@
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
  */
 /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -46,6 +46,7 @@ extern "C" { #include <inet/mib2.h> #include <inet/tcp_stack.h> #include <inet/tcp_sack.h> +#include <inet/cc.h> /* TCP states */ #define TCPS_CLOSED -6 @@ -152,6 +153,9 @@ typedef struct tcp_s { struct conn_s *tcp_connp; /* back pointer to conn_t */ tcp_stack_t *tcp_tcps; /* back pointer to tcp_stack_t */ + struct cc_algo *tcp_cc_algo; /* congestion control algorithm */ + struct cc_var tcp_ccv; /* congestion control specific vars */ + int32_t tcp_state; int32_t tcp_rcv_ws; /* My window scale power */ int32_t tcp_snd_ws; /* Sender's window scale power */ @@ -503,10 +507,10 @@ typedef struct tcp_s { #endif extern void tcp_conn_reclaim(void *); -extern void tcp_free(tcp_t *tcp); +extern void tcp_free(tcp_t *tcp); extern void tcp_ddi_g_init(void); extern void tcp_ddi_g_destroy(void); -extern void *tcp_get_conn(void *arg, tcp_stack_t *); +extern conn_t *tcp_get_conn(void *arg, tcp_stack_t *); extern mblk_t *tcp_snmp_get(queue_t *, mblk_t *, boolean_t); extern int tcp_snmp_set(queue_t *, int, int, uchar_t *, int len); diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index d7458c8eee..bfa08ada8c 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -23,7 +23,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, Joyent Inc. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -74,6 +74,7 @@ #include <inet/ipsec_impl.h> #include <inet/common.h> +#include <inet/cc.h> #include <inet/ip.h> #include <inet/ip_impl.h> #include <inet/ip6.h> @@ -1409,6 +1410,10 @@ tcp_free(tcp_t *tcp) */ tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); + /* Allow the CC algorithm to clean up after itself. */ + if (tcp->tcp_cc_algo != NULL && tcp->tcp_cc_algo->cb_destroy != NULL) + tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv); + /* * If this is a non-STREAM socket still holding on to an upper * handle, release it. As a result of fallback we might also see @@ -1455,7 +1460,7 @@ tcp_free(tcp_t *tcp) * collector will free up the freelist is the connection ends up sitting * there for too long. */ -void * +conn_t * tcp_get_conn(void *arg, tcp_stack_t *tcps) { tcp_t *tcp = NULL; @@ -1494,7 +1499,7 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps) connp->conn_recv = tcp_input_data; ASSERT(connp->conn_recvicmp == tcp_icmp_input); ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); - return ((void *)connp); + return (connp); } mutex_exit(&tcp_time_wait->tcp_time_wait_lock); /* @@ -1529,7 +1534,7 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps) connp->conn_ixa->ixa_notify = tcp_notify; connp->conn_ixa->ixa_notify_cookie = tcp; - return ((void *)connp); + return (connp); } /* @@ -2298,6 +2303,11 @@ tcp_reinit_values(tcp_t *tcp) ASSERT(tcp->tcp_listen_cnt == NULL); ASSERT(tcp->tcp_reass_tid == 0); + /* Allow the CC algorithm to clean up after itself. 
*/ + if (tcp->tcp_cc_algo->cb_destroy != NULL) + tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv); + tcp->tcp_cc_algo = NULL; + #undef DONTCARE #undef PRESERVE } @@ -2318,7 +2328,12 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) (connp->conn_ipversion == IPV4_VERSION || connp->conn_ipversion == IPV6_VERSION))); + tcp->tcp_ccv.type = IPPROTO_TCP; + tcp->tcp_ccv.ccvc.tcp = tcp; + if (parent == NULL) { + tcp->tcp_cc_algo = tcps->tcps_default_cc_algo; + tcp->tcp_naglim = tcps->tcps_naglim_def; tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial; @@ -2346,6 +2361,8 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) */ } else { /* Inherit various TCP parameters from the parent. */ + tcp->tcp_cc_algo = parent->tcp_cc_algo; + tcp->tcp_naglim = parent->tcp_naglim; tcp->tcp_rto_initial = parent->tcp_rto_initial; @@ -2372,6 +2389,9 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) tcp->tcp_init_cwnd = parent->tcp_init_cwnd; } + if (tcp->tcp_cc_algo->cb_init != NULL) + VERIFY(tcp->tcp_cc_algo->cb_init(&tcp->tcp_ccv) == 0); + /* * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO * will be close to tcp_rexmit_interval_initial. By doing this, we @@ -2616,7 +2636,7 @@ tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket, } sqp = IP_SQUEUE_GET((uint_t)gethrtime()); - connp = (conn_t *)tcp_get_conn(sqp, tcps); + connp = tcp_get_conn(sqp, tcps); /* * Both tcp_get_conn and netstack_find_by_cred incremented refcnt, * so we drop it by one. 
@@ -3807,6 +3827,9 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
 	list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t),
 	    offsetof(tcp_listener_t, tl_link));
+	tcps->tcps_default_cc_algo = cc_load_algo(CC_DEFAULT_ALGO_NAME);
+	VERIFY3P(tcps->tcps_default_cc_algo, !=, NULL);
+
 	return (tcps);
 }
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index dd50c3f6ad..f7ea79da15 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -170,6 +170,133 @@ static void tcp_set_rto(tcp_t *, hrtime_t);
 static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
 /*
+ * CC wrapper hook functions
+ *
+ * cc_ack_received() updates the per-connection cc_var bookkeeping for an
+ * incoming ACK and then calls the algorithm's ack_received hook, if any:
+ *
+ *   - bytes_this_ack is set to bytes_acked.
+ *   - CCF_CWND_LIMITED is set when tcp_cwnd <= tcp_swnd, cleared otherwise.
+ *   - For CC_ACK with tcp_cwnd above tcp_cwnd_ssthresh: CCF_RTO is cleared
+ *     and t_bytes_acked grows by min(bytes_this_ack, tcps_abc_l_var * MSS).
+ *     Once it reaches tcp_cwnd, tcp_cwnd is subtracted from it and
+ *     CCF_ABC_SENTAWND is set.
+ *   - For CC_ACK otherwise: CCF_ABC_SENTAWND is cleared and t_bytes_acked
+ *     is zeroed.
+ *
+ * The cwnd__cc__ack__received probe reports tcp_cwnd before and after.
+ */
+static void
+cc_ack_received(tcp_t *tcp, uint32_t seg_ack, int32_t bytes_acked,
+    uint16_t type)
+{
+	uint32_t old_cwnd = tcp->tcp_cwnd;
+
+	tcp->tcp_ccv.bytes_this_ack = bytes_acked;
+	if (tcp->tcp_cwnd <= tcp->tcp_swnd)
+		tcp->tcp_ccv.flags |= CCF_CWND_LIMITED;
+	else
+		tcp->tcp_ccv.flags &= ~CCF_CWND_LIMITED;
+
+	if (type == CC_ACK) {
+		if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
+			if (tcp->tcp_ccv.flags & CCF_RTO)
+				tcp->tcp_ccv.flags &= ~CCF_RTO;
+
+			tcp->tcp_ccv.t_bytes_acked +=
+			    min(tcp->tcp_ccv.bytes_this_ack,
+			    tcp->tcp_tcps->tcps_abc_l_var * tcp->tcp_mss);
+			if (tcp->tcp_ccv.t_bytes_acked >= tcp->tcp_cwnd) {
+				tcp->tcp_ccv.t_bytes_acked -= tcp->tcp_cwnd;
+				tcp->tcp_ccv.flags |= CCF_ABC_SENTAWND;
+			}
+		} else {
+			tcp->tcp_ccv.flags &= ~CCF_ABC_SENTAWND;
+			tcp->tcp_ccv.t_bytes_acked = 0;
+		}
+	}
+
+	if (CC_ALGO(tcp)->ack_received != NULL) {
+		/*
+		 * The FreeBSD code where this originated had a comment "Find
+		 * a way to live without this" in several places where curack
+		 * got set. If they eventually dump curack from the cc
+		 * variables, we'll need to adapt our code.
+		 */
+		tcp->tcp_ccv.curack = seg_ack;
+		CC_ALGO(tcp)->ack_received(&tcp->tcp_ccv, type);
+	}
+
+	DTRACE_PROBE3(cwnd__cc__ack__received, tcp_t *, tcp, uint32_t, old_cwnd,
+	    uint32_t, tcp->tcp_cwnd);
+}
+
+/*
+ * Report a congestion signal (CC_NDUPACK, CC_ECN or CC_RTO) to the
+ * connection's algorithm. TCP-side state is updated first, then the
+ * algorithm's cong_signal hook (if any) is called with curack set to
+ * seg_ack.
+ *
+ *   CC_NDUPACK (not in fast recovery) and CC_ECN (not in congestion
+ *       recovery): tcp_rexmit_max is set to tcp_snxt; when tcp_ecn_ok,
+ *       tcp_cwr_snd_max is set to tcp_snxt, tcp_cwr is set and
+ *       tcp_ecn_cwr_sent is cleared.
+ *   CC_RTO: CCF_RTO is set, tcp_dupack_cnt and t_bytes_acked are zeroed and
+ *       any recovery state is exited. If the algorithm has no cong_signal
+ *       hook, the RFC 5681 defaults are applied here. The ECN state is
+ *       updated as above when tcp_ecn_ok.
+ *
+ * The cwnd__cc__cong__signal probe reports tcp_cwnd and tcp_cwnd_ssthresh
+ * before and after, plus the signal type.
+ *
+ * NOTE(review): tcp_timer() calls this as cc_cong_signal(tcp, NULL, CC_RTO);
+ * seg_ack is a uint32_t, so a plain 0 looks like what is meant -- confirm.
+ */
+void
+cc_cong_signal(tcp_t *tcp, uint32_t seg_ack, uint32_t type)
+{
+	uint32_t old_cwnd = tcp->tcp_cwnd;
+	uint32_t old_cwnd_ssthresh = tcp->tcp_cwnd_ssthresh;
+	switch (type) {
+	case CC_NDUPACK:
+		if (!IN_FASTRECOVERY(tcp->tcp_ccv.flags)) {
+			tcp->tcp_rexmit_max = tcp->tcp_snxt;
+			if (tcp->tcp_ecn_ok) {
+				tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+				tcp->tcp_cwr = B_TRUE;
+				tcp->tcp_ecn_cwr_sent = B_FALSE;
+			}
+		}
+		break;
+	case CC_ECN:
+		if (!IN_CONGRECOVERY(tcp->tcp_ccv.flags)) {
+			tcp->tcp_rexmit_max = tcp->tcp_snxt;
+			if (tcp->tcp_ecn_ok) {
+				tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+				tcp->tcp_cwr = B_TRUE;
+				tcp->tcp_ecn_cwr_sent = B_FALSE;
+			}
+		}
+		break;
+	case CC_RTO:
+		tcp->tcp_ccv.flags |= CCF_RTO;
+		tcp->tcp_dupack_cnt = 0;
+		tcp->tcp_ccv.t_bytes_acked = 0;
+		/*
+		 * Give up on fast recovery and congestion recovery if we were
+		 * attempting either.
+		 */
+		EXIT_RECOVERY(tcp->tcp_ccv.flags);
+		if (CC_ALGO(tcp)->cong_signal == NULL) {
+			/*
+			 * RFC5681 Section 3.1
+			 * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4)
+			 */
+			tcp->tcp_cwnd_ssthresh = max(
+			    (tcp->tcp_snxt - tcp->tcp_suna) / 2 / tcp->tcp_mss,
+			    2) * tcp->tcp_mss;
+			tcp->tcp_cwnd = tcp->tcp_mss;
+		}
+
+		if (tcp->tcp_ecn_ok) {
+			tcp->tcp_cwr = B_TRUE;
+			tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+			tcp->tcp_ecn_cwr_sent = B_FALSE;
+		}
+		break;
+	}
+
+	if (CC_ALGO(tcp)->cong_signal != NULL) {
+		tcp->tcp_ccv.curack = seg_ack;
+		CC_ALGO(tcp)->cong_signal(&tcp->tcp_ccv, type);
+	}
+
+	DTRACE_PROBE6(cwnd__cc__cong__signal, tcp_t *, tcp, uint32_t, old_cwnd,
+	    uint32_t, tcp->tcp_cwnd, uint32_t, old_cwnd_ssthresh,
+	    uint32_t, tcp->tcp_cwnd_ssthresh, uint32_t, type);
+}
+
+/*
+ * Tell the algorithm that recovery is ending: call its post_recovery hook
+ * (if any) with curack set to seg_ack, then zero t_bytes_acked. The
+ * cwnd__cc__post__recovery probe reports tcp_cwnd before and after.
+ */
+static void
+cc_post_recovery(tcp_t *tcp, uint32_t seg_ack)
+{
+	uint32_t old_cwnd = tcp->tcp_cwnd;
+
+	if (CC_ALGO(tcp)->post_recovery != NULL) {
+		tcp->tcp_ccv.curack = seg_ack;
+		CC_ALGO(tcp)->post_recovery(&tcp->tcp_ccv);
+	}
+	tcp->tcp_ccv.t_bytes_acked = 0;
+
+	DTRACE_PROBE3(cwnd__cc__post__recovery, tcp_t *, tcp,
+	    uint32_t, old_cwnd, uint32_t, tcp->tcp_cwnd);
+}
+
+/*
  * Set the MSS associated with a particular tcp based on its current value,
  * and a new one passed in. Observe minimums and maximums, and reset other
  * state variables that we want to view as multiples of MSS.
@@ -548,6 +675,9 @@ tcp_process_options(tcp_t *tcp, tcpha_t *tcpha)
 	 * updated properly.
*/ TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); + + if (tcp->tcp_cc_algo->conn_init != NULL) + tcp->tcp_cc_algo->conn_init(&tcp->tcp_ccv); } /* @@ -1405,7 +1535,7 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) ASSERT(ira->ira_sqp != NULL); new_sqp = ira->ira_sqp; - econnp = (conn_t *)tcp_get_conn(arg2, tcps); + econnp = tcp_get_conn(arg2, tcps); if (econnp == NULL) goto error2; @@ -2324,8 +2454,6 @@ tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) ip_pkt_t ipp; boolean_t ofo_seg = B_FALSE; /* Out of order segment */ uint32_t cwnd; - uint32_t add; - int npkt; int mss; conn_t *connp = (conn_t *)arg; squeue_t *sqp = (squeue_t *)arg2; @@ -2601,6 +2729,9 @@ tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) * draft-floyd-incr-init-win-01.txt, * Increasing TCP's Initial Window. */ + DTRACE_PROBE3(cwnd__retransmitted__syn, + tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, + uint32_t, tcp->tcp_mss); tcp->tcp_cwnd = tcp->tcp_mss; } @@ -3823,6 +3954,9 @@ process_ack: tcp->tcp_rexmit_nxt = tcp->tcp_snxt; tcp->tcp_rexmit_max = tcp->tcp_snxt; tcp->tcp_ms_we_have_waited = 0; + DTRACE_PROBE3(cwnd__retransmitted__syn, + tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, + uint32_t, tcp->tcp_mss); tcp->tcp_cwnd = mss; } @@ -3866,33 +4000,22 @@ process_ack: */ if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) tcp->tcp_cwr = B_FALSE; - if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { - if (!tcp->tcp_cwr) { - npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; - tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; - tcp->tcp_cwnd = npkt * mss; - /* - * If the cwnd is 0, use the timer to clock out - * new segments. This is required by the ECN spec. - */ - if (npkt == 0) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - /* - * This makes sure that when the ACK comes - * back, we will increase tcp_cwnd by 1 MSS. 
- */ - tcp->tcp_cwnd_cnt = 0; - } - tcp->tcp_cwr = B_TRUE; - /* - * This marks the end of the current window of in - * flight data. That is why we don't use - * tcp_suna + tcp_swnd. Only data in flight can - * provide ECN info. - */ - tcp->tcp_cwr_snd_max = tcp->tcp_snxt; - tcp->tcp_ecn_cwr_sent = B_FALSE; - } + if (tcp->tcp_ecn_ok && (flags & TH_ECE) && !tcp->tcp_cwr) { + cc_cong_signal(tcp, seg_ack, CC_ECN); + /* + * If the cwnd is 0, use the timer to clock out + * new segments. This is required by the ECN spec. + */ + if (tcp->tcp_cwnd == 0) + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + tcp->tcp_cwr = B_TRUE; + /* + * This marks the end of the current window of in + * flight data. That is why we don't use + * tcp_suna + tcp_swnd. Only data in flight can + * provide ECN info. + */ + tcp->tcp_cwr_snd_max = tcp->tcp_snxt; } mp1 = tcp->tcp_xmit_head; @@ -3914,6 +4037,8 @@ process_ack: /* Do Limited Transmit */ if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < tcps->tcps_dupack_fast_retransmit) { + cc_ack_received(tcp, seg_ack, + bytes_acked, CC_DUPACK); /* * RFC 3042 * @@ -3960,12 +4085,10 @@ process_ack: * dropped (due to congestion.) */ if (!tcp->tcp_cwr) { - npkt = ((tcp->tcp_snxt - - tcp->tcp_suna) >> 1) / mss; - tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * - mss; - tcp->tcp_cwnd = (npkt + - tcp->tcp_dupack_cnt) * mss; + cc_cong_signal(tcp, seg_ack, + CC_NDUPACK); + cc_ack_received(tcp, seg_ack, + bytes_acked, CC_DUPACK); } if (tcp->tcp_ecn_ok) { tcp->tcp_cwr = B_TRUE; @@ -4027,6 +4150,8 @@ process_ack: } /* tcp_snd_sack_ok */ } else { + cc_ack_received(tcp, seg_ack, + bytes_acked, CC_DUPACK); /* * Here we perform congestion * avoidance, but NOT slow start. 
@@ -4048,6 +4173,10 @@ process_ack: cwnd = tcp->tcp_cwnd + mss; if (cwnd > tcp->tcp_cwnd_max) cwnd = tcp->tcp_cwnd_max; + DTRACE_PROBE3(cwnd__fast__recovery, + tcp_t *, tcp, + uint32_t, tcp->tcp_cwnd, + uint32_t, cwnd); tcp->tcp_cwnd = cwnd; if (tcp->tcp_unsent > 0) flags |= TH_XMIT_NEEDED; @@ -4180,15 +4309,10 @@ process_ack: ASSERT(tcp->tcp_rexmit == B_FALSE); if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { tcp->tcp_dupack_cnt = 0; - /* - * Restore the orig tcp_cwnd_ssthresh after - * fast retransmit phase. - */ - if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { - tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; - } + + cc_post_recovery(tcp, seg_ack); + tcp->tcp_rexmit_max = seg_ack; - tcp->tcp_cwnd_cnt = 0; /* * Remove all notsack info to avoid confusion with @@ -4217,8 +4341,12 @@ process_ack: * aggressive behaviour in sending new * segments. */ - tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + + cwnd = tcp->tcp_cwnd_ssthresh + tcps->tcps_dupack_fast_retransmit * mss; + DTRACE_PROBE3(cwnd__fast__retransmit__part__ack, + tcp_t *, tcp, uint32_t, tcp->tcp_cwnd, + uint32_t, cwnd); + tcp->tcp_cwnd = cwnd; tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; flags |= TH_REXMIT_NEEDED; } @@ -4279,28 +4407,10 @@ process_ack: * usual. */ if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { - cwnd = tcp->tcp_cwnd; - add = mss; - - if (cwnd >= tcp->tcp_cwnd_ssthresh) { - /* - * This is to prevent an increase of less than 1 MSS of - * tcp_cwnd. With partial increase, tcp_wput_data() - * may send out tinygrams in order to preserve mblk - * boundaries. - * - * By initializing tcp_cwnd_cnt to new tcp_cwnd and - * decrementing it by 1 MSS for every ACKs, tcp_cwnd is - * increased by 1 MSS for every RTTs. 
- */ - if (tcp->tcp_cwnd_cnt <= 0) { - tcp->tcp_cwnd_cnt = cwnd + add; - } else { - tcp->tcp_cwnd_cnt -= add; - add = 0; - } + if (IN_RECOVERY(tcp->tcp_ccv.flags)) { + EXIT_RECOVERY(tcp->tcp_ccv.flags); } - tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); + cc_ack_received(tcp, seg_ack, bytes_acked, CC_ACK); } /* See if the latest urgent data has been acknowledged */ @@ -5634,6 +5744,10 @@ noticmpv4: npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / tcp->tcp_mss; tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss; + + DTRACE_PROBE3(cwnd__source__quench, tcp_t *, tcp, + uint32_t, tcp->tcp_cwnd, + uint32_t, tcp->tcp_mss); tcp->tcp_cwnd = tcp->tcp_mss; tcp->tcp_cwnd_cnt = 0; } diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c index f54ab3fb33..ae9efe863d 100644 --- a/usr/src/uts/common/inet/tcp/tcp_output.c +++ b/usr/src/uts/common/inet/tcp/tcp_output.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright 2019 Joyent, Inc. 
*/
@@ -81,6 +81,18 @@ static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *);
  */
 static int tcp_tx_pull_len = 16;
+/*
+ * CC wrapper: call the algorithm's after_idle hook, if it provides one, and
+ * report tcp_cwnd before and after through the cwnd__cc__after__idle probe.
+ *
+ * This replaces the direct TCP_SET_INIT_CWND() calls in tcp_wput_data() and
+ * tcp_output(); an algorithm without an after_idle hook leaves tcp_cwnd
+ * untouched here.
+ */
+static void
+cc_after_idle(tcp_t *tcp)
+{
+	uint32_t old_cwnd = tcp->tcp_cwnd;
+
+	if (CC_ALGO(tcp)->after_idle != NULL)
+		CC_ALGO(tcp)->after_idle(&tcp->tcp_ccv);
+
+	DTRACE_PROBE3(cwnd__cc__after__idle, tcp_t *, tcp, uint32_t, old_cwnd,
+	    uint32_t, tcp->tcp_cwnd);
+}
+
 int
 tcp_wput(queue_t *q, mblk_t *mp)
 {
@@ -219,7 +231,6 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
 	int32_t total_hdr_len;
 	int32_t tcp_hdr_len;
 	int rc;
-	tcp_stack_t *tcps = tcp->tcp_tcps;
 	conn_t *connp = tcp->tcp_connp;
 	clock_t now = LBOLT_FASTPATH;
@@ -374,7 +385,7 @@ data_null:
 	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
 	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
-		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
+		cc_after_idle(tcp);
 	}
 	if (tcpstate == TCPS_SYN_RCVD) {
 		/*
@@ -1195,7 +1206,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 	now = LBOLT_FASTPATH;
 	if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
 	    (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
-		TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle);
+		cc_after_idle(tcp);
 	}
 	usable = tcp->tcp_swnd; /* tcp window size */
diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c
index 81cf5c57a5..804160f628 100644
--- a/usr/src/uts/common/inet/tcp/tcp_timers.c
+++ b/usr/src/uts/common/inet/tcp/tcp_timers.c
@@ -23,7 +23,7 @@
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright 2011 Joyent, Inc. All rights reserved.
- * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/ #include <sys/types.h> @@ -784,36 +784,7 @@ tcp_timer(void *arg) SL_TRACE, "tcp_timer: zero win"); } } else { - /* - * After retransmission, we need to do - * slow start. Set the ssthresh to one - * half of current effective window and - * cwnd to one MSS. Also reset - * tcp_cwnd_cnt. - * - * Note that if tcp_ssthresh is reduced because - * of ECN, do not reduce it again unless it is - * already one window of data away (tcp_cwr - * should then be cleared) or this is a - * timeout for a retransmitted segment. - */ - uint32_t npkt; - - if (!tcp->tcp_cwr || tcp->tcp_rexmit) { - npkt = ((tcp->tcp_timer_backoff ? - tcp->tcp_cwnd_ssthresh : - tcp->tcp_snxt - - tcp->tcp_suna) >> 1) / tcp->tcp_mss; - tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * - tcp->tcp_mss; - } - tcp->tcp_cwnd = tcp->tcp_mss; - tcp->tcp_cwnd_cnt = 0; - if (tcp->tcp_ecn_ok) { - tcp->tcp_cwr = B_TRUE; - tcp->tcp_cwr_snd_max = tcp->tcp_snxt; - tcp->tcp_ecn_cwr_sent = B_FALSE; - } + cc_cong_signal(tcp, NULL, CC_RTO); } break; } diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c index f4d6c71914..6348e02ae6 100644 --- a/usr/src/uts/common/inet/tcp/tcp_tunables.c +++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c @@ -22,12 +22,13 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Joyent, Inc. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. 
*/
 #include <inet/ip.h>
 #include <inet/tcp_impl.h>
+#include <inet/cc.h>
 #include <sys/multidata.h>
 #include <sys/sunddi.h>
@@ -38,6 +39,12 @@
 /* Max of the above */
 #define TCP_MSS_MAX TCP_MSS_MAX_IPV4
+/*
+ * State threaded through tcp_copy_ccname() while algorithm names are being
+ * collected: the destination buffer, its size, and a running total of the
+ * bytes the names formatted so far account for. The total is built from
+ * snprintf() return values and is compared against ccn_bufsize to detect
+ * that the list did not fit.
+ */
+typedef struct {
+	char *ccn_buf;
+	uint_t ccn_bufsize;
+	uint_t ccn_bytes;
+} tcp_copy_ccname_t;
+
 /*
  * Set the RFC 1948 pass phrase
  */
@@ -239,6 +246,65 @@ tcp_largest_anon_set(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
 	return (0);
 }
+/*
+ * Setter for the "congestion_control" property. The algorithm named by pval
+ * (or CC_DEFAULT_ALGO_NAME when MOD_PROP_DEFAULT is set in flags) is looked
+ * up with cc_load_algo() and stored as this stack's tcps_default_cc_algo.
+ * Returns EINVAL if cc_load_algo() returns NULL, 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+tcp_set_cc_algorithm(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
+    const char *ifname, const void *pval, uint_t flags)
+{
+	tcp_stack_t *tcps = stack->netstack_tcp;
+	char *name = (flags & MOD_PROP_DEFAULT) ?
+	    CC_DEFAULT_ALGO_NAME : (char *)pval;
+	struct cc_algo *algo = cc_load_algo(name);
+
+	if (algo == NULL) {
+		return (EINVAL);
+	}
+
+	tcps->tcps_default_cc_algo = algo;
+
+	return (0);
+}
+
+/*
+ * Callback passed to cc_walk_algos(): appends algo->name to the list being
+ * built in the tcp_copy_ccname_t that data points to, preceded by a comma
+ * for every name after the first. Once the buffer is full, snprintf() is
+ * given a size of 0 and ccn_bytes keeps counting. Returns ENOBUFS when the
+ * running total has reached ccn_bufsize, 0 otherwise.
+ */
+static int
+tcp_copy_ccname(void *data, struct cc_algo *algo)
+{
+	tcp_copy_ccname_t *cd = data;
+	char *sep = cd->ccn_bytes > 0 ? "," : "";
+	size_t avail = 0;
+
+	if (cd->ccn_bytes < cd->ccn_bufsize) {
+		avail = cd->ccn_bufsize - cd->ccn_bytes;
+	}
+
+	cd->ccn_bytes += snprintf(cd->ccn_buf + cd->ccn_bytes, avail,
+	    "%s%s", sep, algo->name);
+
+	return (cd->ccn_bytes >= cd->ccn_bufsize ? ENOBUFS : 0);
+}
+
+/*
+ * Getter for the "congestion_control" property. What is written to pval
+ * depends on flags:
+ *
+ *   MOD_PROP_POSSIBLE: the comma-separated names of the algorithms visited
+ *       by cc_walk_algos(); its return value is passed straight back.
+ *   MOD_PROP_PERM: MOD_PROP_PERM_RW, formatted as an unsigned decimal.
+ *   MOD_PROP_DEFAULT: CC_DEFAULT_ALGO_NAME.
+ *   otherwise: the name of the stack's current tcps_default_cc_algo.
+ *
+ * For the last three, ENOBUFS is returned when the formatted value does not
+ * fit in psize bytes, 0 otherwise.
+ */
+/* ARGSUSED */
+static int
+tcp_get_cc_algorithm(netstack_t *stack, mod_prop_info_t *pinfo,
+    const char *ifname, void *pval, uint_t psize, uint_t flags)
+{
+	size_t nbytes;
+
+	if (flags & MOD_PROP_POSSIBLE) {
+		tcp_copy_ccname_t cd = { pval, psize, 0 };
+		return (cc_walk_algos(tcp_copy_ccname, &cd));
+	} else if (flags & MOD_PROP_PERM) {
+		nbytes = snprintf(pval, psize, "%u", MOD_PROP_PERM_RW);
+	} else if (flags & MOD_PROP_DEFAULT) {
+		nbytes = snprintf(pval, psize, "%s", CC_DEFAULT_ALGO_NAME);
+	} else {
+		nbytes = snprintf(pval, psize, "%s",
+		    stack->netstack_tcp->tcps_default_cc_algo->name);
+	}
+	if (nbytes >= psize)
+		return (ENOBUFS);
+	return (0);
+}
+
 /*
  * All of these are alterable, within the min/max values given, at run time.
  *
@@ -527,6 +593,17 @@ mod_prop_info_t tcp_propinfo_tbl[] = {
 	    {1, ISS_INCR, ISS_INCR}, {ISS_INCR} },
+	{ "congestion_control", MOD_PROTO_TCP,
+	    tcp_set_cc_algorithm, tcp_get_cc_algorithm, {0}, {0} },
+
+	/* RFC 3465 - TCP Congestion Control with Appropriate Byte Counting */
+	{ "_abc", MOD_PROTO_TCP,
+	    mod_set_boolean, mod_get_boolean, {B_TRUE}, {B_TRUE} },
+
+	/* "L" value from RFC 3465 */
+	{ "_abc_l_var", MOD_PROTO_TCP,
+	    mod_set_uint32, mod_get_uint32, {1, UINT32_MAX, 2}, {2} },
+
 	{ "?", MOD_PROTO_TCP, NULL, mod_get_allprop, {0}, {0} },
 	{ NULL, 0, NULL, NULL, {0}, {0} }
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index b110a60fab..5669592cff 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -22,7 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2019 Joyent, Inc.
  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
*/ #ifndef _INET_TCP_IMPL_H @@ -562,6 +562,8 @@ extern uint32_t tcp_early_abort; #define tcps_dev_flow_ctl tcps_propinfo_tbl[58].prop_cur_bval #define tcps_reass_timeout tcps_propinfo_tbl[59].prop_cur_uval #define tcps_iss_incr tcps_propinfo_tbl[65].prop_cur_uval +#define tcps_abc tcps_propinfo_tbl[67].prop_cur_bval +#define tcps_abc_l_var tcps_propinfo_tbl[68].prop_cur_uval /* @@ -733,6 +735,7 @@ extern mblk_t *tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *, /* * Input related functions in tcp_input.c. */ +extern void cc_cong_signal(tcp_t *, uint32_t, uint32_t); extern void tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); extern void tcp_input_data(void *, mblk_t *, void *, ip_recv_attr_t *); extern void tcp_input_listener_unbound(void *, mblk_t *, void *, diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h index e46ebe08da..9bde97617f 100644 --- a/usr/src/uts/common/inet/tcp_stack.h +++ b/usr/src/uts/common/inet/tcp_stack.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017 by Delphix. All rights reserved. */ #ifndef _INET_TCP_STACK_H @@ -114,6 +115,8 @@ struct tcp_stack { kmutex_t tcps_listener_conf_lock; list_t tcps_listener_conf; + struct cc_algo *tcps_default_cc_algo; + /* * Per CPU stats * diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel index dabde6b98f..ca1cce10d9 100644 --- a/usr/src/uts/intel/Makefile.intel +++ b/usr/src/uts/intel/Makefile.intel @@ -21,6 +21,7 @@ # # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2013 Andrew Stormont. All rights reserved. +# Copyright (c) 2014 by Delphix. All rights reserved. # Copyright 2019 Joyent, Inc. # Copyright 2016 Garrett D'Amore <garrett@damore.org> # Copyright 2018 Nexenta Systems, Inc. 
@@ -174,6 +175,12 @@ $(IF_DEBUG_OBJ)clock.o := DEBUG_DEFS += -DKSLICE=1 ALL_DEFS = $(DEBUG_DEFS) $(OPTION_DEFS) # +# TCP congestion control modules (/kernel/cc) +# +MISC_KMODS += cc +CC_KMODS += cc_newreno cc_cubic cc_sunreno + +# # The kernels modules which are "implementation architecture" # specific for this machine are enumerated below. Note that most # of these modules must exist (in one form or another) for each diff --git a/usr/src/uts/intel/cc/Makefile b/usr/src/uts/intel/cc/Makefile new file mode 100644 index 0000000000..27a74f2c95 --- /dev/null +++ b/usr/src/uts/intel/cc/Makefile @@ -0,0 +1,69 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2019 Joyent, Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = cc +OBJECTS = $(CC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(CC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/cc_cubic/Makefile b/usr/src/uts/intel/cc_cubic/Makefile new file mode 100644 index 0000000000..a4edef5f46 --- /dev/null +++ b/usr/src/uts/intel/cc_cubic/Makefile @@ -0,0 +1,73 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = cc_cubic +OBJECTS = $(CC_CUBIC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(CC_CUBIC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -N misc/cc -N cc/cc_newreno + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/cc_newreno/Makefile b/usr/src/uts/intel/cc_newreno/Makefile new file mode 100644 index 0000000000..aaa47dcd05 --- /dev/null +++ b/usr/src/uts/intel/cc_newreno/Makefile @@ -0,0 +1,73 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = cc_newreno +OBJECTS = $(CC_NEWRENO_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(CC_NEWRENO_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -N misc/cc + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/cc_sunreno/Makefile b/usr/src/uts/intel/cc_sunreno/Makefile new file mode 100644 index 0000000000..90463e9268 --- /dev/null +++ b/usr/src/uts/intel/cc_sunreno/Makefile @@ -0,0 +1,73 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = cc_sunreno +OBJECTS = $(CC_SUNRENO_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(CC_SUNRENO_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -N misc/cc + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/ip/Makefile b/usr/src/uts/intel/ip/Makefile index 009a644393..9cc6e5499f 100644 --- a/usr/src/uts/intel/ip/Makefile +++ b/usr/src/uts/intel/ip/Makefile @@ -22,7 +22,7 @@ # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# Copyright (c) 2018, Joyent, Inc. +# Copyright 2019 Joyent, Inc. # # This makefile drives the production of the ip driver # kernel module. @@ -53,7 +53,7 @@ include $(UTSBASE)/intel/Makefile.intel # # Define targets # -ALL_TARGET = $(BINARY) $(SRC_CONFILE) +ALL_TARGET = $(BINARY) $(SRC_CONFFILE) LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) @@ -91,6 +91,12 @@ INC_PATH += -I$(UTSBASE)/common/io/bpf LDFLAGS += -dy -Nmisc/md5 -Ncrypto/swrand -Nmisc/hook -Nmisc/neti # +# Depends on the congestion control framework for TCP connections. +# We make several different algorithms available by default. +# +LDFLAGS += -N misc/cc -N cc/cc_sunreno -N cc/cc_newreno -N cc/cc_cubic + +# # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. # Please do not carry these forward to new Makefiles. diff --git a/usr/src/uts/sparc/Makefile.sparc b/usr/src/uts/sparc/Makefile.sparc index a37de46509..c5e3581c46 100644 --- a/usr/src/uts/sparc/Makefile.sparc +++ b/usr/src/uts/sparc/Makefile.sparc @@ -22,6 +22,7 @@ # # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2013 Andrew Stormont. All rights reserved. +# Copyright (c) 2015, 2017 by Delphix. All rights reserved. # Copyright 2019 Joyent, Inc. # Copyright 2016 Gary Mills # Copyright 2016 Nexenta Systems, Inc. @@ -203,6 +204,13 @@ $(IF_DEBUG_OBJ)clock.o := DEBUG_DEFS += -DKSLICE=1 # files. 
# ALL_DEFS = $(MACHINE_DEFS) $(DEBUG_DEFS) $(OPTION_DEFS) + +# +# TCP congestion control modules (/kernel/cc) +# +MISC_KMODS += cc +CC_KMODS += cc_newreno cc_cubic cc_sunreno + # # # The kernels modules which are "implementation architecture" diff --git a/usr/src/uts/sparc/cc/Makefile b/usr/src/uts/sparc/cc/Makefile new file mode 100644 index 0000000000..928a085458 --- /dev/null +++ b/usr/src/uts/sparc/cc/Makefile @@ -0,0 +1,69 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2019 Joyent, Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = cc +OBJECTS = $(CC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(CC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/sparc/Makefile.targ diff --git a/usr/src/uts/sparc/cc_cubic/Makefile b/usr/src/uts/sparc/cc_cubic/Makefile new file mode 100644 index 0000000000..ae7926a614 --- /dev/null +++ b/usr/src/uts/sparc/cc_cubic/Makefile @@ -0,0 +1,73 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = cc_cubic +OBJECTS = $(CC_CUBIC_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(CC_CUBIC_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -N misc/cc -N cc/cc_newreno + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/sparc/Makefile.targ diff --git a/usr/src/uts/sparc/cc_newreno/Makefile b/usr/src/uts/sparc/cc_newreno/Makefile new file mode 100644 index 0000000000..6159e48c72 --- /dev/null +++ b/usr/src/uts/sparc/cc_newreno/Makefile @@ -0,0 +1,73 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = cc_newreno +OBJECTS = $(CC_NEWRENO_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(CC_NEWRENO_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -N misc/cc + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/sparc/Makefile.targ diff --git a/usr/src/uts/sparc/cc_sunreno/Makefile b/usr/src/uts/sparc/cc_sunreno/Makefile new file mode 100644 index 0000000000..912019834f --- /dev/null +++ b/usr/src/uts/sparc/cc_sunreno/Makefile @@ -0,0 +1,73 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = cc_sunreno +OBJECTS = $(CC_SUNRENO_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(CC_SUNRENO_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_CC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -N misc/cc + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/sparc/Makefile.targ diff --git a/usr/src/uts/sparc/ip/Makefile b/usr/src/uts/sparc/ip/Makefile index a6f693aec2..4b6ac89108 100644 --- a/usr/src/uts/sparc/ip/Makefile +++ b/usr/src/uts/sparc/ip/Makefile @@ -22,6 +22,7 @@ # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# +# Copyright 2019 Joyent, Inc. # # This makefile drives the production of the ip driver # kernel module. @@ -66,6 +67,19 @@ CFLAGS += -xinline=tcp_set_ws_value INC_PATH += -I$(UTSBASE)/common/io/bpf # +# Depends on md5 and swrand (for SCTP). SCTP needs to depend on +# swrand as it needs random numbers early on during boot before +# kCF subsystem can load swrand. +# +LDFLAGS += -dy -Nmisc/md5 -Ncrypto/swrand -Nmisc/hook -Nmisc/neti + +# +# Depends on the congestion control framework for TCP connections. +# We make several different algorithms available by default. +# +LDFLAGS += -N misc/cc -N cc/cc_sunreno -N cc/cc_newreno -N cc/cc_cubic + +# # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. # Please do not carry these forward to new Makefiles. @@ -86,13 +100,6 @@ CERRWARN += $(CNOWARN_UNINIT) CERRWARN += -_gcc=-Wno-type-limits # -# Depends on md5 and swrand (for SCTP). SCTP needs to depend on -# swrand as it needs random numbers early on during boot before -# kCF subsystem can load swrand. -# -LDFLAGS += -dy -Nmisc/md5 -Ncrypto/swrand -Nmisc/hook -Nmisc/neti - -# # Default build targets. # .KEEP_STATE: |