diff options
| author | Dan McDonald <danmcd@mnx.io> | 2022-05-13 17:20:24 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-05-13 17:20:24 -0400 |
| commit | bb7d6c9b47695f41cbacbcf6662baf3d0e152fdf (patch) | |
| tree | 75f2d0cab5fb92f97f2ab2c3186a0b5d1579a33a /usr/src/uts/common/inet | |
| parent | 8ca5534c77e93c25d2c1f777499b12da0f7cc0cd (diff) | |
| parent | 402559e299331588f209b3a9693e3bcd6a83d22c (diff) | |
| download | illumos-joyent-OS-8149.tar.gz | |
Merge branch 'master' into OS-8149
Diffstat (limited to 'usr/src/uts/common/inet')
35 files changed, 426 insertions, 138 deletions
diff --git a/usr/src/uts/common/inet/cc/cc_cubic.c b/usr/src/uts/common/inet/cc/cc_cubic.c index 11c238afd8..bb26a2358d 100644 --- a/usr/src/uts/common/inet/cc/cc_cubic.c +++ b/usr/src/uts/common/inet/cc/cc_cubic.c @@ -4,6 +4,7 @@ * All rights reserved. * Copyright (c) 2017 by Delphix. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. * * This software was developed by Lawrence Stewart while studying at the Centre * for Advanced Internet Architectures, Swinburne University of Technology, made @@ -85,6 +86,7 @@ static void cubic_conn_init(struct cc_var *ccv); static void cubic_post_recovery(struct cc_var *ccv); static void cubic_record_rtt(struct cc_var *ccv); static void cubic_ssthresh_update(struct cc_var *ccv); +static void cubic_after_idle(struct cc_var *ccv); struct cubic { /* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */ @@ -115,6 +117,7 @@ struct cc_algo cubic_cc_algo = { .cong_signal = cubic_cong_signal, .conn_init = cubic_conn_init, .post_recovery = cubic_post_recovery, + .after_idle = cubic_after_idle, }; int @@ -129,7 +132,7 @@ _init(void) if ((err = mod_install(&cc_cubic_modlinkage)) != 0) (void) cc_deregister_algo(&cubic_cc_algo); } - cubic_cc_algo.after_idle = newreno_cc_algo->after_idle; + return (err); } @@ -195,19 +198,22 @@ cubic_ack_received(struct cc_var *ccv, uint16_t type) * TCP-friendly region, follow tf * cwnd growth. */ - CCV(ccv, tcp_cwnd) = w_tf; + if (CCV(ccv, tcp_cwnd) < w_tf) + CCV(ccv, tcp_cwnd) = w_tf; } else if (CCV(ccv, tcp_cwnd) < w_cubic_next) { /* * Concave or convex region, follow CUBIC * cwnd growth. 
*/ if (CC_ABC(ccv)) - CCV(ccv, tcp_cwnd) = w_cubic_next; + CCV(ccv, tcp_cwnd) = MIN(w_cubic_next, + INT_MAX); else - CCV(ccv, tcp_cwnd) += ((w_cubic_next - + CCV(ccv, tcp_cwnd) += MAX(1, + ((MIN(w_cubic_next, INT_MAX) - CCV(ccv, tcp_cwnd)) * CCV(ccv, tcp_mss)) / - CCV(ccv, tcp_cwnd); + CCV(ccv, tcp_cwnd)); } /* @@ -218,12 +224,34 @@ cubic_ack_received(struct cc_var *ccv, uint16_t type) * max_cwnd. */ if (cubic_data->num_cong_events == 0 && - cubic_data->max_cwnd < CCV(ccv, tcp_cwnd)) + cubic_data->max_cwnd < CCV(ccv, tcp_cwnd)) { cubic_data->max_cwnd = CCV(ccv, tcp_cwnd); + cubic_data->K = cubic_k(cubic_data->max_cwnd / + CCV(ccv, tcp_mss)); + } } } } +/* + * This is a Cubic specific implementation of after_idle. + * - Reset cwnd by calling New Reno implementation of after_idle. + * - Reset t_last_cong. + */ +static void +cubic_after_idle(struct cc_var *ccv) +{ + struct cubic *cubic_data; + + cubic_data = ccv->cc_data; + + cubic_data->max_cwnd = max(cubic_data->max_cwnd, CCV(ccv, tcp_cwnd)); + cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss)); + + newreno_cc_algo->after_idle(ccv); + cubic_data->t_last_cong = gethrtime(); +} + static void cubic_cb_destroy(struct cc_var *ccv) { @@ -237,7 +265,7 @@ cubic_cb_init(struct cc_var *ccv) { struct cubic *cubic_data; - cubic_data = kmem_alloc(sizeof (struct cubic), KM_NOSLEEP); + cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP); if (cubic_data == NULL) return (ENOMEM); @@ -330,6 +358,7 @@ static void cubic_post_recovery(struct cc_var *ccv) { struct cubic *cubic_data; + uint32_t mss, pipe; cubic_data = ccv->cc_data; @@ -339,11 +368,39 @@ cubic_post_recovery(struct cc_var *ccv) >> CUBIC_SHIFT; } + /* + * There is a risk that if the cwnd becomes less than mss, and + * we do not get enough acks to drive it back up beyond mss, + * we will stop transmitting data altogether. + * + * The Cubic RFC defines values in terms of units of mss. 
Therefore + * we must make sure we have at least 1 mss to make progress + * since the algorthm is written that way. + */ + mss = CCV(ccv, tcp_mss); + if (IN_FASTRECOVERY(ccv->flags)) { - /* Update cwnd based on beta and adjusted max_cwnd. */ - CCV(ccv, tcp_cwnd) = max(1, ((CUBIC_BETA * - cubic_data->max_cwnd) >> CUBIC_SHIFT)); + /* + * If inflight data is less than ssthresh, set cwnd + * conservatively to avoid a burst of data, as suggested in + * the NewReno RFC. Otherwise, use the CUBIC method. + */ + pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna); + if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) { + /* + * Ensure that cwnd does not collapse to 1 MSS under + * adverse conditions. Implements RFC6582 + */ + CCV(ccv, tcp_cwnd) = MAX(pipe, mss) + mss; + } else { + /* Update cwnd based on beta and adjusted max_cwnd. */ + CCV(ccv, tcp_cwnd) = max(mss, ((CUBIC_BETA * + cubic_data->max_cwnd) >> CUBIC_SHIFT)); + } + } else { + CCV(ccv, tcp_cwnd) = max(mss, CCV(ccv, tcp_cwnd)); } + cubic_data->t_last_cong = gethrtime(); /* Calculate the average RTT between congestion epochs. */ @@ -355,7 +412,7 @@ cubic_post_recovery(struct cc_var *ccv) cubic_data->epoch_ack_count = 0; cubic_data->sum_rtt_nsecs = 0; - cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss)); + cubic_data->K = cubic_k(cubic_data->max_cwnd / mss); } /* diff --git a/usr/src/uts/common/inet/cc/cc_cubic.h b/usr/src/uts/common/inet/cc/cc_cubic.h index c87751d257..cc6e6e459a 100644 --- a/usr/src/uts/common/inet/cc/cc_cubic.h +++ b/usr/src/uts/common/inet/cc/cc_cubic.h @@ -4,6 +4,7 @@ * All rights reserved. * Copyright (c) 2017 by Delphix. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright 2020 RackTop Systems, Inc. * * This software was developed by Lawrence Stewart while studying at the Centre * for Advanced Internet Architectures, Swinburne University of Technology, made @@ -70,6 +71,12 @@ /* Don't trust s_rtt until this many rtt samples have been taken. 
*/ #define CUBIC_MIN_RTT_SAMPLES 8 +/* + * (2^21)^3 is long max. Dividing (2^63) by Cubic_C_factor + * and taking cube-root yields 448845 as the effective useful limit + */ +#define CUBED_ROOT_MAX_ULONG 448845 + /* Userland only bits. */ #ifndef _KERNEL @@ -188,6 +195,11 @@ cubic_cwnd(hrtime_t nsecs_since_cong, uint32_t wmax, uint32_t smss, int64_t K) */ cwnd = (t - K * MILLISEC) / MILLISEC; + if (cwnd > CUBED_ROOT_MAX_ULONG) + return (INT_MAX); + if (cwnd < -CUBED_ROOT_MAX_ULONG) + return (0); + /* cwnd = (t - K)^3, with CUBIC_SHIFT^3 worth of precision. */ cwnd *= (cwnd * cwnd); @@ -199,7 +211,10 @@ cubic_cwnd(hrtime_t nsecs_since_cong, uint32_t wmax, uint32_t smss, int64_t K) */ cwnd = ((cwnd * CUBIC_C_FACTOR * smss) >> CUBIC_SHIFT_4) + wmax; - return ((uint32_t)cwnd); + /* + * for negative cwnd, limiting to zero as lower bound + */ + return (max(0, cwnd)); } /* diff --git a/usr/src/uts/common/inet/cc/cc_newreno.c b/usr/src/uts/common/inet/cc/cc_newreno.c index ceb76d8643..5cb1c32534 100644 --- a/usr/src/uts/common/inet/cc/cc_newreno.c +++ b/usr/src/uts/common/inet/cc/cc_newreno.c @@ -7,6 +7,7 @@ * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2017 by Delphix. All rights reserved. + * Copyright 2020 RackTop Systems, Inc. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, James @@ -256,12 +257,25 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type) static void newreno_post_recovery(struct cc_var *ccv) { + uint32_t pipe; + if (IN_FASTRECOVERY(ccv->flags)) { /* * Fast recovery will conclude after returning from this - * function. + * function. Window inflation should have left us with + * approximately cwnd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do it via the + * slow start mechanism. 
*/ - if (CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh)) { + pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna); + if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) { + /* + * Ensure that cwnd does not collapse to 1 MSS under + * adverse conditions. Implements RFC6582 + */ + CCV(ccv, tcp_cwnd) = MAX(pipe, CCV(ccv, tcp_mss)) + + CCV(ccv, tcp_mss); + } else if (CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh)) { CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh); } } diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 89574da71f..7687fdd29e 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -177,7 +177,7 @@ typedef struct ipoptp_s #define IPOPTP_ERROR 0x00000001 #endif /* _KERNEL */ -/* Controls forwarding of IP packets, set via ipadm(1M)/ndd(1M) */ +/* Controls forwarding of IP packets, set via ipadm(8)/ndd(8) */ #define IP_FORWARD_NEVER 0 #define IP_FORWARD_ALWAYS 1 @@ -1596,7 +1596,8 @@ struct ill_zerocopy_capab_s { struct ill_lso_capab_s { uint_t ill_lso_flags; /* capabilities */ - uint_t ill_lso_max; /* maximum size of payload */ + uint_t ill_lso_max_tcpv4; /* maximum size of payload */ + uint_t ill_lso_max_tcpv6; /* maximum size of payload */ }; /* diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index 8a05a25b08..eeec56b162 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -236,11 +237,22 @@ conn_recvancillary_size(conn_t *connp, crb_t recv_ancillary, } /* + * If IP_RECVTOS is set allocate the appropriately sized buffer + */ + if (recv_ancillary.crb_recvtos && + (ira->ira_flags & IRAF_IS_IPV4)) { + ancil_size += sizeof (struct T_opthdr) + + P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE); + IP_STAT(ipst, conn_in_recvtos); + } + + /* * If IP_RECVTTL is set allocate the appropriate sized buffer */ if (recv_ancillary.crb_recvttl && (ira->ira_flags & IRAF_IS_IPV4)) { - ancil_size += sizeof (struct T_opthdr) + sizeof (uint8_t); + ancil_size += sizeof (struct T_opthdr) + + P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE); IP_STAT(ipst, conn_in_recvttl); } @@ -550,14 +562,25 @@ conn_recvancillary_add(conn_t *connp, crb_t recv_ancillary, ancil_size -= toh->len; } - /* - * CAUTION: - * Due to aligment issues - * Processing of IP_RECVTTL option - * should always be the last. Adding - * any option processing after this will - * cause alignment panic. - */ + if (recv_ancillary.crb_recvtos && + (ira->ira_flags & IRAF_IS_IPV4)) { + struct T_opthdr *toh; + uint8_t *dstptr; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVTOS; + toh->len = sizeof (struct T_opthdr) + + P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + dstptr = (uint8_t *)ancil_buf; + *dstptr = ipp->ipp_type_of_service; + ancil_buf = (uchar_t *)toh + toh->len; + ancil_size -= toh->len; + ASSERT(__TPI_TOPT_ISALIGNED(toh)); + } + if (recv_ancillary.crb_recvttl && (ira->ira_flags & IRAF_IS_IPV4)) { struct T_opthdr *toh; @@ -566,13 +589,15 @@ conn_recvancillary_add(conn_t *connp, crb_t recv_ancillary, toh = (struct T_opthdr *)ancil_buf; toh->level = IPPROTO_IP; toh->name = IP_RECVTTL; - toh->len = sizeof (struct T_opthdr) + sizeof (uint8_t); + toh->len = sizeof (struct T_opthdr) + + P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE); toh->status = 0; ancil_buf += sizeof (struct T_opthdr); dstptr 
= (uint8_t *)ancil_buf; *dstptr = ipp->ipp_hoplimit; - ancil_buf += sizeof (uint8_t); + ancil_buf = (uchar_t *)toh + toh->len; ancil_size -= toh->len; + ASSERT(__TPI_TOPT_ISALIGNED(toh)); } /* Consumed all of allocated space */ @@ -777,6 +802,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, case IP_RECVTTL: *i1 = connp->conn_recv_ancillary.crb_recvttl; break; /* goto sizeof (int) option return */ + case IP_RECVTOS: + *i1 = connp->conn_recv_ancillary.crb_recvtos; + break; /* goto sizeof (int) option return */ case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case MCAST_JOIN_GROUP: @@ -1385,6 +1413,11 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, connp->conn_recv_ancillary.crb_recvttl = onoff; mutex_exit(&connp->conn_lock); break; + case IP_RECVTOS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvtos = onoff; + mutex_exit(&connp->conn_lock); + break; case IP_PKTINFO: { /* * This also handles IP_RECVPKTINFO. diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 925d06c62b..274bf9b2eb 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -24,7 +24,8 @@ * Copyright (c) 1990 Mentat Inc. * Copyright (c) 2017 OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. - * Copyright (c) 2019 Joyent, Inc. All rights reserved. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2021 Joyent, Inc. */ #include <sys/types.h> @@ -2844,6 +2845,20 @@ icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira) len_needed = IPH_HDR_LENGTH(ipha); if (ipha->ipha_protocol == IPPROTO_ENCAP || ipha->ipha_protocol == IPPROTO_IPV6) { + /* + * NOTE: It is posssible that the inner packet is poorly + * formed (e.g. IP version is corrupt, or v6 extension headers + * got cut off). The receiver of the ICMP message should see + * what we saw. 
In the absence of a sane inner-packet (which + * protocol types IPPPROTO_ENCAP and IPPROTO_IPV6 indicate + * would be an IP header), we should send the size of what is + * normally expected to be there (either sizeof (ipha_t) or + * sizeof (ip6_t). It may be useful for diagnostic purposes. + * + * ALSO NOTE: "inner_ip6h" is the inner packet header, v4 or v6. + */ + ip6_t *inner_ip6h = (ip6_t *)((uchar_t *)ipha + len_needed); + if (!pullupmsg(mp, -1)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); ip_drop_output("ipIfStatsOutDiscards", mp, NULL); @@ -2853,13 +2868,20 @@ icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira) ipha = (ipha_t *)mp->b_rptr; if (ipha->ipha_protocol == IPPROTO_ENCAP) { - len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha + - len_needed)); + /* + * Check the inner IP version here to guard against + * bogons. + */ + if (IPH_HDR_VERSION(inner_ip6h) == IPV4_VERSION) { + len_needed += + IPH_HDR_LENGTH(((uchar_t *)inner_ip6h)); + } else { + len_needed = sizeof (ipha_t); + } } else { - ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed); - ASSERT(ipha->ipha_protocol == IPPROTO_IPV6); - len_needed += ip_hdr_length_v6(mp, ip6h); + /* function called next-line checks inner IP version */ + len_needed += ip_hdr_length_v6(mp, inner_ip6h); } } len_needed += ipst->ips_ip_icmp_return; @@ -5791,7 +5813,7 @@ ip_net_mask(ipaddr_t addr) ipaddr_t mask = 0; uchar_t *maskp = (uchar_t *)&mask; -#if defined(__i386) || defined(__amd64) +#if defined(__x86) #define TOTALLY_BRAIN_DAMAGED_C_COMPILER #endif #ifdef TOTALLY_BRAIN_DAMAGED_C_COMPILER @@ -13923,6 +13945,7 @@ ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp) { "conn_in_recvslla", KSTAT_DATA_UINT64 }, { "conn_in_recvucred", KSTAT_DATA_UINT64 }, { "conn_in_recvttl", KSTAT_DATA_UINT64 }, + { "conn_in_recvtos", KSTAT_DATA_UINT64 }, { "conn_in_recvhopopts", KSTAT_DATA_UINT64 }, { "conn_in_recvhoplimit", KSTAT_DATA_UINT64 }, { "conn_in_recvdstopts", KSTAT_DATA_UINT64 }, diff --git 
a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 26e7be2fe8..15ca8adbaa 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -22,7 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1990 Mentat Inc. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2019 Joyent, Inc. + * Copyright 2021 Joyent, Inc. */ #include <sys/types.h> @@ -2732,14 +2732,15 @@ done: /* * Return the length of the IPv6 related headers (including extension headers) - * Returns a length even if the packet is malformed. + * If the packet is malformed, this returns the simple IPv6 header length. */ uint16_t ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h) { uint16_t hdr_len; - (void) ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, NULL); + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, NULL)) + hdr_len = sizeof (*ip6h); return (hdr_len); } diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c index 6dfbc53d77..0aa969d971 100644 --- a/usr/src/uts/common/inet/ip/ip_attr.c +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -862,7 +862,7 @@ conn_get_ixa_exclusive(conn_t *connp) ip_xmit_attr_t *oldixa; ip_xmit_attr_t *ixa; - ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP | KM_NORMALPRI); + ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP_LAZY); if (ixa == NULL) return (NULL); diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c index 408b9d0ea1..91918b671e 100644 --- a/usr/src/uts/common/inet/ip/ip_ftable.c +++ b/usr/src/uts/common/inet/ip/ip_ftable.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2021 Racktop Systems, Inc. 
*/ /* @@ -1185,7 +1186,7 @@ ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src, return (ire); } - /* Now for unicast */ + /* Now for unicast and broadcast */ if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { if (ixaflags & IXAF_SCOPEID_SET) { /* sin6_scope_id takes precedence over ixa_ifindex */ @@ -1224,7 +1225,7 @@ ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src, * we check that IP_BOUND_IF, IP_PKTINFO, etc specify * an interface that is consistent with the source address. */ - if (src_multihoming == 2 && + if (verify_src && src_multihoming == 2 && !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { if (errorp != NULL) *errorp = EADDRNOTAVAIL; diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index a2ddcb3547..2307837eb8 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -2112,7 +2112,7 @@ ill_capability_lso_enable(ill_t *ill) dld_capab_lso_t lso; int rc; - ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); + ASSERT(IAM_WRITER_ILL(ill)); if (ill->ill_lso_capab == NULL) { ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), @@ -2129,7 +2129,8 @@ ill_capability_lso_enable(ill_t *ill) if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, DLD_ENABLE)) == 0) { ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; - ill->ill_lso_capab->ill_lso_max = lso.lso_max; + ill->ill_lso_capab->ill_lso_max_tcpv4 = lso.lso_max_tcpv4; + ill->ill_lso_capab->ill_lso_max_tcpv6 = lso.lso_max_tcpv6; ill->ill_capabilities |= ILL_CAPAB_LSO; ip1dbg(("ill_capability_lso_enable: interface %s " "has enabled LSO\n ", ill->ill_name)); @@ -2194,11 +2195,10 @@ ill_capability_dld_enable(ill_t *ill) if (!ill->ill_isv6) { ill_capability_direct_enable(ill); ill_capability_poll_enable(ill); - ill_capability_lso_enable(ill); } ill_capability_ipcheck_enable(ill); - + ill_capability_lso_enable(ill); ill->ill_capabilities |= ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); } @@ 
-8656,8 +8656,8 @@ ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also * atomically set/clear the muxids. Also complete the ioctl by acking or * naking it. Note that the code is structured such that the link type, - * whether it's persistent or not, is treated equally. ifconfig(1M) and - * its clones use the persistent link, while pppd(1M) and perhaps many + * whether it's persistent or not, is treated equally. ifconfig(8) and + * its clones use the persistent link, while pppd(8) and perhaps many * other daemons may use non-persistent link. When combined with some * ill_t states, linking and unlinking lower streams may be used as * indicators of dynamic re-plumbing events [see PSARC/1999/348]. diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c index a6ca2aabd5..a0157d3c48 100644 --- a/usr/src/uts/common/inet/ip/ip_output.c +++ b/usr/src/uts/common/inet/ip/ip_output.c @@ -673,7 +673,8 @@ ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa) /* * Capability has changed, refresh the copy in ixa. */ - if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) { + if (lsoc->ill_lso_max_tcpv4 != new_lsoc->ill_lso_max_tcpv4 || + lsoc->ill_lso_max_tcpv6 != new_lsoc->ill_lso_max_tcpv6) { *lsoc = *new_lsoc; return (B_FALSE); diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 4f3ec2d817..d47997a4aa 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -22,6 +22,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Joyent, Inc. * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2022 Joyent, Inc. 
*/ /* @@ -2758,6 +2759,8 @@ conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie) vnode_t *vn = NULL; vattr_t attr; uint64_t flags = 0; + sock_upcalls_t *upcalls; + sock_upper_handle_t upper_handle; /* * If the connection is closing, it is not safe to make an upcall or @@ -2772,11 +2775,25 @@ conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie) return (NULL); } - mutex_exit(&connp->conn_lock); + /* + * Continue to hold conn_lock because we don't want to race with an + * in-progress close, which will have set-to-NULL (and destroyed + * upper_handle, aka sonode (and vnode)) BEFORE setting CONN_CLOSING. + * + * There is still a race with an in-progress OPEN, however, where + * conn_upper_handle and conn_upcalls are being assigned (in multiple + * codepaths) WITHOUT conn_lock being held. We address that race + * HERE, however, given that both are going from NULL to non-NULL, + * if we lose the race, we don't get any data for the in-progress-OPEN + * socket. + */ - if (connp->conn_upper_handle != NULL) { - vn = (*connp->conn_upcalls->su_get_vnode) - (connp->conn_upper_handle); + upcalls = connp->conn_upcalls; + upper_handle = connp->conn_upper_handle; + /* Check BOTH for non-NULL before attempting an upcall. */ + if (upper_handle != NULL && upcalls != NULL) { + /* su_get_vnode() returns one with VN_HOLD() already done. 
*/ + vn = upcalls->su_get_vnode(upper_handle); } else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL) { vn = STREAM(connp->conn_rq)->sd_pvnode; if (vn != NULL) @@ -2784,6 +2801,8 @@ conn_get_socket_info(conn_t *connp, mib2_socketInfoEntry_t *sie) flags |= MIB2_SOCKINFO_STREAM; } + mutex_exit(&connp->conn_lock); + if (vn == NULL || VOP_GETATTR(vn, &attr, 0, CRED(), NULL) != 0) { if (vn != NULL) VN_RELE(vn); diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index e0efbbf3ce..4b4e88dcf6 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -1843,6 +1843,7 @@ esp_submit_req_inbound(mblk_t *esp_mp, ip_recv_attr_t *ira, ipsec_stack_t *ipss = ns->netstack_ipsec; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + mp = NULL; do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE; do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL; force = (assoc->ipsa_flags & IPSA_F_ASYNC); @@ -2172,6 +2173,7 @@ esp_submit_req_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa, ipsa_t *assoc, esp3dbg(espstack, ("esp_submit_req_outbound:%s", is_natt ? "natt" : "not natt")); + mp = NULL; do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL; do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE; force = (assoc->ipsa_flags & IPSA_F_ASYNC); @@ -2441,6 +2443,7 @@ esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa) * Reality check.... */ ipha = (ipha_t *)data_mp->b_rptr; /* So we can call esp_acquire(). 
*/ + ip6h = (ip6_t *)ipha; if (ixa->ixa_flags & IXAF_IS_IPV4) { ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); @@ -2455,7 +2458,6 @@ esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa) ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); af = AF_INET6; - ip6h = (ip6_t *)ipha; bzero(&ipp, sizeof (ipp)); divpoint = ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, NULL); if (ipp.ipp_dstopts != NULL && diff --git a/usr/src/uts/common/inet/ip/sadb.c b/usr/src/uts/common/inet/ip/sadb.c index 288c0e3e18..14848656ed 100644 --- a/usr/src/uts/common/inet/ip/sadb.c +++ b/usr/src/uts/common/inet/ip/sadb.c @@ -1067,6 +1067,15 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg) int srcidsize, dstidsize, senslen, osenslen; sa_family_t fam, pfam; /* Address family for SADB_EXT_ADDRESS */ /* src/dst and proxy sockaddrs. */ + + authsize = 0; + encrsize = 0; + pfam = 0; + srcidsize = 0; + dstidsize = 0; + paddrsize = 0; + senslen = 0; + osenslen = 0; /* * The following are pointers into the PF_KEY message this PF_KEY * message creates. @@ -1100,6 +1109,7 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg) */ alloclen = sizeof (sadb_msg_t) + sizeof (sadb_sa_t) + sizeof (sadb_lifetime_t); + otherspi = 0; fam = ipsa->ipsa_addrfam; switch (fam) { @@ -1770,6 +1780,8 @@ sadb_addrcheck(queue_t *pfkey_q, mblk_t *mp, sadb_ext_t *ext, uint_t serial, (ext->sadb_ext_type == SADB_X_EXT_ADDRESS_NATT_LOC) || (ext->sadb_ext_type == SADB_X_EXT_ADDRESS_NATT_REM)); + diagnostic = 0; + /* Assign both sockaddrs, the compiler will do the right thing. */ sin = (struct sockaddr_in *)(addr + 1); sin6 = (struct sockaddr_in6 *)(addr + 1); @@ -3227,7 +3239,7 @@ sadb_common_add(queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, /* * An error here indicates that alg is the wrong type * (IE: not authentication) or its not in the alg tables - * created by ipsecalgs(1m), or Kcf does not like the + * created by ipsecalgs(8), or Kcf does not like the * parameters passed in with this algorithm, which is * probably a coding error! 
*/ @@ -6855,8 +6867,8 @@ ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, return (ENOENT); } /* - * Else, this is a tunnel policy configured with ifconfig(1m) - * or "negotiate transport" with ipsecconf(1m). We have an + * Else, this is a tunnel policy configured with ifconfig(8) + * or "negotiate transport" with ipsecconf(8). We have an * itp with policy set based on any match, so don't bother * changing fields in "sel". */ diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c index 85f06f3d02..0bcecd8a3a 100644 --- a/usr/src/uts/common/inet/ip/spd.c +++ b/usr/src/uts/common/inet/ip/spd.c @@ -4913,7 +4913,7 @@ ipsec_alg_fix_min_max(ipsec_alginfo_t *alg, ipsec_algtype_t alg_type, } /* - * Sanity check parameters provided by ipsecalgs(1m). Assume that + * Sanity check parameters provided by ipsecalgs(8). Assume that * the algoritm is marked as valid, there is a check at the top * of this function. If any of the checks below fail, the algorithm * entry is invalid. diff --git a/usr/src/uts/common/inet/ip_ndp.h b/usr/src/uts/common/inet/ip_ndp.h index 38ab51e2ff..c1a156067e 100644 --- a/usr/src/uts/common/inet/ip_ndp.h +++ b/usr/src/uts/common/inet/ip_ndp.h @@ -161,7 +161,7 @@ typedef struct ndp_g_s { /* * NCE_F_NONUD is used to disable IPv6 Neighbor Unreachability Detection or - * IPv4 aging and maps to the ATF_PERM flag for arp(1m) + * IPv4 aging and maps to the ATF_PERM flag for arp(8) */ #define NCE_F_NONUD 0x10 @@ -181,7 +181,7 @@ typedef struct ndp_g_s { * NCE_F_AUTHORITY is set for any address that we have authoritatitve * information for. This includes locally configured addresses as well * as statically configured arp entries that are set up using the "permanent" - * option described in arp(1m). The NCE_F_AUTHORITY asserts that we would + * option described in arp(8). 
The NCE_F_AUTHORITY asserts that we would * reject any updates for that nce's (host, link-layer-address) information */ #define NCE_F_AUTHORITY 0x800 diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index 85885f9dd9..e45e44ad08 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -26,6 +26,7 @@ /* * Copyright 2019 Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ #ifndef _INET_IP_STACK_H @@ -85,6 +86,7 @@ typedef struct ip_stat { kstat_named_t conn_in_recvslla; kstat_named_t conn_in_recvucred; kstat_named_t conn_in_recvttl; + kstat_named_t conn_in_recvtos; kstat_named_t conn_in_recvhopopts; kstat_named_t conn_in_recvhoplimit; kstat_named_t conn_in_recvdstopts; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 6c65f64240..70cff374a4 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -25,7 +25,7 @@ */ /* - * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 
*/ #ifndef _INET_IPCLASSIFIER_H @@ -186,6 +186,7 @@ typedef struct crb_s { crbb_recvslla : 1, /* IP_RECVSLLA option */ crbb_recvttl : 1, /* IP_RECVTTL option */ + crbb_recvtos : 1, /* IP_RECVTOS option */ crbb_ip_recvpktinfo : 1, /* IP*_RECVPKTINFO option */ crbb_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ crbb_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ @@ -209,6 +210,7 @@ typedef struct crb_s { #define crb_recvif crbu.crbb.crbb_recvif #define crb_recvslla crbu.crbb.crbb_recvslla #define crb_recvttl crbu.crbb.crbb_recvttl +#define crb_recvtos crbu.crbb.crbb_recvtos #define crb_ip_recvpktinfo crbu.crbb.crbb_ip_recvpktinfo #define crb_ipv6_recvhoplimit crbu.crbb.crbb_ipv6_recvhoplimit #define crb_ipv6_recvhopopts crbu.crbb.crbb_ipv6_recvhopopts diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c index 25e0b699c5..22f2d79d24 100644 --- a/usr/src/uts/common/inet/ipd/ipd.c +++ b/usr/src/uts/common/inet/ipd/ipd.c @@ -71,7 +71,7 @@ * * ipd has two different entry points, one is administrative, the other is the * data path. The administrative path is accessed by a userland component called - * ipdadm(1M). It communicates to the kernel component via ioctls to /dev/ipd. + * ipdadm(8). It communicates to the kernel component via ioctls to /dev/ipd. * If the administrative path enables a specific zone, then the data path will * become active for that zone. 
Any packet that leaves that zone's IP stack or * is going to enter it, comes through the callback specified in the hook_t(9S) diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index 9aeba33d30..b16fc9bf5f 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -731,6 +731,7 @@ ipf_hook_protocol_notify(hook_notify_cmd_t command, void *arg, hook_hint_t hint; boolean_t out; int ret = 0; + const boolean_t gz = ifs->ifs_gz_controlled; /* We currently only care about viona hooks notifications */ @@ -2438,42 +2439,6 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg) return ipf_hook6(info, 1, FI_NOCKSUM, arg); } -/* ------------------------------------------------------------------------ */ -/* Function: ipf_hookvndl3_in */ -/* Returns: int - 0 == packet ok, else problem, free packet if not done */ -/* Parameters: event(I) - pointer to event */ -/* info(I) - pointer to hook information for firewalling */ -/* */ -/* The vnd hooks are private hooks to ON. They represents a layer 2 */ -/* datapath generally used to implement virtual machines. The driver sends */ -/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ -/* them is in the upper 16 bits while the remaining bits are the */ -/* traditional packet hook flags. */ -/* */ -/* They end up calling the appropriate traditional ip hooks. 
*/ -/* ------------------------------------------------------------------------ */ -/*ARGSUSED*/ -int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) -{ - return ipf_hook4_in(token, info, arg); -} - -int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) -{ - return ipf_hook6_in(token, info, arg); -} - -/*ARGSUSED*/ -int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) -{ - return ipf_hook4_out(token, info, arg); -} - -int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) -{ - return ipf_hook6_out(token, info, arg); -} - /* Static constants used by ipf_hook_ether */ static uint8_t ipf_eth_bcast_addr[ETHERADDRL] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF @@ -2569,6 +2534,42 @@ int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg, } /* ------------------------------------------------------------------------ */ +/* Function: ipf_hookvndl3_in */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The vnd hooks are private hooks to ON. They represents a layer 2 */ +/* datapath generally used to implement virtual machines. The driver sends */ +/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ +/* them is in the upper 16 bits while the remaining bits are the */ +/* traditional packet hook flags. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. 
*/ +/* ------------------------------------------------------------------------ */ +/*ARGSUSED*/ +int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_in(token, info, arg); +} + +int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_in(token, info, arg); +} + +/*ARGSUSED*/ +int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_out(token, info, arg); +} + +int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_out(token, info, arg); +} + +/* ------------------------------------------------------------------------ */ /* Function: ipf_hookviona_{in,out} */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ diff --git a/usr/src/uts/common/inet/ipf/ip_nat.c b/usr/src/uts/common/inet/ipf/ip_nat.c index e8b115761e..96c71969a4 100644 --- a/usr/src/uts/common/inet/ipf/ip_nat.c +++ b/usr/src/uts/common/inet/ipf/ip_nat.c @@ -4747,6 +4747,8 @@ ipf_stack_t *ifs; ipnat_t *np; SPL_INT(s); + sum1 = 0; + sum2 = 0; if (ifs->ifs_fr_running <= 0) return; diff --git a/usr/src/uts/common/inet/iptun.h b/usr/src/uts/common/inet/iptun.h index 1cd74d87cc..7745c8b3d7 100644 --- a/usr/src/uts/common/inet/iptun.h +++ b/usr/src/uts/common/inet/iptun.h @@ -82,7 +82,7 @@ typedef struct iptun_kparams { #define IPTUN_KPARAM_RADDR 0x00000004 /* itk_raddr is set */ #define IPTUN_KPARAM_SECINFO 0x00000008 /* itk_secinfo is set */ #define IPTUN_KPARAM_IMPLICIT 0x00000010 /* implicitly created IP tunnel */ -#define IPTUN_KPARAM_IPSECPOL 0x00000020 /* ipsecconf(1M) policy present */ +#define IPTUN_KPARAM_IPSECPOL 0x00000020 /* ipsecconf(8) policy present */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/inet/iptun/iptun.c b/usr/src/uts/common/inet/iptun/iptun.c index ca46539f89..e67e6bd26e 100644 --- a/usr/src/uts/common/inet/iptun/iptun.c +++ 
b/usr/src/uts/common/inet/iptun/iptun.c @@ -1115,12 +1115,12 @@ iptun_setparams(iptun_t *iptun, const iptun_kparams_t *ik) if (ik->iptun_kparam_flags & IPTUN_KPARAM_SECINFO) { /* - * Set IPsec policy originating from the ifconfig(1M) command + * Set IPsec policy originating from the ifconfig(8) command * line. This is traditionally called "simple" policy because * the ipsec_req_t (iptun_kparam_secinfo) can only describe a * simple policy of "do ESP on everything" and/or "do AH on * everything" (as opposed to the rich policy that can be - * defined with ipsecconf(1M)). + * defined with ipsecconf(8)). */ if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) { /* diff --git a/usr/src/uts/common/inet/sadb.h b/usr/src/uts/common/inet/sadb.h index 30b935495f..ddb3b3902f 100644 --- a/usr/src/uts/common/inet/sadb.h +++ b/usr/src/uts/common/inet/sadb.h @@ -73,12 +73,12 @@ typedef struct ipsa_cm_mech_s { * initialize the Block Cipher, is made up of a Counter and a Salt. * The Counter is fixed at 64 bits and is incremented for each packet. * The Salt value can be any whole byte value upto 64 bits. This is - * algorithm mode specific and can be configured with ipsecalgs(1m). + * algorithm mode specific and can be configured with ipsecalgs(8). * * We only support whole byte salt lengths, this is because the salt is - * stored in an array of uint8_t's. This is enforced by ipsecalgs(1m) + * stored in an array of uint8_t's. This is enforced by ipsecalgs(8) * which configures the salt length as a number of bytes. Checks are - * made to ensure the salt length defined in ipsecalgs(1m) fits in + * made to ensure the salt length defined in ipsecalgs(8) fits in * the ipsec_nonce_t. 
* * The Salt value remains constant for the life of the SA, the Salt is diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index a1c0dbe697..e65af832eb 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -638,11 +638,11 @@ static void squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire) { mblk_t *mp; - mblk_t *head; - sqproc_t proc; + mblk_t *head; + sqproc_t proc; conn_t *connp; ill_rx_ring_t *sq_rx_ring = sqp->sq_rx_ring; - hrtime_t now; + hrtime_t now; boolean_t sq_poll_capable; ip_recv_attr_t *ira, iras; diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index d8084fb11e..3ed2b7174a 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -23,6 +23,8 @@ * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, 2017 by Delphix. All rights reserved. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2022 Oxide Computer Company */ /* Copyright (c) 1990 Mentat Inc. */ @@ -246,12 +248,13 @@ typedef struct tcp_s { tcp_accept_error : 1, /* Error during TLI accept */ tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */ tcp_cork : 1, /* tcp_cork option */ + tcp_quickack : 1, /* Send acks immediately */ tcp_tconnind_started : 1, /* conn_ind message is being sent */ tcp_lso :1, /* Lower layer is capable of LSO */ tcp_is_wnd_shrnk : 1, /* Window has shrunk */ - tcp_pad_to_bit_31 : 18; + tcp_pad_to_bit_31 : 17; uint32_t tcp_initial_pmtu; /* Initial outgoing Path MTU. 
*/ @@ -377,6 +380,7 @@ typedef struct tcp_s { int tcp_ipsec_overhead; + uint_t tcp_recvtos; /* Last received IP_RECVTOS */ uint_t tcp_recvifindex; /* Last received IPV6_RCVPKTINFO */ uint_t tcp_recvhops; /* Last received IPV6_RECVHOPLIMIT */ uint_t tcp_recvtclass; /* Last received IPV6_RECVTCLASS */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index ef4c96db1c..427a6df274 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -21,10 +21,11 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2019 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2020 Joyent, Inc. + * Copyright 2022 Oxide Computer Company */ /* Copyright (c) 1990 Mentat Inc. */ @@ -1018,10 +1019,23 @@ finish: /* If we have an upper handle (socket), release it */ if (IPCL_IS_NONSTR(connp)) { - ASSERT(connp->conn_upper_handle != NULL); - (*connp->conn_upcalls->su_closed)(connp->conn_upper_handle); + sock_upcalls_t *upcalls = connp->conn_upcalls; + sock_upper_handle_t handle = connp->conn_upper_handle; + + ASSERT(upcalls != NULL); + ASSERT(upcalls->su_closed != NULL); + ASSERT(handle != NULL); + /* + * Set these to NULL first because closed() will free upper + * structures. Acquire conn_lock because an external caller + * like conn_get_socket_info() will upcall if these are + * non-NULL. + */ + mutex_enter(&connp->conn_lock); connp->conn_upper_handle = NULL; connp->conn_upcalls = NULL; + mutex_exit(&connp->conn_lock); + upcalls->su_closed(handle); } } @@ -1435,13 +1449,26 @@ tcp_free(tcp_t *tcp) * nothing to do other than clearing the field. 
*/ if (connp->conn_upper_handle != NULL) { + sock_upcalls_t *upcalls = connp->conn_upcalls; + sock_upper_handle_t handle = connp->conn_upper_handle; + + /* + * Set these to NULL first because closed() will free upper + * structures. Acquire conn_lock because an external caller + * like conn_get_socket_info() will upcall if these are + * non-NULL. + */ + mutex_enter(&connp->conn_lock); + connp->conn_upper_handle = NULL; + connp->conn_upcalls = NULL; + mutex_exit(&connp->conn_lock); if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_closed)( - connp->conn_upper_handle); + ASSERT(upcalls != NULL); + ASSERT(upcalls->su_closed != NULL); + ASSERT(handle != NULL); + upcalls->su_closed(handle); tcp->tcp_detached = B_TRUE; } - connp->conn_upper_handle = NULL; - connp->conn_upcalls = NULL; } } @@ -2394,6 +2421,7 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) tcp->tcp_fin_wait_2_flush_interval = parent->tcp_fin_wait_2_flush_interval; + tcp->tcp_quickack = parent->tcp_quickack; tcp->tcp_ka_interval = parent->tcp_ka_interval; tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres; @@ -3332,9 +3360,11 @@ tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa) */ if (ixa->ixa_flags & IXAF_LSO_CAPAB) { ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; + uint_t lso_max = (ixa->ixa_flags & IXAF_IS_IPV4) ? + lsoc->ill_lso_max_tcpv4 : lsoc->ill_lso_max_tcpv6; - ASSERT(lsoc->ill_lso_max > 0); - tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lsoc->ill_lso_max); + ASSERT3U(lso_max, >, 0); + tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lso_max); DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, boolean_t, B_TRUE, uint32_t, tcp->tcp_lso_max); diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c index 876e7d48e6..5c2e1e1932 100644 --- a/usr/src/uts/common/inet/tcp/tcp_bind.c +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -291,7 +291,7 @@ retry: * Return the next anonymous port in the privileged port range for * bind checking. 
It starts at IPPORT_RESERVED - 1 and goes * downwards. This is the same behavior as documented in the userland - * library call rresvport(3N). + * library call rresvport(3SOCKET). * * TS note: skip multilevel ports. */ @@ -1006,11 +1006,10 @@ tcp_rg_t * tcp_rg_init(tcp_t *tcp) { tcp_rg_t *rg; - rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI); + rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP_LAZY); if (rg == NULL) return (NULL); - rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), - KM_NOSLEEP|KM_NORMALPRI); + rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), KM_NOSLEEP_LAZY); if (rg->tcprg_members == NULL) { kmem_free(rg, sizeof (tcp_rg_t)); return (NULL); @@ -1063,7 +1062,7 @@ tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp) return (EINVAL); } newmembers = kmem_zalloc(newsize * sizeof (tcp_t *), - KM_NOSLEEP|KM_NORMALPRI); + KM_NOSLEEP_LAZY); if (newmembers == NULL) { mutex_exit(&rg->tcprg_lock); return (ENOMEM); diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index 0aaad871ba..22b0019a6a 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -24,6 +24,8 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2022 Oxide Computer Company */ /* This file contains all TCP input processing functions. */ @@ -4753,6 +4755,9 @@ update_ack: tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; else tcp->tcp_rack_cur_max = cur_max; + } else if (tcp->tcp_quickack) { + /* The executable asked that we ack each packet */ + flags |= TH_ACK_NEEDED; } else if (TCP_IS_DETACHED(tcp)) { /* We don't have an ACK timer for detached TCP. 
*/ flags |= TH_ACK_NEEDED; @@ -5108,6 +5113,15 @@ tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, optlen = 0; addflag.crb_all = 0; + + /* If app asked for TOS and it has changed ... */ + if (connp->conn_recv_ancillary.crb_recvtos && + ipp->ipp_type_of_service != tcp->tcp_recvtos && + (ira->ira_flags & IRAF_IS_IPV4)) { + optlen += sizeof (struct T_opthdr) + + P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE); + addflag.crb_recvtos = 1; + } /* If app asked for pktinfo and the index has changed ... */ if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && ira->ira_ruifindex != tcp->tcp_recvifindex) { @@ -5127,8 +5141,9 @@ tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, optlen += sizeof (struct T_opthdr) + sizeof (uint_t); addflag.crb_ipv6_recvtclass = 1; } + /* - * If app asked for hopbyhop headers and it has changed ... + * If app asked for hop-by-hop headers and it has changed ... * For security labels, note that (1) security labels can't change on * a connected socket at all, (2) we're connected to at most one peer, * (3) if anything changes, then it must be some other extra option. @@ -5206,6 +5221,23 @@ tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, todi->OPT_length = optlen; todi->OPT_offset = sizeof (*todi); optptr = (uchar_t *)&todi[1]; + + /* If app asked for TOS and it has changed ... */ + if (addflag.crb_recvtos) { + toh = (struct T_opthdr *)optptr; + toh->level = IPPROTO_IP; + toh->name = IP_RECVTOS; + toh->len = sizeof (*toh) + + P2ROUNDUP(sizeof (uint8_t), __TPI_ALIGN_SIZE); + toh->status = 0; + optptr += sizeof (*toh); + *(uint8_t *)optptr = ipp->ipp_type_of_service; + optptr = (uchar_t *)toh + toh->len; + ASSERT(__TPI_TOPT_ISALIGNED(optptr)); + /* Save as "last" value */ + tcp->tcp_recvtos = ipp->ipp_type_of_service; + } + /* * If app asked for pktinfo and the index has changed ... * Note that the local address never changes for the connection. 
diff --git a/usr/src/uts/common/inet/tcp/tcp_misc.c b/usr/src/uts/common/inet/tcp/tcp_misc.c index 0896dd7611..423d3003cf 100644 --- a/usr/src/uts/common/inet/tcp/tcp_misc.c +++ b/usr/src/uts/common/inet/tcp/tcp_misc.c @@ -44,7 +44,7 @@ static boolean_t tcp_do_reclaim = B_TRUE; * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure * (defined in tcp.h) needs to be filled in and passed into the kernel - * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t + * via an I_STR ioctl command (see streamio(4I)). The tcp_ioc_abort_conn_t * structure contains the four-tuple of a TCP connection and a range of TCP * states (specified by ac_start and ac_end). The use of wildcard addresses * and ports is allowed. Connections with a matching four tuple and a state diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index ea4760e6bb..15e49ae070 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -23,6 +23,8 @@ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2019 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 
+ * Copyright 2022 Oxide Computer Company */ #include <sys/types.h> @@ -135,6 +137,8 @@ opdes_t tcp_opt_arr[] = { { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ TCP_QUICKACK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + { TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, { TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, @@ -157,6 +161,7 @@ opdes_t tcp_opt_arr[] = { { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, +{ IP_RECVTOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, @@ -448,6 +453,9 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) case TCP_CORK: *i1 = tcp->tcp_cork; return (sizeof (int)); + case TCP_QUICKACK: + *i1 = tcp->tcp_quickack; + return (sizeof (int)); case TCP_RTO_INITIAL: *i1 = tcp->tcp_rto_initial; return (sizeof (uint32_t)); @@ -626,9 +634,9 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, /* * Note: Implies T_CHECK semantics for T_OPTCOM_REQ * inlen != 0 implies value supplied and - * we have to "pretend" to set it. + * we have to "pretend" to set it. * inlen == 0 implies that there is no - * value part in T_CHECK request and just validation + * value part in T_CHECK request and just validation * done elsewhere should be enough, we just return here. 
*/ if (inlen == 0) { @@ -1021,6 +1029,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, tcp->tcp_cork = onoff; } break; + case TCP_QUICKACK: + if (!checkonly) { + tcp->tcp_quickack = onoff; + } + break; case TCP_RTO_INITIAL: if (checkonly || val == 0) break; @@ -1132,6 +1145,16 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, return (EINVAL); } break; + case IP_RECVTOS: + if (!checkonly) { + /* + * Force it to be sent up with the next msg + * by setting it to a value which cannot + * appear in a packet (TOS is only 8-bits) + */ + tcp->tcp_recvtos = 0xffffffffU; + } + break; } break; case IPPROTO_IPV6: diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c index 7a0472f3dd..086668f435 100644 --- a/usr/src/uts/common/inet/tcp/tcp_output.c +++ b/usr/src/uts/common/inet/tcp/tcp_output.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, 2017 by Delphix. All rights reserved. - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. */ /* This file contains all TCP output processing functions. */ @@ -1677,11 +1677,23 @@ finish: /* non-STREAM socket, release the upper handle */ if (IPCL_IS_NONSTR(connp)) { - ASSERT(connp->conn_upper_handle != NULL); - (*connp->conn_upcalls->su_closed) - (connp->conn_upper_handle); + sock_upcalls_t *upcalls = connp->conn_upcalls; + sock_upper_handle_t handle = connp->conn_upper_handle; + + ASSERT(upcalls != NULL); + ASSERT(upcalls->su_closed != NULL); + ASSERT(handle != NULL); + /* + * Set these to NULL first because closed() will free + * upper structures. Acquire conn_lock because an + * external caller like conn_get_socket_info() will + * upcall if these are non-NULL. 
+ */ + mutex_enter(&connp->conn_lock); connp->conn_upper_handle = NULL; connp->conn_upcalls = NULL; + mutex_exit(&connp->conn_lock); + upcalls->su_closed(handle); } } diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index 2de76ea060..32422be675 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -199,7 +199,7 @@ static int tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t len, cred_t *cr) { - int error; + int error; conn_t *connp = (conn_t *)proto_handle; /* All Solaris components should pass a cred for this operation. */ @@ -240,7 +240,7 @@ tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; tcp_t *tcp = connp->conn_tcp; - int error; + int error; ASSERT(connp->conn_upper_handle != NULL); @@ -660,7 +660,7 @@ static int tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, int mode, int32_t *rvalp, cred_t *cr) { - conn_t *connp = (conn_t *)proto_handle; + conn_t *connp = (conn_t *)proto_handle; int error; ASSERT(connp->conn_upper_handle != NULL); @@ -825,7 +825,7 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, struct stroptions *stropt; struct T_capability_ack tca; struct sockaddr_in6 laddr, faddr; - socklen_t laddrlen, faddrlen; + socklen_t laddrlen, faddrlen; short opts; int error; mblk_t *mp, *mpnext; @@ -999,7 +999,7 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, sock_quiesce_arg_t *arg) { tcp_t *tcp; - conn_t *connp = (conn_t *)proto_handle; + conn_t *connp = (conn_t *)proto_handle; int error; mblk_t *stropt_mp; mblk_t *ordrel_mp; diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index b2183405eb..4e208465f2 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -23,6 +23,7 @@ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2018, Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -2479,8 +2480,8 @@ udp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) *(uint32_t *)&sin->sin_zero[4] = 0; /* - * Add options if IP_RECVDSTADDR, IP_RECVIF, IP_RECVSLLA or - * IP_RECVTTL has been set. + * Add options if IP_RECVDSTADDR, IP_RECVIF, IP_RECVSLLA, + * IP_RECVTTL or IP_RECVTOS has been set. */ if (udi_size != 0) { conn_recvancillary_add(connp, recv_ancillary, ira, @@ -6566,7 +6567,7 @@ udp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, */ error = ip_create_helper_stream(connp, us->us_ldi_ident); if (error != 0) { - ip0dbg(("tcp_ioctl: create of IP helper stream " + ip0dbg(("udp_ioctl: create of IP helper stream " "failed %d\n", error)); return (error); } diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index 847e2cdde6..9c05b8c876 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -22,6 +22,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2015, Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ #include <sys/types.h> @@ -112,8 +113,8 @@ opdes_t udp_opt_arr[] = { }, { IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_RECVTTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), - 0 }, +{ IP_RECVTTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_RECVTOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (struct in_addr), 0 /* INADDR_ANY */ }, |
