diff options
author | Ryan Zezeski <rpz@joyent.com> | 2020-05-04 17:50:44 +0000 |
---|---|---|
committer | Patrick Mooney <pmooney@pfmooney.com> | 2020-05-18 18:37:51 +0000 |
commit | c61a1653a4d73dbc950dac7d96350fd6cb517486 (patch) | |
tree | a3050405d36b98afd4e056de8c295d7d47d3e6df | |
parent | f13f199891d2a0440db0361743dd73527f565e89 (diff) | |
download | illumos-joyent-c61a1653a4d73dbc950dac7d96350fd6cb517486.tar.gz |
12676 want better offloads for vnics
12677 simnet has bogus mi_tx_cksum_flags
12678 mac_tx() is too eager to emulate hardware offloads
Portions contributed by: Patrick Mooney <patrick.mooney@joyent.com>
Portions contributed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Patrick Mooney <pmooney@oxide.computer>
Reviewed by: Andy Fiddaman <andy@omniosce.org>
Approved by: Dan McDonald <danmcd@joyent.com>
64 files changed, 4619 insertions, 582 deletions
diff --git a/usr/src/pkg/manifests/system-test-nettest.mf b/usr/src/pkg/manifests/system-test-nettest.mf new file mode 100644 index 0000000000..b313b0cc1c --- /dev/null +++ b/usr/src/pkg/manifests/system-test-nettest.mf @@ -0,0 +1,57 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020 Oxide Computer Company +# + +set name=pkg.fmri value=pkg:/system/test/nettest@$(PKGVERS) +set name=pkg.description value="Miscellaneous Network Unit Tests" +set name=pkg.summary value="Network Unit Test Suite" +set name=info.classification \ + value=org.opensolaris.category.2008:Development/System +set name=variant.arch value=$(ARCH) +dir path=opt/net-tests +dir path=opt/net-tests/bin +dir path=opt/net-tests/config +dir path=opt/net-tests/runfiles +dir path=opt/net-tests/tests +dir path=opt/net-tests/tests/forwarding +file path=opt/net-tests/bin/nettest mode=0555 +file path=opt/net-tests/config/ip_forwarding.config mode=0644 \ + preserve=renamenew +file path=opt/net-tests/runfiles/default.run mode=0444 +file path=opt/net-tests/tests/forwarding/README mode=0444 +file path=opt/net-tests/tests/forwarding/ip_forwarding mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_001 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_002 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_003 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_004 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_005 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_006 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_007 mode=0555 +file 
path=opt/net-tests/tests/forwarding/ip_fwd_008 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_009 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_010 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_011 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_012 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_013 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_014 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_015 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_016 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_017 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_018 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_019 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_020 mode=0555 +file path=opt/net-tests/tests/forwarding/ip_fwd_suite mode=0555 +file path=opt/net-tests/tests/net_common mode=0555 +license lic_CDDL license=lic_CDDL +depend fmri=system/test/testrunner type=require diff --git a/usr/src/test/Makefile b/usr/src/test/Makefile index fa57d36772..9756f02ef7 100644 --- a/usr/src/test/Makefile +++ b/usr/src/test/Makefile @@ -12,6 +12,7 @@ # # Copyright (c) 2012 by Delphix. All rights reserved. # Copyright 2014 Garrett D'Amore <garrett@damore.org> +# Copyright 2019 Joyent, Inc. # .PARALLEL: $(SUBDIRS) @@ -20,6 +21,7 @@ SUBDIRS = \ crypto-tests \ elf-tests \ libc-tests \ + net-tests \ os-tests \ smbclient-tests \ test-runner \ diff --git a/usr/src/test/net-tests/Makefile b/usr/src/test/net-tests/Makefile new file mode 100644 index 0000000000..6536e70c59 --- /dev/null +++ b/usr/src/test/net-tests/Makefile @@ -0,0 +1,20 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019, Joyent Inc. +# + +.PARALLEL: $(SUBDIRS) + +SUBDIRS = cmd config runfiles tests + +include $(SRC)/test/Makefile.com diff --git a/usr/src/test/net-tests/cmd/Makefile b/usr/src/test/net-tests/cmd/Makefile new file mode 100644 index 0000000000..b2770c84c6 --- /dev/null +++ b/usr/src/test/net-tests/cmd/Makefile @@ -0,0 +1,36 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/net-tests +ROOTBIN = $(ROOTOPTPKG)/bin +PROGS = nettest +CMDS = $(PROGS:%=$(ROOTBIN)/%) +$(CMDS) := FILEMODE = 0555 + +include $(SRC)/test/Makefile.com + +install: $(CMDS) + +clobber: clean + $(RM) $(CMDS) + +$(CMDS): $(ROOTBIN) + +$(ROOTBIN): + $(INS.dir) + +$(ROOTBIN)/%: %.ksh + $(INS.rename) diff --git a/usr/src/test/net-tests/cmd/nettest.ksh b/usr/src/test/net-tests/cmd/nettest.ksh new file mode 100644 index 0000000000..e7d0e78865 --- /dev/null +++ b/usr/src/test/net-tests/cmd/nettest.ksh @@ -0,0 +1,52 @@ +#!/usr/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. 
+# + +export NET_TESTS="/opt/net-tests" +runner="/opt/test-runner/bin/run" + +function fail +{ + echo $1 >&2 + exit ${2:-1} +} + +function find_runfile +{ + typeset distro= + if [[ -f $NET_TESTS/runfiles/default.run ]]; then + distro=default + fi + + [[ -n $distro ]] && echo $NET_TESTS/runfiles/$distro.run +} + +while getopts c: c; do + case $c in + 'c') + runfile=$OPTARG + [[ -f $runfile ]] || fail "Cannot read file: $runfile" + ;; + esac +done +shift $((OPTIND - 1)) + +[[ -z $runfile ]] && runfile=$(find_runfile) +[[ -z $runfile ]] && fail "Couldn't determine distro" + +$runner -c $runfile + +exit $? diff --git a/usr/src/test/net-tests/config/Makefile b/usr/src/test/net-tests/config/Makefile new file mode 100644 index 0000000000..7151577083 --- /dev/null +++ b/usr/src/test/net-tests/config/Makefile @@ -0,0 +1,38 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. 
+# + +include $(SRC)/Makefile.master + +CFGS = ip_forwarding.config +ROOTOPTPKG = $(ROOT)/opt/net-tests +ROOTOPTPKGCFG = $(ROOT)/opt/net-tests/config +ROOTOPTPKGDIRS = $(ROOTOPTPKG) $(ROOTOPTPKGCFG) +FILES = $(CFGS:%=$(ROOTOPTPKGCFG)/%) +$(FILES) := FILEMODE = 0644 + +include $(SRC)/test/Makefile.com + +all: $(CFGS) + +install: $(ROOTOPTPKG) $(ROOTOPTPKGCFG) $(FILES) + +clobber: clean + $(RM) $(FILES) + +$(ROOTOPTPKGDIRS): + $(INS.dir) + +$(ROOTOPTPKGCFG)/%: % $(ROOTOPTPKGDIRS) + $(INS.file) diff --git a/usr/src/test/net-tests/config/ip_forwarding.config b/usr/src/test/net-tests/config/ip_forwarding.config new file mode 100644 index 0000000000..4a839cd49d --- /dev/null +++ b/usr/src/test/net-tests/config/ip_forwarding.config @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +# +# See the tests/forwarding/README file for information about how to +# configure and run the tests. +# +export NT_CLIENT=client_zone_name +export NT_ROUTER=router_zone_name +export NT_SERVER=server_zone_name diff --git a/usr/src/test/net-tests/runfiles/Makefile b/usr/src/test/net-tests/runfiles/Makefile new file mode 100644 index 0000000000..d50a8deebf --- /dev/null +++ b/usr/src/test/net-tests/runfiles/Makefile @@ -0,0 +1,38 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# +include $(SRC)/Makefile.master + +SRCS = default.run +ROOTOPTPKG = $(ROOT)/opt/net-tests +RUNFILES = $(ROOTOPTPKG)/runfiles +CMDS = $(SRCS:%=$(RUNFILES)/%) +$(CMDS) := FILEMODE = 0444 + +include $(SRC)/test/Makefile.com + +all: $(SRCS) + +install: $(CMDS) + +clobber: clean + $(RM) $(CMDS) + +$(CMDS): $(RUNFILES) $(SRCS) + +$(RUNFILES): + $(INS.dir) + +$(RUNFILES)/%: % + $(INS.file) diff --git a/usr/src/test/net-tests/runfiles/default.run b/usr/src/test/net-tests/runfiles/default.run new file mode 100644 index 0000000000..cfc1a3df8d --- /dev/null +++ b/usr/src/test/net-tests/runfiles/default.run @@ -0,0 +1,44 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. 
+# + +[DEFAULT] +outputdir = /var/tmp/test_results +quiet = False +timeout = 300 + +[/opt/net-tests/tests/forwarding] +tests = [ + 'ip_fwd_001', + 'ip_fwd_002', + 'ip_fwd_003', + 'ip_fwd_004', + 'ip_fwd_005', + 'ip_fwd_006', + 'ip_fwd_007', + 'ip_fwd_008', + 'ip_fwd_009', + 'ip_fwd_010', + 'ip_fwd_011', + 'ip_fwd_012', + 'ip_fwd_013', + 'ip_fwd_014', + 'ip_fwd_015', + 'ip_fwd_016', + 'ip_fwd_017', + 'ip_fwd_018', + 'ip_fwd_019', + 'ip_fwd_020' + ] +user = root diff --git a/usr/src/test/net-tests/tests/Makefile b/usr/src/test/net-tests/tests/Makefile new file mode 100644 index 0000000000..2712d62751 --- /dev/null +++ b/usr/src/test/net-tests/tests/Makefile @@ -0,0 +1,42 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# +include $(SRC)/Makefile.master +include $(SRC)/cmd/Makefile.cmd + +SUBDIRS = forwarding +SCRIPTS = net_common +ROOTOPTPKG = $(ROOT)/opt/net-tests +TESTDIR = $(ROOTOPTPKG)/tests +CMDS = $(SCRIPTS:%=$(TESTDIR)/%) +FILEMODE=0444 +$(CMDS) := FILEMODE = 0555 + +include $(SRC)/test/Makefile.com + +install: $(CMDS) + +clobber: clean + $(RM) $(CMDS) + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) + +$(TESTDIR)/%: %.ksh + $(INS.rename) diff --git a/usr/src/test/net-tests/tests/forwarding/Makefile b/usr/src/test/net-tests/tests/forwarding/Makefile new file mode 100644 index 0000000000..566db8c86d --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/Makefile @@ -0,0 +1,67 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# +include $(SRC)/Makefile.master +include $(SRC)/cmd/Makefile.cmd + +ROOTOPTPKG = $(ROOT)/opt/net-tests +TESTDIR = $(ROOTOPTPKG)/tests/forwarding + +PROG = \ + ip_forwarding \ + ip_fwd_suite \ + ip_fwd_001 \ + ip_fwd_002 \ + ip_fwd_003 \ + ip_fwd_004 \ + ip_fwd_005 \ + ip_fwd_006 \ + ip_fwd_007 \ + ip_fwd_008 \ + ip_fwd_009 \ + ip_fwd_010 \ + ip_fwd_011 \ + ip_fwd_012 \ + ip_fwd_013 \ + ip_fwd_014 \ + ip_fwd_015 \ + ip_fwd_016 \ + ip_fwd_017 \ + ip_fwd_018 \ + ip_fwd_019 \ + ip_fwd_020 + +DOC = $(TESTDIR)/README + +CMDS = $(PROG:%=$(TESTDIR)/%) +FILEMODE=0444 +$(CMDS) := FILEMODE = 0555 + +include $(SRC)/test/Makefile.com + +install: $(CMDS) $(DOC) + +clobber: clean + $(RM) $(CMDS) $(DOC) + +$(CMDS) $(DOC): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) + +$(TESTDIR)/%: %.ksh + $(INS.rename) diff --git a/usr/src/test/net-tests/tests/forwarding/README b/usr/src/test/net-tests/tests/forwarding/README new file mode 100644 index 0000000000..dbe8774a22 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/README @@ -0,0 +1,177 @@ +Running +------- + +* Create three native zones and start them. + +* Edit config/ip_forwarding.config, entering the names of the zones + you created. + +* Run /opt/net-tests/bin/nettest. + +Overview +-------- + +The tests in this directory test the IP forwarding path under several +different variations. All tests require three zones. The tests use +these three zones, along with the simnet driver, to emulate a real IP +forwarding scenario involving multiple hosts. All tests verify that +TCP, UDP, ICMP, IPv4/IPv6, and fragmented IPv4/IPv6 traffic can cross +the IP forwarding datapath. 
Each test differs in its emulation of +various hardware offload features (which would typically be presented +by real NICs). The diagrams below give a visual representation of the +situations we are testing and show how the test components relate to +each other. + +no mac-loopback +--------------- + +In this configuration we make sure that the packet travels from server +to router via "the wire". + + +----------------------------+ ++----------------------------+ |router zone | +|client zone | | +-------------------------+| +|(ipft_client_nic0) | | |ipft_router_nic0 || +| +----------------------+ | | |+----------------------+ || +| |ipft_client0 | | | ||ipft_client_r0 | || +| |192.168.77.2 |<-+-- Wire --+->|192.168.77.1 | || +| |fd00:0:1:4d::2 | | | ||fd00:0:1:4d::1 | || +| +----------------------+ | | |+----------------------+ || ++----------------------------+ | +-------------------------+| + | ^ | + | | | + | | | + | | | + | | | + | IP | | + | forwarding | | + | | | + | | | + | | | ++----------------------------+ | v | +|server zone | |+-------------------------+ | +|(ipft_server_nic0) | ||ipft_router_nic1 | | +| +----------------------+ | || +----------------------+| | +| |ipft_server0 | | || |ipft_server_r0 || | +| |VLAN 5 | | Wire || |VLAN 5 || | +| |192.168.88.2 |<-+----------++>|192.168.88.1 || | +| |fd00:0:1:58::2 | | || |fd00:0:1:58::1 || | +| +----------------------+ | || +----------------------+| | ++----------------------------+ |+-------------------------+ | + +----------------------------+ + +mac-loopback +------------ + +In this configuration we make sure that the packet travels from server +to router via mac-loopback.
+ + +----------------------------+ ++----------------------------+ |router zone | +|client zone | | +-------------------------+| +|(ipft_nic0) | | |ipft_nic1 || +| +----------------------+ | | |+----------------------+ || +| |ipft_client0 | | | ||ipft_client_r0 | || +| |192.168.77.2 |<-+-- Wire --+->|192.168.77.1 | || +| |fd00:0:1:4d::2 | | | ||fd00:0:1:4d::1 | || +| +----------------------+ | | |+----------------------+ || ++----------------------------+ | +-------------------------+| + | ^ | + | | | + | | | + | | | + | | | + | IP | | + | forwarding | | + | | | + | | | + | | | ++----------------------------+ | v | +|server zone | |+-------------------------+ | +|(ipft_nic1) | ||ipft_nic1 | | +| +----------------------+ | || +----------------------+| | +| |ipft_server0 | | MAC || |ipft_server_r0 || | +| |VLAN 5 | | loopback || |VLAN 5 || | +| |192.168.88.2 |<-+----------++>|192.168.88.1 || | +| |fd00:0:1:58::2 | | || |fd00:0:1:58::1 || | +| +----------------------+ | || +----------------------+| | ++----------------------------+ |+-------------------------+ | + +----------------------------+ + +Requirements +------------ + +* The client and server zones must provide `/usr/bin/socat`. It would + be nice to use netcat but our native version is missing features + like connection timeout. + +* The user must both create and start the three required zones. + +* All three zones should be native zones. + +* You must edit the ip_forwarding.config file, providing it with the + names of the zones you have created. + +Files +----- + +ip_forwarding + + The main test script; it provides the logic for all the tests + below. The different test variations are controlled by options + and it takes the three zones as arguments. This script may be + run by hand but it's easier to use ip_fwd_suite for that + purpose. + +ip_fwd_suite + + This script runs the various configurations of the IP + forwarding test suite. You can run the entire suite or just a + single test via the '-n' option.
The "Test Matrix" section + below gives an overview of all the tests in the suite. + +ip_fwd_XXX + + These scripts are mostly here to work around the fact that the + test-runner cannot pass arguments to individual tests. In + order to avoid running everything as the "ip_fwd_suite" test, + we create a file for each configuration. This gives individual + reporting of each test and steers us clear of tripping the + timeout. You can also run these scripts by hand like so: + + NET_TESTS=/opt/net-tests /opt/net-tests/tests/forwarding/ip_fwd_001 + +config/ip_forwarding.config + + This file must be modified to contain the names of the zones + the user created for running these tests. + +Test Matrix +----------- + +This is a breakdown of all the tests in the IP forwarding test suite. +If a given offload is enabled or disabled, it is done so for all +interfaces involved in the test. + +NAME Tx IP Tx ULP LSO Rx IP mac-loopback +001 off none off off no +002 on partial off off no +003 on partial on off no +004 on fullv4 off off no +005 on fullv4 on off no +006 off none off on no +007 on partial off on no +008 on partial on on no +009 on fullv4 off on no +010 on fullv4 on on no + +011 off none off off yes +012 on partial off off yes +013 on partial on off yes +014 on fullv4 off off yes +015 on fullv4 on off yes +016 off none off on yes +017 on partial off on yes +018 on partial on on yes +019 on fullv4 off on yes +020 on fullv4 on on yes diff --git a/usr/src/test/net-tests/tests/forwarding/ip_forwarding.ksh b/usr/src/test/net-tests/tests/forwarding/ip_forwarding.ksh new file mode 100644 index 0000000000..bf7a2255af --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_forwarding.ksh @@ -0,0 +1,496 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL.
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +# +# Usage: +# +# ip_forwarding.ksh -bcflnpruv <client> <router> <server> +# +# Where client, router, and server are the names of three native +# zones. The user must create and start these zones; but other +# than that there is no special configuration required for them. +# +# -b Place server and router on same underlying simnet, causing +# them to talk via MAC-loopback. +# +# -c Run cleanup only. +# +# -f Enable full Tx ULP hardware checksum. +# +# -l Enable TCP LSO. +# +# -n No cleanup: the various artifacts created by this script will +# remain after execution. +# +# -p Enable partial Tx ULP hardware checksum. +# +# -r Enable Rx IPv4 header checksum offload. +# +# -u Run UDP tests. +# +# -v Verbose mode. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +.
$NET_TESTS/tests/net_common + +function cleanup +{ + if ((nt_cleanup == 0)); then + dbg "skipping cleanup" + return 0 + fi + + rm -rf ${nt_tdirprefix}* + zlogin $nt_client rm -rf ${nt_tdirprefix}* + zlogin $nt_server rm -rf ${nt_tdirprefix}* + + rm_route $nt_client $nt_server_ip $nt_server_subnet $nt_client_router_ip + rm_route $nt_server $nt_client_ip $nt_client_subnet $nt_server_router_ip + rm_route6 $nt_client $nt_server_ip6 $nt_server_subnet6 \ + $nt_client_router_ip6 + rm_route6 $nt_server $nt_client_ip6 $nt_client_subnet6 \ + $nt_server_router_ip6 + + ip_fwd_disable $nt_router + + delete_addr $nt_client ipft_client0 v4 + delete_addr $nt_router ipft_client_r0 v4 + delete_addr $nt_router ipft_server_r0 v4 + delete_addr $nt_server ipft_server0 v4 + + delete_addr $nt_client ipft_client0 v6 + delete_addr $nt_router ipft_client_r0 v6 + delete_addr $nt_router ipft_server_r0 v6 + delete_addr $nt_server ipft_server0 v6 + + delete_if $nt_client ipft_client0 + delete_if $nt_router ipft_client_r0 + delete_if $nt_router ipft_server_r0 + delete_if $nt_server ipft_server0 + + delete_vnic ipft_client0 0 $nt_client + delete_vnic ipft_client_r0 0 $nt_router + delete_vnic ipft_server_r0 5 $nt_router + delete_vnic ipft_server0 5 $nt_server + + for nt_name in ${nt_nics[@]}; do + delete_simnet $nt_name + done +} + +function usage +{ + echo "$nt_tname -bcflnpruv <client> <router> <server>" >&2 +} + +# +# Set test defaults. 
+# +nt_tname=${NT_TNAME:-$(basename $0)} +nt_loopback=0 +nt_ulp_full=0 +nt_ulp_partial=0 +nt_tcp_lso=0 +nt_udp=0 +nt_rx_ip_cksum=0 +nt_cleanup=1 +nt_cleanup_only=0 + +nt_tdirprefix=/var/tmp/${nt_tname} +nt_tdir=${nt_tdirprefix}.$$ +nt_dfile=${nt_tdir}/${nt_tname}.data +nt_efile=${nt_tdir}/${nt_tname}-expected-sha1 +nt_rfile=${nt_tdir}/${nt_tname}-received-sha1 +nt_ofile=${nt_tdir}/${nt_tname}-received +nt_client_subnet=192.168.77.0/24 +nt_client_ip=192.168.77.2 +nt_client_router_ip=192.168.77.1 +nt_server_subnet=192.168.88.0/24 +nt_server_ip=192.168.88.2 +nt_server_router_ip=192.168.88.1 +nt_port=7774 +nt_client_subnet6=fd00:0:1:4d::/64 +nt_client_ip6=fd00:0:1:4d::2 +nt_client_router_ip6=fd00:0:1:4d::1 +nt_server_subnet6=fd00:0:1:58::/64 +nt_server_router_ip6=fd00:0:1:58::1 +nt_server_ip6=fd00:0:1:58::2 +nt_port6=7776 +nt_bridge=ipft_switch +typeset -A nt_nics + +while getopts "bcflnpruv" opt; do + case $opt in + b) + nt_loopback=1 + ;; + c) + nt_cleanup_only=1 + ;; + f) + nt_ulp_full=1 + ;; + l) + nt_tcp_lso=1 + ;; + n) + nt_cleanup=0 + ;; + p) + nt_ulp_partial=1 + ;; + r) + nt_rx_ip_cksum=1 + ;; + u) + nt_udp=1 + ;; + v) + DEBUG=1 + ;; + esac +done + +shift $((OPTIND - 1)) + +if ((nt_ulp_partial == 1)) && ((nt_ulp_full == 1)); then + fail "both partial and full checksum enabled" +fi + +if (( $# != 3 )); then + usage + fail "wrong number of arguments" +fi + +nt_client=$1 +nt_router=$2 +nt_server=$3 + +if [[ "$nt_client" == "$nt_router" || "$nt_router" == "$nt_server" || + "$nt_client" == "$nt_server" ]]; then + fail "all zones must be unique" +fi + +dbg "client zone: $nt_client" +dbg "router zone: $nt_router" +dbg "server zone: $nt_server" + +BAIL=1 +zone_exists $nt_client || fail "zone $nt_client not found" +zone_exists $nt_router || fail "zone $nt_router not found" +zone_exists $nt_server || fail "zone $nt_server not found" + +zone_running $nt_client +zone_running $nt_router +zone_running $nt_server + +if !
zlogin $nt_client ls /usr/bin/socat > /dev/null; then + fail "zone $nt_client missing socat" +fi + +if ! zlogin $nt_server ls /usr/bin/socat > /dev/null; then + fail "zone $nt_server missing socat" +fi + +if ((nt_loopback == 0)); then + nt_nics[0]=ipft_client_nic0 + nt_nics[1]=ipft_router_nic0 + nt_nics[2]=ipft_router_nic1 + nt_nics[3]=ipft_server_nic0 +else + nt_nics[0]=ipft_nic0 + nt_nics[1]=ipft_nic1 +fi + +# +# Make a best effort to cleanup artifacts from a previous run. +# +if ((nt_cleanup_only == 1)); then + dbg "performing cleanup only" + BAIL=0 + cleanup + BAIL=1 + exit 0 +fi + +if ! mkdir $nt_tdir; then + fail "failed to mkdir $nt_tdir in GZ" +fi +dbg "created dir $nt_tdir in GZ" +if ! zlogin $nt_client mkdir $nt_tdir; then + fail "failed to mkdir $nt_tdir in $nt_client" +fi +dbg "created dir $nt_tdir in $nt_client" +if ! zlogin $nt_server mkdir $nt_tdir; then + fail "failed to mkdir $nt_tdir in $nt_server" +fi +dbg "created dir $nt_tdir in $nt_server" + +trap cleanup ERR + +for nt_name in ${nt_nics[@]}; do + create_simnet $nt_name +done + +if ((nt_loopback == 0)); then + link_simnets ${nt_nics[0]} ${nt_nics[1]} + link_simnets ${nt_nics[2]} ${nt_nics[3]} +else + link_simnets ${nt_nics[0]} ${nt_nics[1]} +fi + +for nt_name in ${nt_nics[@]}; do + if ((nt_ulp_partial == 1)); then + set_linkprop $nt_name _tx_ulp_cksum partial + fi + + if ((nt_ulp_full == 1)); then + set_linkprop $nt_name _tx_ulp_cksum fullv4 + fi + + if ((nt_ulp_full == 1)) || ((nt_ulp_partial == 1)); then + set_linkprop $nt_name _tx_ipv4_cksum on + fi + + if ((nt_tcp_lso == 1)); then + set_linkprop $nt_name _lso on + fi + + if ((nt_rx_ip_cksum == 1)); then + set_linkprop $nt_name _rx_ipv4_cksum on + fi +done + +if ((nt_loopback == 0)); then + create_vnic ipft_client0 ipft_client_nic0 0 $nt_client + create_vnic ipft_client_r0 ipft_router_nic0 0 $nt_router + create_vnic ipft_server_r0 ipft_router_nic1 5 $nt_router + create_vnic ipft_server0 ipft_server_nic0 5 $nt_server +else + create_vnic
ipft_client0 ipft_nic0 0 $nt_client + create_vnic ipft_client_r0 ipft_nic1 0 $nt_router + create_vnic ipft_server_r0 ipft_nic1 5 $nt_router + create_vnic ipft_server0 ipft_nic1 5 $nt_server +fi + +ip_fwd_enable $nt_router + +create_addr $nt_client ipft_client0 $nt_client_ip/24 +create_addr $nt_router ipft_client_r0 $nt_client_router_ip/24 +create_addr $nt_router ipft_server_r0 $nt_server_router_ip/24 +create_addr $nt_server ipft_server0 $nt_server_ip/24 + +add_route $nt_client $nt_server_ip $nt_server_subnet $nt_client_router_ip +add_route $nt_server $nt_client_ip $nt_client_subnet $nt_server_router_ip + +create_addr6 $nt_client ipft_client0 $nt_client_ip6 +create_addr6 $nt_router ipft_client_r0 $nt_client_router_ip6 +create_addr6 $nt_router ipft_server_r0 $nt_server_router_ip6 +create_addr6 $nt_server ipft_server0 $nt_server_ip6 + +add_route6 $nt_client $nt_server_ip6 $nt_server_subnet6 $nt_client_router_ip6 +add_route6 $nt_server $nt_client_ip6 $nt_client_subnet6 $nt_server_router_ip6 + +dd if=/dev/urandom of=$nt_dfile bs=1024 count=1024 > /dev/null 2>&1 +if (($? != 0)); then + fail "failed to create data file: $nt_dfile" +else + dbg "created data file: $nt_dfile" +fi + +digest -a sha1 $nt_dfile > $nt_efile + +# ================================================================ +# client -> server +# ================================================================ +ping $nt_client $nt_client_ip $nt_server_ip +ping $nt_client $nt_client_ip6 $nt_server_ip6 + +start_server $nt_server TCP4 $nt_server_ip $nt_port $nt_ofile +nt_listener_ppid=$! + +# Give the server time to start. +sleep 1 + +dbg "sending 1M $nt_client ($nt_client_ip) -> $nt_server ($nt_server_ip)" +zlogin $nt_client /usr/bin/socat -b 8192 STDIN \ + TCP4:$nt_server_ip:$nt_port,connect-timeout=5 < $nt_dfile + +if (($? 
!= 0)); then + pkill -TERM -P $nt_listener_ppid + fail "failed to run socat client" +else + dbg "sent 1M $nt_client ($nt_client_ip) -> $nt_server ($nt_server_ip)" +fi + +# +# The client may have exited but make sure to give the server time to +# exit and finish computing the SHA1. +# +dbg "waiting for listener $nt_listener_ppid" +wait_for_pid $nt_listener_ppid 5 +dbg "listener $nt_listener_ppid exited" + +digest -a sha1 /zones/$nt_server/root/$nt_ofile > $nt_rfile + +if ! diff $nt_efile $nt_rfile; then + fail "SHA1 comparison failed" +else + dbg "SHA1 comparison passed" +fi + +start_server $nt_server TCP6 $nt_server_ip6 $nt_port6 $nt_ofile +nt_listener_ppid=$! + +# Give the server time to start. +sleep 1 + +zlogin $nt_client /usr/bin/socat -b 8192 STDIN \ + TCP6:[${nt_server_ip6}]:$nt_port6,connect-timeout=5 < $nt_dfile + +if (($? != 0)); then + pkill -TERM -P $nt_listener_ppid + fail "failed to run socat client IPv6" +else + dbg "sent 1M $nt_client ($nt_client_ip6)" \ + "-> $nt_server ($nt_server_ip6) IPv6" +fi + +# +# The client may have exited but make sure to give the server time to +# exit and finish computing the SHA1. +# +dbg "waiting for listener $nt_listener_ppid" +wait_for_pid $nt_listener_ppid 5 +dbg "listener $nt_listener_ppid exited" + +digest -a sha1 /zones/$nt_server/root/$nt_ofile > $nt_rfile + +if ! diff $nt_efile $nt_rfile; then + fail "SHA1 comparison failed" +else + dbg "SHA1 comparison passed" +fi + +if ((nt_udp == 1)); then + ping_udp $nt_client $nt_client_ip $nt_server_ip 256 3 + ping_udp $nt_client $nt_client_ip6 $nt_server_ip6 256 3 + + # + # Test IP fragmentation by sending a larger-than-MTU datagram. + # You can verify fragmentation is happening by dtracing the + # various "Frag" and "Reasm" mibs.
+ # + dbg "test IP fragmentation $nt_client_ip -> $nt_server_ip" + ping_udp $nt_client $nt_client_ip $nt_server_ip $((1024 * 16)) 3 + + dbg "test IPv6 fragmentation $nt_client_ip6 -> $nt_server_ip6" + ping_udp $nt_client $nt_client_ip6 $nt_server_ip6 $((1024 * 16)) 3 +fi + +# ================================================================ +# server -> client +# ================================================================ +ping $nt_server $nt_server_ip $nt_client_ip +ping $nt_server $nt_server_ip6 $nt_client_ip6 + +start_server $nt_client TCP4 $nt_client_ip $nt_port $nt_ofile +nt_listener_ppid=$! + +# Give the listener time to start. +sleep 1 + +zlogin $nt_server /usr/bin/socat -b 8192 STDIN \ + TCP4:$nt_client_ip:$nt_port,bind=$nt_server_ip,connect-timeout=5 \ + < $nt_dfile + +if (($? != 0)); then + pkill -TERM -P $nt_listener_ppid + fail "failed to run socat client" +else + dbg "sent 1M $nt_server ($nt_server_ip) -> $nt_client ($nt_client_ip)" +fi + +# +# The client may have exited but make sure to give the server time to +# exit and finish computing the SHA1. +# +dbg "waiting for listener $nt_listener_ppid" +wait_for_pid $nt_listener_ppid 5 +dbg "listener $nt_listener_ppid exited" + +digest -a sha1 /zones/$nt_client/root/$nt_ofile > $nt_rfile + +if ! diff $nt_efile $nt_rfile; then + fail "SHA1 comparison failed" +else + dbg "SHA1 comparison passed" +fi + +start_server $nt_client TCP6 $nt_client_ip6 $nt_port6 $nt_ofile +nt_listener_ppid=$! + +# Give the listener time to start. +sleep 1 + +zlogin $nt_server /usr/bin/socat -b 8192 STDIN \ + TCP6:[$nt_client_ip6]:$nt_port6,connect-timeout=5 < $nt_dfile + +if (($? != 0)); then + pkill -TERM -P $nt_listener_ppid + fail "failed to run socat client IPv6" +else + dbg "sent 1M $nt_server ($nt_server_ip6) -> $nt_client ($nt_client_ip6)" +fi + +# +# The client may have exited but make sure to give the server time to +# exit and finish computing the SHA1. 
+# +dbg "waiting for listener $nt_listener_ppid" +wait_for_pid $nt_listener_ppid 5 +dbg "server $nt_listener_ppid exited" + +digest -a sha1 /zones/$nt_client/root/$nt_ofile > $nt_rfile + +if ! diff $nt_efile $nt_rfile; then + fail "SHA1 comparison failed" +else + dbg "SHA1 comparison passed" +fi + +if ((nt_udp == 1)); then + ping_udp $nt_server $nt_server_ip $nt_client_ip 256 3 + ping_udp $nt_server $nt_server_ip6 $nt_client_ip6 256 3 + + # + # Test IP fragmentation by sending a larger-than-MTU datagram. + # You can verify fragmentation is happening by dtracing the + # various "Frag" and "Reasm" mibs. + # + dbg "test IP fragmentation $nt_server_ip -> $nt_client_ip" + ping_udp $nt_server $nt_server_ip $nt_client_ip $((1024 * 16)) 3 + + dbg "test IPv6 fragmentation $nt_server_ip6 -> $nt_client_ip6" + ping_udp $nt_server $nt_server_ip6 $nt_client_ip6 $((1024 * 16)) 3 +fi + +cleanup +echo "PASS [$nt_tname]" diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_001.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_001.ksh new file mode 100644 index 0000000000..9f6c98d1b3 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_001.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 001 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_002.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_002.ksh new file mode 100644 index 0000000000..06e5ec53ed --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_002.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 002 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_003.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_003.ksh new file mode 100644 index 0000000000..ce84bc0866 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_003.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 003 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_004.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_004.ksh new file mode 100644 index 0000000000..b5fa65ccd1 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_004.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 004 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_005.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_005.ksh new file mode 100644 index 0000000000..9bbd536e19 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_005.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 005 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_006.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_006.ksh new file mode 100644 index 0000000000..2267072a3d --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_006.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 006 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_007.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_007.ksh new file mode 100644 index 0000000000..a0380eb92e --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_007.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 007 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_008.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_008.ksh new file mode 100644 index 0000000000..aed5438f63 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_008.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 008 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_009.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_009.ksh new file mode 100644 index 0000000000..8a0fa9674c --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_009.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 009 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_010.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_010.ksh new file mode 100644 index 0000000000..3c45225597 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_010.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 010 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_011.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_011.ksh new file mode 100644 index 0000000000..62785ff33e --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_011.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 011 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_012.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_012.ksh new file mode 100644 index 0000000000..c09cd77258 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_012.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 012 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_013.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_013.ksh new file mode 100644 index 0000000000..e3cc833f53 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_013.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 013 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_014.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_014.ksh new file mode 100644 index 0000000000..6bd76de190 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_014.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 014 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_015.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_015.ksh new file mode 100644 index 0000000000..d3b1e2fe1d --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_015.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 015 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_016.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_016.ksh new file mode 100644 index 0000000000..aa5903cbe4 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_016.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 016 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_017.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_017.ksh new file mode 100644 index 0000000000..38615b9f94 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_017.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 017 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_018.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_018.ksh new file mode 100644 index 0000000000..e010141458 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_018.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 018 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_019.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_019.ksh new file mode 100644 index 0000000000..e3b16bad43 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_019.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 019 +exit $? 
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_020.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_020.ksh new file mode 100644 index 0000000000..9710bae3c1 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_020.ksh @@ -0,0 +1,22 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +$NET_TESTS/tests/forwarding/ip_fwd_suite -n 020 +exit $? diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_suite.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_suite.ksh new file mode 100644 index 0000000000..a1fdc444e3 --- /dev/null +++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_suite.ksh @@ -0,0 +1,115 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +# +# Run the IP forwarding test suite. 
+# +# Usage +# +# ip_fwd_suite [-n <name>] [-a <args>] +# +# To run all tests: +# +# NET_TESTS=/opt/net-tests ip_fwd_suite +# +# To run one test: +# +# NET_TESTS=/opt/net-tests ip_fwd_suite -n 001 +# +# To run one test with additional arguments passed to 'ip_forwarding': +# +# NET_TESTS=/opt/net-tests ip_fwd_suite -n 001 -a n +# + +if [[ -z $NET_TESTS ]]; then + echo "NET_TESTS not set" >&2 + exit 1 +fi + +. $NET_TESTS/tests/net_common +. $NET_TESTS/config/ip_forwarding.config + +if [[ -z "$NT_CLIENT" ]]; then + fail "NT_CLIENT must be set" +fi + +if [[ -z "$NT_ROUTER" ]]; then + fail "NT_ROUTER must be set" +fi + +if [[ -z "$NT_SERVER" ]]; then + fail "NT_SERVER must be set" +fi + +while getopts "a:n:" opt; do + case $opt in + a) + nt_args=$OPTARG + ;; + n) + nt_name=$OPTARG + ;; + esac +done + +shift $((OPTIND - 1)) + +nt_script=$NET_TESTS/tests/forwarding/ip_forwarding + +# +# See the "Test Matrix" section of the README for a description of +# each test. +# +typeset -A nt_name_args +nt_name_args["001"]="uv" +nt_name_args["002"]="puv" +nt_name_args["003"]="lpuv" +nt_name_args["004"]="fuv" +nt_name_args["005"]="fluv" +nt_name_args["006"]="ruv" +nt_name_args["007"]="pruv" +nt_name_args["008"]="lpruv" +nt_name_args["009"]="fruv" +nt_name_args["010"]="flruv" + +nt_name_args["011"]="buv" +nt_name_args["012"]="bpuv" +nt_name_args["013"]="blpuv" +nt_name_args["014"]="bfuv" +nt_name_args["015"]="bfluv" +nt_name_args["016"]="bruv" +nt_name_args["017"]="bpruv" +nt_name_args["018"]="blpruv" +nt_name_args["019"]="bfruv" +nt_name_args["020"]="bflruv" + +if [[ -n $nt_name ]]; then + if [[ -z ${nt_name_args[$nt_name]} ]]; then + fail "invalid test name: $nt_name" + fi + + export NT_TNAME="ip_fwd_$nt_name" + nt_args="-${nt_name_args[$nt_name]}${nt_args}" + $nt_script $nt_args $NT_CLIENT $NT_ROUTER $NT_SERVER + exit $? 
+fi + +for nt_name in ${!nt_name_args[@]}; do # per-test args; nt_args stays as given + export NT_TNAME="ip_fwd_$nt_name" + nt_test_args="-${nt_name_args[$nt_name]}${nt_args}" + $nt_script $nt_test_args $NT_CLIENT $NT_ROUTER $NT_SERVER || exit $? +done + +exit 0 diff --git a/usr/src/test/net-tests/tests/net_common.ksh b/usr/src/test/net-tests/tests/net_common.ksh new file mode 100644 index 0000000000..b83cda8c97 --- /dev/null +++ b/usr/src/test/net-tests/tests/net_common.ksh @@ -0,0 +1,650 @@ +#!/usr/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. + +# +# Copyright 2019 Joyent, Inc. +# + +# +# Functions shared across the network tests. +# + +DEBUG=0 + +function dbg +{ + typeset msg="$*" + if (($DEBUG == 1)); then + echo "DBG [$nt_tname]: $msg" + fi +} + +function fail +{ + typeset msg="$*" + echo "FAIL [$nt_tname]: $msg" >&2 + exit 1 +} + +function maybe_fail +{ + typeset msg=$1 + + if ((BAIL == 1)); then + fail "$msg" + else + dbg "$msg" + return 1 + fi +} + +function zone_exists +{ + typeset name=$1 + + if (($# != 1)); then + fail "$0: incorrect number of args provided" + fi + + dbg "checking for existence of zone: $name" + if zoneadm -z $name list > /dev/null 2>&1; then + dbg "found zone: $name" + return 0 + else + dbg "zone not found: $name" + return 1 + fi +} + +function zone_running +{ + typeset name=$1 + typeset state=$(zoneadm -z $name list -p | awk -F: '{ print $3 }') + typeset err="zone $name is not running" + + if (($# != 1)); then + fail "$0: incorrect number of args provided" + fi + + dbg "check if zone $name is running" + dbg "state of zone $name: $state" + if [[ "$state" == "running" ]]; then + dbg "zone $name is running" + return 0 + fi 
+ + maybe_fail "$err" +} + +function simnet_exists +{ + typeset name=$1 + + if (($# != 1)); then + fail "$0: incorrect number of args provided" + fi + + if dladm show-simnet $name > /dev/null 2>&1; then + dbg "simnet $name found" + return 0 + else + dbg "simnet $name not found" + return 1 + fi +} + +function create_simnet +{ + typeset name=$1 + typeset err="failed to create simnet $name" + + if (($# != 1)); then + fail "$0: incorrect number of args provided" + fi + + dbg "creating simnet $name" + if simnet_exists $name; then + dbg "simnet $name already exists" + maybe_fail "$err" + return 1 + fi + + if dladm create-simnet > /dev/null $name; then + dbg "created simnet $name" + return 0 + fi + + maybe_fail "$err" +} + +function delete_simnet +{ + typeset name=$1 + typeset err="failed to delete simnet $name" + + if (($# != 1)); then + fail "$0: incorrect number of args provided" + fi + + dbg "deleting simnet $name" + if ! simnet_exists $name; then + dbg "simnet $name doesn't exist" + return 1 + fi + + if dladm delete-simnet $name; then + dbg "simnet $name deleted" + return 0 + fi + + maybe_fail "$err" +} + +function link_simnets +{ + typeset sim1=$1 + typeset sim2=$2 + typeset err="failed to link simnet $sim1 to $sim2" + + if (($# != 2)); then + fail "$0: incorrect number of args provided" + fi + + dbg "linking simnet $sim1 to $sim2" + if dladm modify-simnet -p $sim2 $sim1 > /dev/null; then + dbg "linked simnet $sim1 to $sim2" + return 0 + fi + + maybe_fail "$err" +} + +function vnic_exists +{ + typeset name=$1 + typeset vid=$2 + typeset over=$3 + typeset zone=$4 + + if (($# != 4)); then + fail "$0: incorrect number of args provided" + fi + + if dladm show-vnic $name > /dev/null 2>&1; then + typeset avid=$(dladm show-vnic -p -o vid $name) + typeset aover=$(dladm show-vnic -p -o over $name) + typeset azone=$(dladm show-linkprop -cp zone -o value $name) + if (($avid == $vid)) && [ $aover == $over ] && \ + [ $azone == $zone ] + then + return 0 + else + return 1 + fi + 
else + return 1 + fi +} + +function create_vnic +{ + typeset name=$1 + typeset over=$2 + typeset vid=$3 + typeset zone=$4 + typeset r=1 + typeset vid_opt="" + typeset vnic_info="$name, vid: $vid, over: $over, zone: $zone" + typeset err="failed to create VNIC: $vnic_info" + + if (($# != 4)); then + fail "$0: incorrect number of args provided" + fi + + if ((vid != 0)); then + vid_opt="-v $vid" + fi + + dbg "creating VNIC: $vnic_info" + if ! dladm create-vnic -t -l $over $vid_opt $name > /dev/null 2>&1 + then + maybe_fail "$err" + return 1 + fi + + dbg "created VNIC: $vnic_info" + if ! zonecfg -z $zone "add net; set physical=$name; end"; then + maybe_fail "failed to assign $name to $zone" + return 1 + fi + + dbg "assigned VNIC $name to $zone" + if zoneadm -z $zone reboot; then + dbg "rebooted $zone" + # + # Make sure the vnic is visible before returning. Without this + # a create_addr command following immediately afterwards could + # fail because the zone is up but the vnic isn't visible yet. + # + sleep 1 + return 0 + fi + + maybe_fail "failed to reboot $zone" +} + +function delete_vnic +{ + typeset name=$1 + typeset vid=$2 + typeset zone=$3 + typeset vnic_info="$name, vid: $vid, zone: $zone" + typeset err1="failed to assign VNIC $name from $zone to GZ" + typeset err2="failed to delete VNIC: $vnic_info" + + if (($# != 3)); then + fail "$0: incorrect number of args provided" + fi + + dbg "assigning VNIC $name from $zone to GZ" + + if ! zonecfg -z $zone "remove net physical=$name"; then + maybe_fail "failed to remove $name from $zone" + return 1 + fi + if ! 
zoneadm -z $zone reboot; then + maybe_fail "failed to reboot $zone" + return 1 + fi + + dbg "deleting VNIC: $vnic_info" + if dladm delete-vnic $name > /dev/null; then + dbg "deleted VNIC: $vnic_info" + return 0 + fi + + maybe_fail "$err2" +} + +function create_addr +{ + typeset zone=$1 + typeset vnic=$2 + typeset ip=$3 + typeset ipname=${vnic}/v4 + + if (($# != 3)); then + fail "$0: incorrect number of args provided" + fi + + if zlogin $zone ipadm create-addr -t -T static -a $ip \ + $ipname > /dev/null + then + dbg "created addr $ipname ($ip) in zone $zone" + return 0 + fi + + maybe_fail "failed to create addr $ipname ($ip) in zone $zone" +} + +function create_addr6 +{ + typeset zone=$1 + typeset vnic=$2 + typeset ip=$3 + typeset ll_name=${vnic}/v6 + typeset uni_name=${vnic}/v6add + typeset err1="failed to create link-local addr $ll_name in zone $zone" + typeset err2="failed to create unicast addr $uni_name in zone $zone" + + if (($# != 3)); then + fail "$0: incorrect number of args provided" + fi + + if zlogin $zone ipadm create-addr -t -T addrconf $ll_name; then + dbg "created link-local addr $ll_name in zone $zone" + else + maybe_fail "$err1" + return 1 + fi + + if zlogin $zone ipadm create-addr -t -T static -a $ip/64 $uni_name; then + dbg "created unicast addr $uni_name in zone $zone" + else + maybe_fail "$err2" + fi +} + +function delete_addr +{ + typeset zone=$1 + typeset ifname=$2 + typeset version=$3 + typeset ipname=$ifname/$version + + if (($# != 3)); then + fail "$0: incorrect number of args provided" + fi + + if zlogin $zone ipadm show-addr $ipname > /dev/null 2>&1; then + if zlogin $zone ipadm delete-addr $ipname > /dev/null; then + dbg "deleted addr $ipname in zone $zone" + else + maybe_fail "failed to delete addr $ipname in zone $zone" + return 1 + fi + else + dbg "addr $ipname doesn't exist in zone $zone" + fi + + if [[ "v6" == "$version" ]]; then + typeset ipname=$ifname/v6add + typeset err="failed to delete addr $ipname in zone $zone" + + if 
zlogin $zone ipadm show-addr $ipname > /dev/null 2>&1; then + if zlogin $zone ipadm delete-addr $ipname > /dev/null + then + dbg "deleted addr $ipname in zone $zone" + else + maybe_fail "$err" + fi + else + dbg "addr $ipname doesn't exist in zone $zone" + fi + fi +} + +function delete_if +{ + typeset zone=$1 + typeset ifname=$2 + typeset err="failed to delete interface $ifname in zone $zone" + + if (($# != 2)); then + fail "$0: incorrect number of args provided" + fi + + if zlogin $zone ipadm show-if $ifname > /dev/null 2>&1; then + if zlogin $zone ipadm delete-if $ifname > /dev/null; then + dbg "deleted interface $ifname in zone $zone" + else + maybe_fail "$err" + fi + else + dbg "interface $ifname doesn't exist in zone $zone" + fi +} + +function ip_fwd_enable +{ + typeset zone=$1 + + if (($# != 1)); then + fail "$0: incorrect number of args provided" + fi + + if zlogin $zone routeadm -p ipv4-forwarding | \ + egrep 'current=enabled' > /dev/null + then + dbg "IPv4 forwarding already enabled for $zone" + else + if zlogin $zone routeadm -ue ipv4-forwarding; then + dbg "enabled IPv4 forwarding for $zone" + else + maybe_fail "failed to enable IPv4 forwarding for $zone" + return 1 + fi + fi + + if zlogin $zone routeadm -p ipv6-forwarding | \ + egrep 'current=enabled' > /dev/null + then + dbg "IPv6 forwarding already enabled for $zone" + else + if zlogin $zone routeadm -ue ipv6-forwarding; then + dbg "enabled IPv6 forwarding for $zone" + else + maybe_fail "failed to enable IPv6 forwarding for $zone" + fi + fi +} + +function ip_fwd_disable +{ + typeset zone=$1 + + if (($# != 1)); then + fail "$0: incorrect number of args provided" + fi + + if zlogin $zone routeadm -p ipv4-forwarding | \ + egrep 'current=disabled' > /dev/null + then + dbg "IPv4 forwarding already disabled for $zone" + else + if zlogin $zone routeadm -ud ipv4-forwarding; then + dbg "disabled IPv4 forwarding in $zone" + else + maybe_fail "failed to disable IPv4 forwarding in $zone" + return 1 + fi + fi + + 
if zlogin $zone routeadm -p ipv6-forwarding | \ + egrep 'current=disabled' > /dev/null + then + dbg "IPv6 forwarding already disabled for $zone" + else + if zlogin $zone routeadm -ud ipv6-forwarding; then + dbg "disabled IPv6 forwarding in $zone" + else + maybe_fail "failed to disable IPv6 forwarding in $zone" + fi + fi +} + +function add_route +{ + typeset zone=$1 + typeset dest=$2 + typeset net=$3 + typeset gateway=$4 + + if (($# != 4)); then + fail "$0: incorrect number of args provided" + fi + + if zlogin $zone route -n add $net $gateway > /dev/null; then + dbg "added route $gateway => $net to $zone" + return 0 + fi + + maybe_fail "failed to add route $gateway => $net to $zone" +} + +function add_route6 +{ + typeset zone=$1 + typeset dest=$2 + typeset net=$3 + typeset gateway=$4 + + if (($# != 4)); then + fail "$0: incorrect number of args provided" + fi + + if zlogin $zone route -n add -inet6 $net $gateway > /dev/null + then + dbg "added route $gateway => $net to $zone" + return 0 + fi + + maybe_fail "failed to add route $gateway => $net to $zone" +} + +function rm_route +{ + typeset zone=$1 + typeset dest=$2 + typeset net=$3 + typeset gateway=$4 + typeset gw=$(zlogin $zone route -n get $dest | \ + grep gateway | awk '{ print $2 }') + typeset err="failed to remove route $gateway => $net from $zone" + + if (($# != 4)); then + fail "$0: incorrect number of args provided" + fi + + if [[ "$gw" == "$gateway" ]]; then + if zlogin $zone route -n delete $net $gateway > /dev/null + then + dbg "removed route $gateway => $net from $zone" + else + maybe_fail "$err" + fi + else + dbg "$zone already lacked route $gateway => $net" + fi +} + +function rm_route6 +{ + typeset zone=$1 + typeset dest=$2 + typeset net=$3 + typeset gateway=$4 + typeset gw=$(zlogin $zone route -n get -inet6 $dest | \ + grep gateway | awk '{ print $2 }') + typeset err="failed to remove route $gateway => $net from $zone" + + if (($# != 4)); then + fail "$0: incorrect number of args provided" + fi + + 
if [[ "$gw" == "$gateway" ]]; then + if zlogin $zone route -n delete -inet6 $net $gateway > /dev/null + then + dbg "removed route $gateway => $net from $zone" + else + maybe_fail "$err" + fi + else + dbg "$zone already lacked route $gateway => $net" + fi +} + +function set_linkprop +{ + typeset link=$1 + typeset prop=$2 + typeset val=$3 + typeset err="failed to set $link prop: $prop=$val" + + if (($# != 3)); then + fail "$0: incorrect number of args provided" + fi + + dbg "attempt to set $link prop: $prop=$val" + if dladm set-linkprop -p $prop=$val $link; then + dbg "set $link prop: $prop=$val" + return 0 + fi + + maybe_fail "$err" +} + +function ping +{ + typeset zone=$1 + typeset src=$2 + typeset dst=$3 + typeset info="$src -> $dst" + + if (($# != 3)); then + fail "$0: incorrect number of args provided" + fi + + dbg "ping: $info" + if zlogin $zone ping $dst > /dev/null 2>&1; then + dbg "successful ping: $info" + return 0 + fi + + maybe_fail "could not ping: $info" +} + +function ping_udp +{ + typeset client=$1 + typeset client_ip=$2 + typeset server_ip=$3 + typeset size=$4 + typeset num=$5 + typeset info="$client_ip -> $server_ip (size: $size)" + + if (($# != 5)); then + fail "$0: incorrect number of args provided" + fi + + dbg "UDP ping: $info" + if zlogin $client ping -ns -U $server_ip $size $num > /dev/null; then + dbg "UDP ping passed: $info" + return 0 + fi + + maybe_fail "UDP ping failed: $info" +} + +function start_server +{ + typeset zone=$1 + typeset type=$2 + typeset ip=$3 + typeset port=$4 + typeset ofile=$5 + + if (($# != 5)); then + fail "$0: incorrect number of args provided" + fi + + dbg "start server $rfile" + zlogin $zone \ + /usr/bin/socat -u ${type}-LISTEN:$port,bind=[$ip],reuseaddr \ + CREATE:$ofile & + listener_ppid=$! 
+ dbg "listener PPID: $listener_ppid, zone $zone" +} + +function wait_for_pid +{ + typeset pid=$1 + typeset seconds=$2 + typeset s=0 + + if (($# != 2)); then + fail "$0: incorrect number of args provided" + fi + + while true; do + if kill -0 $pid > /dev/null 2>&1; then + if ((seconds == s)); then + maybe_fail "timed out waiting for pid $pid" + return 1 + fi + dbg "waiting for pid $pid" + sleep 1 + ((s++)) + else + return 0 + fi + done +} diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 659eda42f3..26e7be2fe8 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -22,6 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1990 Mentat Inc. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -2730,108 +2731,15 @@ done: } /* - * Try to determine where and what are the IPv6 header length and - * pointer to nexthdr value for the upper layer protocol (or an - * unknown next hdr). - * - * Parameters returns a pointer to the nexthdr value; - * Must handle malformed packets of various sorts. - * Function returns failure for malformed cases. - */ -boolean_t -ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr, - uint8_t **nexthdrpp) -{ - uint16_t length; - uint_t ehdrlen; - uint8_t *nexthdrp; - uint8_t *whereptr; - uint8_t *endptr; - ip6_dest_t *desthdr; - ip6_rthdr_t *rthdr; - ip6_frag_t *fraghdr; - - ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); - length = IPV6_HDR_LEN; - whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ - endptr = mp->b_wptr; - - nexthdrp = &ip6h->ip6_nxt; - while (whereptr < endptr) { - /* Is there enough left for len + nexthdr? 
*/ - if (whereptr + MIN_EHDR_LEN > endptr) - break; - - switch (*nexthdrp) { - case IPPROTO_HOPOPTS: - case IPPROTO_DSTOPTS: - /* Assumes the headers are identical for hbh and dst */ - desthdr = (ip6_dest_t *)whereptr; - ehdrlen = 8 * (desthdr->ip6d_len + 1); - if ((uchar_t *)desthdr + ehdrlen > endptr) - return (B_FALSE); - nexthdrp = &desthdr->ip6d_nxt; - break; - case IPPROTO_ROUTING: - rthdr = (ip6_rthdr_t *)whereptr; - ehdrlen = 8 * (rthdr->ip6r_len + 1); - if ((uchar_t *)rthdr + ehdrlen > endptr) - return (B_FALSE); - nexthdrp = &rthdr->ip6r_nxt; - break; - case IPPROTO_FRAGMENT: - fraghdr = (ip6_frag_t *)whereptr; - ehdrlen = sizeof (ip6_frag_t); - if ((uchar_t *)&fraghdr[1] > endptr) - return (B_FALSE); - nexthdrp = &fraghdr->ip6f_nxt; - break; - case IPPROTO_NONE: - /* No next header means we're finished */ - default: - *hdr_length_ptr = length; - *nexthdrpp = nexthdrp; - return (B_TRUE); - } - length += ehdrlen; - whereptr += ehdrlen; - *hdr_length_ptr = length; - *nexthdrpp = nexthdrp; - } - switch (*nexthdrp) { - case IPPROTO_HOPOPTS: - case IPPROTO_DSTOPTS: - case IPPROTO_ROUTING: - case IPPROTO_FRAGMENT: - /* - * If any know extension headers are still to be processed, - * the packet's malformed (or at least all the IP header(s) are - * not in the same mblk - and that should never happen. - */ - return (B_FALSE); - - default: - /* - * If we get here, we know that all of the IP headers were in - * the same mblk, even if the ULP header is in the next mblk. - */ - *hdr_length_ptr = length; - *nexthdrpp = nexthdrp; - return (B_TRUE); - } -} - -/* * Return the length of the IPv6 related headers (including extension headers) * Returns a length even if the packet is malformed. 
*/ -int +uint16_t ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h) { uint16_t hdr_len; - uint8_t *nexthdrp; - (void) ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, &nexthdrp); + (void) ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, NULL); return (hdr_len); } diff --git a/usr/src/uts/common/inet/ip/ip6_input.c b/usr/src/uts/common/inet/ip/ip6_input.c index cdff35273e..066b5c3f56 100644 --- a/usr/src/uts/common/inet/ip/ip6_input.c +++ b/usr/src/uts/common/inet/ip/ip6_input.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -1903,13 +1903,12 @@ ip_input_cksum_v6(iaflags_t iraflags, mblk_t *mp, ip6_t *ip6h, return (ip_input_sw_cksum_v6(mp, ip6h, ira)); } + hck_flags = DB_CKSUMFLAGS(mp); + /* * We apply this for all ULP protocols. Does the HW know to * not set the flags for SCTP and other protocols. */ - - hck_flags = DB_CKSUMFLAGS(mp); - if (hck_flags & HCK_FULLCKSUM_OK) { /* * Hardware has already verified the checksum. diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c index aea49c19d3..cd6c50c446 100644 --- a/usr/src/uts/common/inet/ip/ip_input.c +++ b/usr/src/uts/common/inet/ip/ip_input.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -57,6 +57,7 @@ #include <sys/vtrace.h> #include <sys/isa_defs.h> #include <sys/mac.h> +#include <sys/mac_client.h> #include <net/if.h> #include <net/if_arp.h> #include <net/route.h> @@ -659,11 +660,12 @@ ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, } /* - * If there is a good HW IP header checksum we clear the need + * If the packet originated from a same-machine sender or + * there is a good HW IP header checksum, we clear the need * look at the IP header checksum. */ - if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && - ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { + if (((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && + ILL_HCKSUM_CAPABLE(ill) && dohwcksum)) { /* Header checksum was ok. Clear the flag */ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; @@ -1134,8 +1136,12 @@ ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha, icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira); return; } + + /* + * Count the forward as a hop and update the checksum + * accordingly. + */ ipha->ipha_ttl--; - /* Adjust the checksum to reflect the ttl decrement. */ sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); @@ -2240,6 +2246,7 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha, /* No ULP checksum to verify. */ return (B_TRUE); } + /* * Revert to software checksum calculation if the interface * isn't capable of checksum offload. @@ -2252,13 +2259,12 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha, return (ip_input_sw_cksum_v4(mp, ipha, ira)); } + hck_flags = DB_CKSUMFLAGS(mp); + /* * We apply this for all ULP protocols. Does the HW know to * not set the flags for SCTP and other protocols. */ - - hck_flags = DB_CKSUMFLAGS(mp); - if (hck_flags & HCK_FULLCKSUM_OK) { /* * Hardware has already verified the checksum. 
diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h index 4f5b81c12f..01c25b52b5 100644 --- a/usr/src/uts/common/inet/ip6.h +++ b/usr/src/uts/common/inet/ip6.h @@ -23,6 +23,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _INET_IP6_H @@ -255,7 +256,7 @@ extern in6_addr_t ip_get_dst_v6(ip6_t *, const mblk_t *, boolean_t *); extern ip6_rthdr_t *ip_find_rthdr_v6(ip6_t *, uint8_t *); extern boolean_t ip_hdr_length_nexthdr_v6(mblk_t *, ip6_t *, uint16_t *, uint8_t **); -extern int ip_hdr_length_v6(mblk_t *, ip6_t *); +extern uint16_t ip_hdr_length_v6(mblk_t *, ip6_t *); extern uint32_t ip_massage_options_v6(ip6_t *, ip6_rthdr_t *, netstack_t *); extern void ip_forward_xmit_v6(nce_t *, mblk_t *, ip6_t *, ip_recv_attr_t *, uint32_t, uint32_t); diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index 2b37528eb9..87086b4c17 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ #ifndef _INET_IP_IMPL_H @@ -159,9 +160,24 @@ extern "C" { #define ILL_DIRECT_CAPABLE(ill) \ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) -/* This macro is used by the mac layer */ +/* + * Determine if a mblk needs to take the "slow path", aka OTH + * softring. There are multiple reasons why a mblk might take the slow + * path. + * + * o The mblk is not a data message. + * + * o There is more than one outstanding reference to the mblk. + * + * o The IP header is not aligned (we assume alignment in the checksum + * routine). + * + * o The mblk doesn't contain enough data to populate a simple IP header. 
+ */ #define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \ - (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \ + (DB_TYPE(mp) != M_DATA || \ + (DB_REF(mp) != 1) || \ + !OK_32PTR(ipha) || \ (((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr)) /* diff --git a/usr/src/uts/common/io/bridge.c b/usr/src/uts/common/io/bridge.c index bc54527515..389948e295 100644 --- a/usr/src/uts/common/io/bridge.c +++ b/usr/src/uts/common/io/bridge.c @@ -23,6 +23,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* @@ -41,6 +42,7 @@ #include <sys/modctl.h> #include <sys/note.h> #include <sys/param.h> +#include <sys/pattr.h> #include <sys/policy.h> #include <sys/sdt.h> #include <sys/stat.h> @@ -1705,7 +1707,12 @@ reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid) if (mp == NULL) return (mp); - /* No forwarded packet can have hardware checksum enabled */ + /* + * A forwarded packet cannot have hardware offloads enabled + * because we don't know if the destination can handle them. + * By this point, any hardware offloads present should have + * been emulated. + */ DB_CKSUMFLAGS(mp) = 0; /* Get the no-modification cases out of the way first */ @@ -1907,17 +1914,22 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, blp->bl_trillthreads++; mutex_exit(&blp->bl_trilllock); update_header(mp, hdr_info, B_FALSE); - if (is_xmit) - mp = mac_fix_cksum(mp); - /* all trill data frames have Inner.VLAN */ + + /* + * All trill data frames have + * Inner.VLAN. 
+ */ mp = reform_vlan_header(mp, vlanid, tci, 0); + if (mp == NULL) { KIINCR(bki_drops); - fwd_unref(bfp); - return (NULL); + goto done; } + trill_encap_fn(tdp, blp, hdr_info, mp, bfp->bf_trill_nick); + +done: mutex_enter(&blp->bl_trilllock); if (--blp->bl_trillthreads == 0 && blp->bl_trilldata == NULL) @@ -1959,17 +1971,16 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, mpsend = copymsg(mp); } - if (!from_trill && is_xmit) - mpsend = mac_fix_cksum(mpsend); - mpsend = reform_vlan_header(mpsend, vlanid, tci, blpsend->bl_pvid); + if (mpsend == NULL) { KIINCR(bki_drops); continue; } KIINCR(bki_forwards); + /* * No need to bump up the link reference count, as * the forwarding entry itself holds a reference to @@ -1979,11 +1990,12 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, mac_rx_common(blpsend->bl_mh, NULL, mpsend); } else { KLPINCR(blpsend, bkl_xmit); - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, + mpsend = mac_ring_tx(blpsend->bl_mh, NULL, mpsend); freemsg(mpsend); } } + /* * Handle a special case: if we're transmitting to the original * link, then check whether the localaddr flag is set. 
If it @@ -2070,11 +2082,9 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, mpsend = copymsg(mp); } - if (!from_trill && is_xmit) - mpsend = mac_fix_cksum(mpsend); - mpsend = reform_vlan_header(mpsend, vlanid, tci, blpsend->bl_pvid); + if (mpsend == NULL) { KIINCR(bki_drops); continue; @@ -2084,10 +2094,13 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, KIINCR(bki_unknown); else KIINCR(bki_mbcast); + KLPINCR(blpsend, bkl_xmit); - if ((mpcopy = copymsg(mpsend)) != NULL) + if ((mpcopy = copymsg(mpsend)) != NULL) { mac_rx_common(blpsend->bl_mh, NULL, mpcopy); - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend); + } + + mpsend = mac_ring_tx(blpsend->bl_mh, NULL, mpsend); freemsg(mpsend); link_unref(blpsend); } @@ -2465,7 +2478,7 @@ bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext) (blp->bl_flags & BLF_SDUFAIL)))) { KIINCR(bki_sent); KLINCR(bkl_xmit); - MAC_RING_TX(blp->bl_mh, rh, mpnext, mp); + mp = mac_ring_tx(blp->bl_mh, rh, mpnext); return (mp); } @@ -2523,7 +2536,7 @@ bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext) B_FALSE, B_TRUE); } if (mp != NULL) { - MAC_RING_TX(blp->bl_mh, rh, mp, mp); + mp = mac_ring_tx(blp->bl_mh, rh, mp); if (mp == NULL) { KIINCR(bki_sent); KLINCR(bkl_xmit); @@ -2589,7 +2602,7 @@ bridge_trill_decaps(bridge_link_t *blp, mblk_t *mp, uint16_t ingress_nick) /* Deliver a copy locally as well */ if ((mpcopy = copymsg(mp)) != NULL) mac_rx_common(blp->bl_mh, NULL, mpcopy); - MAC_RING_TX(blp->bl_mh, NULL, mp, mp); + mp = mac_ring_tx(blp->bl_mh, NULL, mp); } if (mp == NULL) { KIINCR(bki_sent); @@ -2610,7 +2623,7 @@ bridge_trill_output(bridge_link_t *blp, mblk_t *mp) bridge_inst_t *bip = blp->bl_inst; /* used by macros */ mac_trill_snoop(blp->bl_mh, mp); - MAC_RING_TX(blp->bl_mh, NULL, mp, mp); + mp = mac_ring_tx(blp->bl_mh, NULL, mp); if (mp == NULL) { KIINCR(bki_sent); KLINCR(bkl_xmit); diff --git a/usr/src/uts/common/io/dls/dls_link.c 
b/usr/src/uts/common/io/dls/dls_link.c index 6f9049b724..4099d0b801 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* @@ -566,7 +566,13 @@ dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp, dls_head_t *dhp; mod_hash_key_t key; + /* + * We expect to deal with only a single packet. + */ + ASSERT3P(mp->b_next, ==, NULL); + DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err); + if (err != 0) goto drop; diff --git a/usr/src/uts/common/io/fcoe/fcoe_fc.c b/usr/src/uts/common/io/fcoe/fcoe_fc.c index 42764e48d6..54402b027f 100644 --- a/usr/src/uts/common/io/fcoe/fcoe_fc.c +++ b/usr/src/uts/common/io/fcoe/fcoe_fc.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ /* @@ -39,6 +40,7 @@ #include <sys/fcntl.h> #include <sys/unistd.h> #include <sys/mac_client.h> +#include <sys/strsubr.h> /* * FCoE header files @@ -209,6 +211,7 @@ tx_frame: ret_cookie = mac_tx(mac->fm_cli_handle, FRM2MBLK(frm), 0, MAC_TX_NO_ENQUEUE, &ret_mblk); if (ret_cookie != (mac_tx_cookie_t)NULL) { + frm->frm_netb = ret_mblk; mutex_enter(&mac->fm_mutex); (void) cv_reltimedwait(&mac->fm_tx_cv, &mac->fm_mutex, drv_usectohz(100000), TR_CLOCK_TICK); @@ -265,7 +268,7 @@ fcoe_alloc_netb(fcoe_port_t *eport, uint32_t fc_frame_size, uint8_t **ppfc) static void fcoe_free_netb(void *netb) { - freeb((mblk_t *)netb); + freemsgchain((mblk_t *)netb); } fcoe_frame_t * diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 76b4765de6..0a52043a15 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -1753,7 +1753,7 @@ mac_client_clear_flow_cb(mac_client_handle_t mch) flow_entry_t *flent = mcip->mci_flent; mutex_enter(&flent->fe_lock); 
- flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_cb_arg1 = NULL; flent->fe_cb_arg2 = NULL; flent->fe_flags |= FE_MC_NO_DATAPATH; @@ -1936,8 +1936,7 @@ mac_hwring_send_priv(mac_client_handle_t mch, mac_ring_handle_t rh, mblk_t *mp) mac_client_impl_t *mcip = (mac_client_impl_t *)mch; mac_impl_t *mip = mcip->mci_mip; - MAC_TX(mip, rh, mp, mcip); - return (mp); + return (mac_provider_tx(mip, rh, mp, mcip)); } /* @@ -4712,9 +4711,9 @@ mac_group_remmac(mac_group_t *group, const uint8_t *addr) } /* - * This is the entry point for packets transmitted through the bridging code. - * If no bridge is in place, MAC_RING_TX transmits using tx ring. The 'rh' - * pointer may be NULL to select the default ring. + * This is the entry point for packets transmitted through the bridge + * code. If no bridge is in place, mac_ring_tx() transmits via the tx + * ring. The 'rh' pointer may be NULL to select the default ring. */ mblk_t * mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp) @@ -4731,8 +4730,34 @@ mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp) mac_bridge_ref_cb(mh, B_TRUE); mutex_exit(&mip->mi_bridge_lock); if (mh == NULL) { - MAC_RING_TX(mip, rh, mp, mp); + mp = mac_ring_tx((mac_handle_t)mip, rh, mp); } else { + /* + * The bridge may place this mblk on a provider's Tx + * path, a mac's Rx path, or both. Since we don't have + * enough information at this point, we can't be sure + * that the destination(s) are capable of handling the + * hardware offloads requested by the mblk. We emulate + * them here as it is the safest choice. In the + * future, if bridge performance becomes a priority, + * we can elide the emulation here and leave the + * choice up to bridge. + * + * We don't clear the DB_CKSUMFLAGS here because + * HCK_IPV4_HDRCKSUM (Tx) and HCK_IPV4_HDRCKSUM_OK + * (Rx) still have the same value. 
If the bridge + * receives a packet from a HCKSUM_IPHDRCKSUM NIC then + * the mac(s) it is forwarded on may calculate the + * checksum again, but incorrectly (because the + * checksum field is not zero). Until the + * HCK_IPV4_HDRCKSUM/HCK_IPV4_HDRCKSUM_OK issue is + * resolved, we leave the flag clearing in bridge + * itself. + */ + if ((DB_CKSUMFLAGS(mp) & (HCK_TX_FLAGS | HW_LSO_FLAGS)) != 0) { + mac_hw_emul(&mp, NULL, NULL, MAC_ALL_EMULS); + } + mp = mac_bridge_tx_cb(mh, rh, mp); mac_bridge_ref_cb(mh, B_FALSE); } @@ -8804,3 +8829,52 @@ mac_led_set(mac_handle_t mh, mac_led_mode_t desired) return (ret); } + +/* + * Send packets through the Tx ring ('mrh') or through the default + * handler if no ring is specified. Before passing the packet down to + * the MAC provider, emulate any hardware offloads which have been + * requested but are not supported by the provider. + */ +mblk_t * +mac_ring_tx(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (mrh == NULL) + mrh = mip->mi_default_tx_ring; + + if (mrh == NULL) + return (mip->mi_tx(mip->mi_driver, mp)); + else + return (mac_hwring_tx(mrh, mp)); +} + +/* + * This is the final stop before reaching the underlying MAC provider. + * This is also where the bridging hook is inserted. Packets that are + * bridged will return through mac_bridge_tx(), with rh nulled out if + * the bridge chooses to send output on a different link due to + * forwarding. + */ +mblk_t * +mac_provider_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp, + mac_client_impl_t *mcip) +{ + /* + * If there is a bound Hybrid I/O share, send packets through + * the default tx ring. When there's a bound Hybrid I/O share, + * the tx rings of this client are mapped in the guest domain + * and not accessible from here. 
+ */ + if (mcip->mci_state_flags & MCIS_SHARE_BOUND) + rh = mip->mi_default_tx_ring; + + if (mip->mi_promisc_list != NULL) + mac_promisc_dispatch(mip, mp, mcip, B_FALSE); + + if (mip->mi_bridge_link == NULL) + return (mac_ring_tx((mac_handle_t)mip, rh, mp)); + else + return (mac_bridge_tx(mip, rh, mp)); +} diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c index 1ff33c3578..5302b89196 100644 --- a/usr/src/uts/common/io/mac/mac_bcast.c +++ b/usr/src/uts/common/io/mac/mac_bcast.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -146,7 +147,7 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) uint64_t gen; uint_t i; mblk_t *mp_chain1; - flow_entry_t *flent; + flow_entry_t *flent; int err; rw_enter(&mip->mi_rw_lock, RW_READER); @@ -182,13 +183,6 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) */ if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL) break; - /* - * Fix the checksum for packets originating - * from the local machine. 
- */ - if ((src_mcip != NULL) && - (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL) - break; FLOW_TRY_REFHOLD(flent, err); if (err != 0) { @@ -246,7 +240,8 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) MCIP_STAT_UPDATE(src_mcip, brdcstxmt, 1); MCIP_STAT_UPDATE(src_mcip, brdcstxmtbytes, msgdsize(mp_chain)); - MAC_TX(mip, mip->mi_default_tx_ring, mp_chain, src_mcip); + mp_chain = mac_provider_tx(mip, mip->mi_default_tx_ring, + mp_chain, src_mcip); if (mp_chain != NULL) freemsgchain(mp_chain); } else { diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index 7ff05f2ab6..605cb51bf7 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -115,6 +115,7 @@ #include <sys/stream.h> #include <sys/strsun.h> #include <sys/strsubr.h> +#include <sys/pattr.h> #include <sys/dlpi.h> #include <sys/modhash.h> #include <sys/mac_impl.h> @@ -1357,7 +1358,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_mip = mip; mcip->mci_upper_mip = NULL; - mcip->mci_rx_fn = mac_pkt_drop; + mcip->mci_rx_fn = mac_rx_def; mcip->mci_rx_arg = NULL; mcip->mci_rx_p_fn = NULL; mcip->mci_rx_p_arg = NULL; @@ -1629,7 +1630,7 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) void mac_rx_clear(mac_client_handle_t mch) { - mac_rx_set(mch, mac_pkt_drop, NULL); + mac_rx_set(mch, mac_rx_def, NULL); } void @@ -1641,7 +1642,7 @@ mac_rx_barrier(mac_client_handle_t mch) i_mac_perim_enter(mip); /* If a RX callback is set, quiesce and restart that datapath */ - if (mcip->mci_rx_fn != mac_pkt_drop) { + if (mcip->mci_rx_fn != mac_rx_def) { mac_rx_client_quiesce(mch); mac_rx_client_restart(mch); } @@ -2998,7 +2999,7 @@ mac_client_datapath_teardown(mac_client_handle_t mch, mac_unicast_impl_t *muip, mac_misc_stat_delete(flent); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = 
(flow_fn_t)mac_rx_def; flent->fe_cb_arg1 = NULL; flent->fe_cb_arg2 = NULL; @@ -3578,7 +3579,9 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, srs_tx = &srs->srs_tx; if (srs_tx->st_mode == SRS_TX_DEFAULT && (srs->srs_state & SRS_ENQUEUED) == 0 && - mip->mi_nactiveclients == 1 && mp_chain->b_next == NULL) { + mip->mi_nactiveclients == 1 && + mp_chain->b_next == NULL && + (DB_CKSUMFLAGS(mp_chain) & HW_LSO) == 0) { uint64_t obytes; /* @@ -3613,7 +3616,9 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) : msgdsize(mp_chain)); - MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip); + mp_chain = mac_provider_tx(mip, srs_tx->st_arg2, mp_chain, + mcip); + if (mp_chain == NULL) { cookie = 0; SRS_TX_STAT_UPDATE(srs, opackets, 1); @@ -3625,7 +3630,74 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, mutex_exit(&srs->srs_lock); } } else { - cookie = srs_tx->st_func(srs, mp_chain, hint, flag, ret_mp); + mblk_t *mp = mp_chain; + mblk_t *new_head = NULL; + mblk_t *new_tail = NULL; + + /* + * There are occasions where the packets arriving here + * may request hardware offloads that are not + * available from the underlying MAC provider. This + * currently only happens when a packet is sent across + * the MAC-loopback path of one MAC and then forwarded + * (via IP) to another MAC that lacks one or more of + * the hardware offloads provided by the first one. + * However, in the future, we may choose to pretend + * all MAC providers support all offloads, performing + * emulation on Tx as needed. + * + * We iterate each mblk in-turn, emulating hardware + * offloads as required. From this process, we create + * a new chain. The new chain may be the same as the + * original chain (no hardware emulation needed), a + * collection of new mblks (hardware emulation + * needed), or a mix. 
At this point, the chain is safe + * for consumption by the underlying MAC provider and + * is passed down to the SRS. + */ + while (mp != NULL) { + mblk_t *next = mp->b_next; + mblk_t *tail = NULL; + const uint16_t needed = + (DB_CKSUMFLAGS(mp) ^ mip->mi_tx_cksum_flags) & + DB_CKSUMFLAGS(mp); + + mp->b_next = NULL; + + if ((needed & (HCK_TX_FLAGS | HW_LSO_FLAGS)) != 0) { + mac_emul_t emul = 0; + + if (needed & HCK_IPV4_HDRCKSUM) + emul |= MAC_IPCKSUM_EMUL; + if (needed & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) + emul |= MAC_HWCKSUM_EMUL; + if (needed & HW_LSO) + emul = MAC_LSO_EMUL; + + mac_hw_emul(&mp, &tail, NULL, emul); + + if (mp == NULL) { + mp = next; + continue; + } + } + + if (new_head == NULL) { + new_head = mp; + } else { + new_tail->b_next = mp; + } + + new_tail = (tail == NULL) ? mp : tail; + mp = next; + } + + if (new_head == NULL) { + cookie = 0; + goto done; + } + + cookie = srs_tx->st_func(srs, new_head, hint, flag, ret_mp); } done: @@ -4026,14 +4098,15 @@ mac_client_get_effective_resources(mac_client_handle_t mch, * The unicast packets of MAC_CLIENT_PROMISC_FILTER callbacks are dispatched * after classification by mac_rx_deliver(). */ - static void mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, - boolean_t loopback) + boolean_t loopback, boolean_t local) { - mblk_t *mp_copy, *mp_next; + mblk_t *mp_next; if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) { + mblk_t *mp_copy; + mp_copy = copymsg(mp); if (mp_copy == NULL) return; @@ -4043,16 +4116,24 @@ mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, if (mp_copy == NULL) return; } - mp_next = NULL; - } else { - mp_copy = mp; - mp_next = mp->b_next; + + /* + * There is code upstack that can't deal with message + * chains. 
*/ + for (mblk_t *tmp = mp_copy; tmp != NULL; tmp = mp_next) { + mp_next = tmp->b_next; + tmp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, tmp, loopback); + } + + return; } - mp_copy->b_next = NULL; - mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback); - if (mp_copy == mp) - mp->b_next = mp_next; + mp_next = mp->b_next; + mp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, mp, loopback); + mp->b_next = mp_next; } /* @@ -4094,7 +4175,7 @@ mac_is_mcast(mac_impl_t *mip, mblk_t *mp) */ void mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, - mac_client_impl_t *sender) + mac_client_impl_t *sender, boolean_t local) { mac_promisc_impl_t *mpip; mac_cb_t *mcb; @@ -4134,8 +4215,10 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, if (is_sender || mpip->mpi_type == MAC_CLIENT_PROMISC_ALL || - is_mcast) - mac_promisc_dispatch_one(mpip, mp, is_sender); + is_mcast) { + mac_promisc_dispatch_one(mpip, mp, is_sender, + local); + } } } MAC_PROMISC_WALKER_DCR(mip); @@ -4164,7 +4247,8 @@ mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain) mpip = (mac_promisc_impl_t *)mcb->mcb_objp; if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED && !is_mcast) { - mac_promisc_dispatch_one(mpip, mp, B_FALSE); + mac_promisc_dispatch_one(mpip, mp, B_FALSE, + B_FALSE); } } } @@ -4278,8 +4362,9 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) mac_impl_t *mip = (mac_impl_t *)mh; /* - * if mi_nactiveclients > 1, only MAC_CAPAB_LEGACY, MAC_CAPAB_HCKSUM, - * MAC_CAPAB_NO_NATIVEVLAN and MAC_CAPAB_NO_ZCOPY can be advertised. + * Some capabilities are restricted when there is more than one active + * client on the MAC resource. The ones noted below are safe, + * independent of that count. 
*/ if (mip->mi_nactiveclients > 1) { switch (cap) { @@ -4287,6 +4372,7 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) return (B_TRUE); case MAC_CAPAB_LEGACY: case MAC_CAPAB_HCKSUM: + case MAC_CAPAB_LSO: case MAC_CAPAB_NO_NATIVEVLAN: break; default: diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index e3b660c3b3..9a5f94e7d2 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -3476,7 +3476,7 @@ mac_srs_free(mac_soft_ring_set_t *mac_srs) ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE | SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE); + mac_drop_chain(mac_srs->srs_first, "SRS free"); mac_srs_ring_free(mac_srs); mac_srs_soft_rings_free(mac_srs); mac_srs_fanout_list_free(mac_srs); diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c index aa4985fe4c..62612122d6 100644 --- a/usr/src/uts/common/io/mac/mac_flow.c +++ b/usr/src/uts/common/io/mac/mac_flow.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #include <sys/strsun.h> @@ -229,7 +230,7 @@ mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_index = -1; } (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index fbeef1fd2f..ce986fd4bf 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. 
All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -116,6 +116,37 @@ mac_free(mac_register_t *mregp) } /* + * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS + * value. + */ +static uint16_t +mac_features_to_flags(mac_handle_t mh) +{ + uint16_t flags = 0; + uint32_t cap_sum = 0; + mac_capab_lso_t cap_lso; + + if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) { + if (cap_sum & HCKSUM_IPHDRCKSUM) + flags |= HCK_IPV4_HDRCKSUM; + + if (cap_sum & HCKSUM_INET_PARTIAL) + flags |= HCK_PARTIALCKSUM; + else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) + flags |= HCK_FULLCKSUM; + } + + /* + * We don't need the information stored in 'cap_lso', but we + * need to pass a non-NULL pointer to appease the driver. + */ + if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso)) + flags |= HW_LSO; + + return (flags); +} + +/* * mac_register() is how drivers register new MACs with the GLDv3 * framework. The mregp argument is allocated by drivers using the * mac_alloc() function, and can be freed using mac_free() immediately upon @@ -345,9 +376,13 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp) mip, 0, &p0, TS_RUN, minclsyspri); /* - * Initialize the capabilities + * Cache the DB_CKSUMFLAGS that this MAC supports. */ + mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip); + /* + * Initialize the capabilities + */ bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t)); bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t)); @@ -689,7 +724,7 @@ mac_trill_snoop(mac_handle_t mh, mblk_t *mp) mac_impl_t *mip = (mac_impl_t *)mh; if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp, NULL); + mac_promisc_dispatch(mip, mp, NULL, B_FALSE); } /* @@ -709,7 +744,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) * this MAC, pass them a copy if appropriate. 
*/ if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp_chain, NULL); + mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE); if (mr != NULL) { /* @@ -969,12 +1004,33 @@ mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize) } /* - * Invoked by driver as well as the framework to notify its capability change. + * The mac provider or mac framework calls this function when it wants + * to notify upstream consumers that the capabilities have changed and + * that they should modify their own internal state accordingly. + * + * We currently have no regard for the fact that a provider could + * decide to drop capabilities which would invalidate pending traffic. + * For example, if one was to disable the Tx checksum offload while + * TCP/IP traffic was being sent by mac clients relying on that + * feature, then those packets would hit the wire with missing or + * partial checksums. A proper solution involves not only providing + * notification, but also performing client quiescing. That is, a capab + * change should be treated as an atomic transaction that forms a + * barrier between traffic relying on the current capabs and traffic + * relying on the new capabs. In practice, simnet is currently the + * only provider that could hit this, and it's an easily avoidable + * situation (and at worst it should only lead to some dropped + * packets). But if we ever want better on-the-fly capab change to + * actual hardware providers, then we should give this update + * mechanism a proper implementation. */ void mac_capab_update(mac_handle_t mh) { - /* Send MAC_NOTE_CAPAB_CHG notification */ + /* + * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream + * clients to renegotiate capabilities. + */ i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG); } @@ -1277,6 +1333,19 @@ i_mac_notify_thread(void *arg) } /* + * Depending on which capabs have changed, the Tx + * checksum flags may also need to be updated.
+ */ + if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) { + mac_perim_handle_t mph; + mac_handle_t mh = (mac_handle_t)mip; + + mac_perim_enter_by_mh(mh, &mph); + mip->mi_tx_cksum_flags = mac_features_to_flags(mh); + mac_perim_exit(mph); + } + + /* * Do notification callbacks for each notification type. */ for (type = 0; type < MAC_NNOTE; type++) { @@ -1542,15 +1611,22 @@ mac_hcksum_clone(const mblk_t *src, mblk_t *dst) ASSERT3U(DB_TYPE(dst), ==, M_DATA); /* - * Do these assignments unconditionally, rather than only when flags is - * non-zero. This protects a situation where zeroed hcksum data does - * not make the jump onto an mblk_t with stale data in those fields. + * Do these assignments unconditionally, rather than only when + * flags is non-zero. This protects a situation where zeroed + * hcksum data does not make the jump onto an mblk_t with + * stale data in those fields. It's important to copy all + * possible flags (HCK_* as well as HW_*) and not just the + * checksum specific flags. Dropping flags during a clone + * could result in dropped packets. If the caller has good + * reason to drop those flags then it should do it manually, + * after the clone. 
*/ - DB_CKSUMFLAGS(dst) = (DB_CKSUMFLAGS(src) & HCK_FLAGS); + DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src); DB_CKSUMSTART(dst) = DB_CKSUMSTART(src); DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src); DB_CKSUMEND(dst) = DB_CKSUMEND(src); DB_CKSUM16(dst) = DB_CKSUM16(src); + DB_LSOMSS(dst) = DB_LSOMSS(src); } void diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index cbd5ce1e19..5b3e87dfd1 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -968,6 +968,7 @@ #include <sys/types.h> #include <sys/callb.h> +#include <sys/pattr.h> #include <sys/sdt.h> #include <sys/strsubr.h> #include <sys/strsun.h> @@ -1327,7 +1328,7 @@ int mac_srs_worker_wakeup_ticks = 0; * b_prev may be set to the fanout hint \ * hence can't use freemsg directly \ */ \ - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ + mac_drop_chain(mp_chain, "SRS Tx max queue"); \ DTRACE_PROBE1(tx_queued_hiwat, \ mac_soft_ring_set_t *, srs); \ enqueue = 0; \ @@ -1346,11 +1347,11 @@ int mac_srs_worker_wakeup_ticks = 0; if (!(srs->srs_type & SRST_TX)) \ mutex_exit(&srs->srs_bw->mac_bw_lock); -#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ - mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ +#define MAC_TX_SRS_DROP_MESSAGE(srs, chain, cookie, s) { \ + mac_drop_chain((chain), (s)); \ /* increment freed stats */ \ - mac_srs->srs_tx.st_stat.mts_sdrops++; \ - cookie = (mac_tx_cookie_t)srs; \ + (srs)->srs_tx.st_stat.mts_sdrops++; \ + (cookie) = (mac_tx_cookie_t)(srs); \ } #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ @@ -2321,7 +2322,7 @@ check_again: if (smcip->mci_mip->mi_promisc_list != NULL) { mutex_exit(lock); mac_promisc_dispatch(smcip->mci_mip, - head, NULL); + head, NULL, B_FALSE); mutex_enter(lock); } } @@ -2893,7 +2894,7 @@ again: mac_srs->srs_bw->mac_bw_sz -= sz; mac_srs->srs_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_srs->srs_bw->mac_bw_lock); - mac_pkt_drop(NULL, NULL, head, B_FALSE); + mac_drop_chain(head, "Rx no 
bandwidth"); goto leave_poll; } else { mutex_exit(&mac_srs->srs_bw->mac_bw_lock); @@ -3275,9 +3276,10 @@ mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, } /* - * mac_rx_srs_process - * - * Receive side routine called from the interrupt path. + * MAC SRS receive side routine. If the data is coming from the + * network (i.e. from a NIC) then this is called in interrupt context. + * If the data is coming from a local sender (e.g. mac_tx_send() or + * bridge_forward()) then this is not called in interrupt context. * * loopback is set to force a context switch on the loopback * path between MAC clients. @@ -3337,7 +3339,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, mac_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_bw->mac_bw_lock); mutex_exit(&mac_srs->srs_lock); - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Rx no bandwidth"); return; } else { if ((mac_bw->mac_bw_sz + sz) <= @@ -3459,7 +3461,8 @@ mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); if (flag & MAC_DROP_ON_NO_DESC) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx no desc"); } else { if (mac_srs->srs_first != NULL) wakeup_worker = B_FALSE; @@ -3522,7 +3525,8 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx SRS hiwat"); } else { MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, cnt, sz); @@ -3895,7 +3899,8 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, cookie = (mac_tx_cookie_t)mac_srs; *ret_mp = mp_chain; } else { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, 
mp_chain, cookie, + "Tx no bandwidth"); } mutex_exit(&mac_srs->srs_lock); return (cookie); @@ -4342,7 +4347,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, msgdsize(mp)); CHECK_VID_AND_ADD_TAG(mp); - MAC_TX(mip, ring, mp, src_mcip); + mp = mac_provider_tx(mip, ring, mp, src_mcip); /* * If the driver is out of descriptors and does a @@ -4373,7 +4378,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, flow_entry_t *dst_flow_ent; void *flow_cookie; size_t pkt_size; - mblk_t *mp1; next = mp->b_next; mp->b_next = NULL; @@ -4388,44 +4392,12 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, dst_flow_ent = mac_tx_classify(mip, mp); if (dst_flow_ent != NULL) { - size_t hdrsize; - int err = 0; - - if (mip->mi_info.mi_nativemedia == DL_ETHER) { - struct ether_vlan_header *evhp = - (struct ether_vlan_header *)mp->b_rptr; - - if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) - hdrsize = sizeof (*evhp); - else - hdrsize = sizeof (struct ether_header); - } else { - mac_header_info_t mhi; - - err = mac_header_info((mac_handle_t)mip, - mp, &mhi); - if (err == 0) - hdrsize = mhi.mhi_hdrsize; - } - /* * Got a matching flow. It's either another * MAC client, or a broadcast/multicast flow. - * Make sure the packet size is within the - * allowed size. If not drop the packet and - * move to next packet. */ - if (err != 0 || - (pkt_size - hdrsize) > mip->mi_sdu_max) { - oerrors++; - DTRACE_PROBE2(loopback__drop, size_t, pkt_size, - mblk_t *, mp); - freemsg(mp); - mp = next; - FLOW_REFRELE(dst_flow_ent); - continue; - } flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); + if (flow_cookie != NULL) { /* * The vnic_bcast_send function expects @@ -4443,6 +4415,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * bypass is set. 
*/ boolean_t do_switch; + mac_client_impl_t *dst_mcip = dst_flow_ent->fe_mcip; @@ -4458,19 +4431,23 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * check is done inside the MAC_TX() * macro. */ - if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp, src_mcip); + if (mip->mi_promisc_list != NULL) { + mac_promisc_dispatch(mip, mp, src_mcip, + B_TRUE); + } do_switch = ((src_mcip->mci_state_flags & dst_mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE) != 0); - if ((mp1 = mac_fix_cksum(mp)) != NULL) { + mac_hw_emul(&mp, NULL, NULL, MAC_ALL_EMULS); + if (mp != NULL) { (dst_flow_ent->fe_cb_fn)( dst_flow_ent->fe_cb_arg1, dst_flow_ent->fe_cb_arg2, - mp1, do_switch); + mp, do_switch); } + } FLOW_REFRELE(dst_flow_ent); } else { @@ -4478,7 +4455,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * Unknown destination, send via the underlying * NIC. */ - MAC_TX(mip, ring, mp, src_mcip); + mp = mac_provider_tx(mip, ring, mp, src_mcip); if (mp != NULL) { /* * Adjust for the last packet that @@ -4827,7 +4804,7 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Tx softring no desc"); /* increment freed stats */ ringp->s_ring_drops += cnt; cookie = (mac_tx_cookie_t)ringp; @@ -4871,8 +4848,8 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, * b_prev may be set to the fanout hint * hence can't use freemsg directly */ - mac_pkt_drop(NULL, NULL, - mp_chain, B_FALSE); + mac_drop_chain(mp_chain, + "Tx softring max queue"); DTRACE_PROBE1(tx_queued_hiwat, mac_soft_ring_t *, ringp); enqueue = B_FALSE; diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c index f4d2a5ee81..c8a16e6fd3 100644 --- 
a/usr/src/uts/common/io/mac/mac_soft_ring.c +++ b/usr/src/uts/common/io/mac/mac_soft_ring.c @@ -242,7 +242,7 @@ mac_soft_ring_free(mac_soft_ring_t *softring) ASSERT((softring->s_ring_state & (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) == (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE); + mac_drop_chain(softring->s_ring_first, "softring free"); softring->s_ring_tx_arg2 = NULL; mac_soft_ring_stat_delete(softring); mac_callback_free(softring->s_ring_notify_cb_list); diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index 924d018ad0..03da3a3504 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ /* @@ -48,6 +48,75 @@ #include <inet/sadb.h> #include <inet/ipsecesp.h> #include <inet/ipsecah.h> +#include <inet/tcp.h> +#include <inet/udp_impl.h> +#include <inet/sctp_ip.h> + +/* + * The next two functions are used for dropping packets or chains of + * packets, respectively. We could use one function for both but + * separating the use cases allows us to specify intent and prevent + * dropping more data than intended. + * + * The purpose of these functions is to aid the debugging effort, + * especially in production. Rather than use freemsg()/freemsgchain(), + * it's preferable to use these functions when dropping a packet in + * the MAC layer. These functions should only be used during + * unexpected conditions. That is, any time a packet is dropped + * outside of the regular, successful datapath. Consolidating all + * drops on these functions allows the user to trace one location and + * determine why the packet was dropped based on the msg. It also + * allows the user to inspect the packet before it is freed. 
Finally, + * it allows the user to avoid tracing freemsg()/freemsgchain() thus + * keeping the hot path running as efficiently as possible. + * + * NOTE: At this time not all MAC drops are aggregated on these + * functions; but that is the plan. This comment should be erased once + * completed. + */ + +/*PRINTFLIKE2*/ +void +mac_drop_pkt(mblk_t *mp, const char *fmt, ...) +{ + va_list adx; + char msg[128]; + char *msgp = msg; + + ASSERT3P(mp->b_next, ==, NULL); + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); +} + +/*PRINTFLIKE2*/ +void +mac_drop_chain(mblk_t *chain, const char *fmt, ...) +{ + va_list adx; + char msg[128]; + char *msgp = msg; + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + /* + * We could use freemsgchain() for the actual freeing but + * since we are already walking the chain to fire the dtrace + * probe we might as well free the msg here too. + */ + for (mblk_t *mp = chain, *next; mp != NULL; ) { + next = mp->b_next; + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); + mp = next; + } +} /* * Copy an mblk, preserving its hardware checksum flags. @@ -89,222 +158,1272 @@ mac_copymsgchain_cksum(mblk_t *mp) } /* - * Process the specified mblk chain for proper handling of hardware - * checksum offload. This routine is invoked for loopback traffic - * between MAC clients. - * The function handles a NULL mblk chain passed as argument. + * Calculate the ULP checksum for IPv4. Return true if the calculation + * was successful, or false if an error occurred. If the latter, place + * an error message into '*err'.
*/ -mblk_t * -mac_fix_cksum(mblk_t *mp_chain) +static boolean_t +mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha, + const char **err) { - mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; + const uint8_t proto = ipha->ipha_protocol; + size_t len; + const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha); + /* ULP offset from start of L2. */ + const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz; + ipaddr_t src, dst; + uint32_t cksum; + uint16_t *up; + + /* + * We need a pointer to the ULP checksum. We're assuming the + * ULP checksum pointer resides in the first mblk. Our native + * TCP stack should always put the headers in the first mblk, + * but currently we have no way to guarantee that other + * clients don't spread headers (or even header fields) across + * mblks. + */ + switch (proto) { + case IPPROTO_TCP: + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) { + *err = "mblk doesn't contain TCP header"; + goto bail; + } + + up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz); + cksum = IP_TCP_CSUM_COMP; + break; + + case IPPROTO_UDP: + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) { + *err = "mblk doesn't contain UDP header"; + goto bail; + } + + up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz); + cksum = IP_UDP_CSUM_COMP; + break; + + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) { + *err = "mblk doesn't contain SCTP header"; + goto bail; + } + + sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset); + sctph->sh_chksum = 0; + sctph->sh_chksum = sctp_cksum(mp, ulp_offset); + return (B_TRUE); + } + + default: + *err = "unexpected protocol"; + goto bail; + + } + + /* Pseudo-header checksum. 
*/ + src = ipha->ipha_src; + dst = ipha->ipha_dst; + len = ntohs(ipha->ipha_length) - ip_hdr_sz; + + cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); + cksum += htons(len); + + /* + * We have already accounted for the pseudo checksum above. + * Make sure the ULP checksum field is zero before computing + * the rest. + */ + *up = 0; + cksum = IP_CSUM(mp, ulp_offset, cksum); + *up = (uint16_t)(cksum ? cksum : ~cksum); + + return (B_TRUE); + +bail: + return (B_FALSE); +} + +/* + * Calculate the ULP checksum for IPv6. Return true if the calculation + * was successful, or false if an error occurred. If the latter, place + * an error message into '*err'. + */ +static boolean_t +mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err) +{ + ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset); + const uint8_t proto = ip6h->ip6_nxt; + const uint16_t *iphs = (uint16_t *)ip6h; + /* ULP offset from start of L2. */ + uint32_t ulp_offset; + size_t len; + uint32_t cksum; + uint16_t *up; + uint16_t ip_hdr_sz; + + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) { + *err = "malformed IPv6 header"; + goto bail; + } + + ulp_offset = ip_hdr_offset + ip_hdr_sz; + + /* + * We need a pointer to the ULP checksum. We're assuming the + * ULP checksum pointer resides in the first mblk. Our native + * TCP stack should always put the headers in the first mblk, + * but currently we have no way to guarantee that other + * clients don't spread headers (or even header fields) across + * mblks.
+ */ + switch (proto) { + case IPPROTO_TCP: + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) { + *err = "mblk doesn't contain TCP header"; + goto bail; + } + + up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz); + cksum = IP_TCP_CSUM_COMP; + break; + + case IPPROTO_UDP: + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) { + *err = "mblk doesn't contain UDP header"; + goto bail; + } + + up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz); + cksum = IP_UDP_CSUM_COMP; + break; + + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + + ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t))); + if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) { + *err = "mblk doesn't contain SCTP header"; + goto bail; + } + + sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. + */ + sctph->sh_chksum = 0; + sctph->sh_chksum = sctp_cksum(mp, ulp_offset); + return (B_TRUE); + } + + default: + *err = "unexpected protocol"; + goto bail; + } + + /* + * The payload length includes the payload and the IPv6 + * extension headers; the idea is to subtract the extension + * header length to get the real payload length. + */ + len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN); + cksum += len; + + /* + * We accumulate the pseudo header checksum in cksum; then we + * call IP_CSUM to compute the checksum over the payload. + */ + cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] + + iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] + + iphs[16] + iphs[17] + iphs[18] + iphs[19]; + cksum = IP_CSUM(mp, ulp_offset, cksum); + + /* For UDP/IPv6 a zero UDP checksum is not allowed. 
Change to 0xffff */ + if (proto == IPPROTO_UDP && cksum == 0) + cksum = ~cksum; + + *up = (uint16_t)cksum; + + return (B_TRUE); + +bail: + return (B_FALSE); +} + +/* + * Perform software checksum on a single message, if needed. The + * emulation performed is determined by an intersection of the mblk's + * flags and the emul flags requested. The emul flags are documented + * in mac.h. + */ +static mblk_t * +mac_sw_cksum(mblk_t *mp, mac_emul_t emul) +{ + mblk_t *skipped_hdr = NULL; uint32_t flags, start, stuff, end, value; + uint32_t ip_hdr_offset; + uint16_t etype; + size_t ip_hdr_sz; + struct ether_header *ehp; + const char *err = ""; - for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { - uint16_t len; - uint32_t offset; - struct ether_header *ehp; - uint16_t sap; + /* + * This function should only be called from mac_hw_emul() + * which handles mblk chains and the shared ref case. + */ + ASSERT3P(mp->b_next, ==, NULL); - mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags); - if (flags == 0) - continue; + mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL); + + flags = DB_CKSUMFLAGS(mp); + + /* Why call this if checksum emulation isn't needed? */ + ASSERT3U(flags & (HCK_FLAGS), !=, 0); + + /* + * Ethernet, and optionally VLAN header. mac_hw_emul() has + * already verified we have enough data to read the L2 header. + */ + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) { + struct ether_vlan_header *evhp; + + evhp = (struct ether_vlan_header *)mp->b_rptr; + etype = ntohs(evhp->ether_type); + ip_hdr_offset = sizeof (struct ether_vlan_header); + } else { + etype = ntohs(ehp->ether_type); + ip_hdr_offset = sizeof (struct ether_header); + } + + /* + * If this packet isn't IP, then leave it alone. We don't want + * to affect non-IP traffic like ARP. Assume the IP header + * doesn't include any options, for now. 
We will use the + * correct size later after we know there are enough bytes to + * at least fill out the basic header. + */ + switch (etype) { + case ETHERTYPE_IP: + ip_hdr_sz = sizeof (ipha_t); + break; + case ETHERTYPE_IPV6: + ip_hdr_sz = sizeof (ip6_t); + break; + default: + return (mp); + } + + ASSERT3U(MBLKL(mp), >=, ip_hdr_offset); + + /* + * If the first mblk of this packet contains only the ethernet + * header, skip past it for now. Packets with their data + * contained in only a single mblk can then use the fastpaths + * tuned to that possibility. + */ + if (MBLKL(mp) == ip_hdr_offset) { + ip_hdr_offset -= MBLKL(mp); + /* This is guaranteed by mac_hw_emul(). */ + ASSERT3P(mp->b_cont, !=, NULL); + skipped_hdr = mp; + mp = mp->b_cont; + } + + /* + * Both full and partial checksum rely on finding the IP + * header in the current mblk. Our native TCP stack honors + * this assumption but it's prudent to guard our future + * clients that might not honor this contract. + */ + ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz); + if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) { + err = "mblk doesn't contain IP header"; + goto bail; + } + + /* + * We are about to modify the header mblk; make sure we are + * modifying our own copy. The code that follows assumes that + * the IP/ULP headers exist in this mblk (and drops the + * message if they don't). + */ + if (DB_REF(mp) > 1) { + mblk_t *tmp = copyb(mp); + + if (tmp == NULL) { + err = "copyb failed"; + goto bail; + } + + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + skipped_hdr->b_cont = tmp; + } + + tmp->b_cont = mp->b_cont; + freeb(mp); + mp = tmp; + } + + if (etype == ETHERTYPE_IP) { + ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset); + + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err)) + goto bail; + } + + /* We always update the ULP checksum flags. 
*/ + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_FULLCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } /* - * Since the processing of checksum offload for loopback - * traffic requires modification of the packet contents, - * ensure sure that we are always modifying our own copy. + * While unlikely, it's possible to write code that + * might end up calling mac_sw_cksum() twice on the + * same mblk (performing both LSO and checksum + * emulation in a single mblk chain loop -- the LSO + * emulation inserts a new chain into the existing + * chain and then the loop iterates back over the new + * segments and emulates the checksum a second time). + * Normally this wouldn't be a problem, because the + * HCK_*_OK flags are supposed to indicate that we + * don't need to perform the work. But + * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the + * same value; so we cannot use these flags to + * determine if the IP header checksum has already + * been calculated or not. For this reason, we zero + * out the checksum first. In the future, we + * should fix the HCK_* flags. */ - if (DB_REF(mp) > 1) { - mp1 = copymsg(mp); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; + if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); + flags &= ~HCK_IPV4_HDRCKSUM; + flags |= HCK_IPV4_HDRCKSUM_OK; + } + } else if (etype == ETHERTYPE_IPV6) { + /* There is no IP header checksum for IPv6. */ + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err)) + goto bail; + flags &= ~HCK_FULLCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; } + } + + /* + * Partial checksum is the same for both IPv4 and IPv6.
+ */ + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + uint16_t *up, partial, cksum; + uchar_t *ipp; /* ptr to beginning of IP header */ + + ipp = mp->b_rptr + ip_hdr_offset; + up = (uint16_t *)((uchar_t *)ipp + stuff); + partial = *up; + *up = 0; + + ASSERT3S(end, >, start); + cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial); + *up = cksum != 0 ? cksum : ~cksum; + } + + /* We always update the ULP checksum flags. */ + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_PARTIALCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } + + mac_hcksum_set(mp, start, stuff, end, value, flags); + + /* Don't forget to reattach the header. */ + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); /* - * Ethernet, and optionally VLAN header. + * Duplicate the HCKSUM data into the header mblk. + * This mimics mac_add_vlan_tag which ensures that + * both the first mblk _and_ the first data bearing + * mblk possess the HCKSUM information. Consumers like + * IP will end up discarding the ether_header mblk, so + * for now, it is important that the data be available + * in both places. */ - /* LINTED: improper alignment cast */ - ehp = (struct ether_header *)mp->b_rptr; - if (ntohs(ehp->ether_type) == VLAN_TPID) { - struct ether_vlan_header *evhp; + mac_hcksum_clone(mp, skipped_hdr); + mp = skipped_hdr; + } - ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); - /* LINTED: improper alignment cast */ - evhp = (struct ether_vlan_header *)mp->b_rptr; - sap = ntohs(evhp->ether_type); - offset = sizeof (struct ether_vlan_header); + return (mp); + +bail: + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + mp = skipped_hdr; + } + + mac_drop_pkt(mp, err); + return (NULL); +} + +/* + * Build a single data segment from an LSO packet. The mblk chain + * returned, seg_head, represents the data segment and is always + * exactly seg_len bytes long. 
The lso_mp and offset input/output + * parameters track our position in the LSO packet. This function + * exists solely as a helper to mac_sw_lso(). + * + * Case A + * + * The current lso_mp is larger than the requested seg_len. The + * beginning of seg_head may start at the beginning of lso_mp or + * offset into it. In either case, a single mblk is returned, and + * *offset is updated to reflect our new position in the current + * lso_mp. + * + * +----------------------------+ + * | in *lso_mp / out *lso_mp | + * +----------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = 0 out *offset = seg_len + * + * |------ seg_len ----| + * + * + * +------------------------------+ + * | in *lso_mp / out *lso_mp | + * +------------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = N out *offset = N + seg_len + * + * |------ seg_len ----| + * + * + * + * Case B + * + * The requested seg_len consumes exactly the rest of the lso_mp. + * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr. + * The seg_head may start at the beginning of the lso_mp or at some + * offset into it. In either case we return a single mblk, reset + * *offset to zero, and walk to the next lso_mp. 
+ * + * +------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = 0 + * + * |------ seg_len ----| + * + * + * + * +----------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +----------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = N + * + * |------ seg_len ----| + * + * + * Case C + * + * The requested seg_len is greater than the current lso_mp. In + * this case we must consume LSO mblks until we have enough data to + * satisfy either case (A) or (B) above. We will return multiple + * mblks linked via b_cont, offset will be set based on the cases + * above, and lso_mp will walk forward at least one mblk, but maybe + * more. + * + * N.B. This digram is not exhaustive. The seg_head may start on + * the beginning of an lso_mp. The seg_tail may end exactly on the + * boundary of an lso_mp. And there may be two (in this case the + * middle block wouldn't exist), three, or more mblks in the + * seg_head chain. This is meant as one example of what might + * happen. The main thing to remember is that the seg_tail mblk + * must be one of case (A) or (B) above. 
+ * + * +------------------+ +----------------+ +------------------+ + * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp | + * +------------------+ +----------------+ +------------------+ + * ^ ^ ^ ^ ^ ^ + * | | | | | | + * | | | | | | + * | | | | | | + * | | | | | | + * +------------+ +----------------+ +------------+ + * | seg_head |--->| |--->| seg_tail | + * +------------+ +----------------+ +------------+ + * ^ ^ + * | | + * in *offset = N out *offset = MBLKL(seg_tail) + * + * |------------------- seg_len -------------------| + * + */ +static mblk_t * +build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len) +{ + mblk_t *seg_head, *seg_tail, *seg_mp; + + ASSERT3P(*lso_mp, !=, NULL); + ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr); + + seg_mp = dupb(*lso_mp); + if (seg_mp == NULL) + return (NULL); + + seg_head = seg_mp; + seg_tail = seg_mp; + + /* Continue where we left off from in the lso_mp. */ + seg_mp->b_rptr += *offset; + +last_mblk: + /* Case (A) */ + if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) { + *offset += seg_len; + seg_mp->b_wptr = seg_mp->b_rptr + seg_len; + return (seg_head); + } + + /* Case (B) */ + if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) { + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + return (seg_head); + } + + /* Case (C) */ + ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr); + + /* + * The current LSO mblk doesn't have enough data to satisfy + * seg_len -- continue peeling off LSO mblks to build the new + * segment message. If allocation fails we free the previously + * allocated segment mblks and return NULL. 
+ */ + while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) { + ASSERT3U(MBLKL(seg_mp), <=, seg_len); + seg_len -= MBLKL(seg_mp); + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + seg_mp = dupb(*lso_mp); + + if (seg_mp == NULL) { + freemsgchain(seg_head); + return (NULL); + } + + seg_tail->b_cont = seg_mp; + seg_tail = seg_mp; + } + + /* + * We've walked enough LSO mblks that we can now satisfy the + * remaining seg_len. At this point we need to jump back to + * determine if we have arrived at case (A) or (B). + */ + + /* Just to be paranoid that we didn't underflow. */ + ASSERT3U(seg_len, <, IP_MAXPACKET); + ASSERT3U(seg_len, >, 0); + goto last_mblk; +} + +/* + * Perform software segmentation of a single LSO message. Take an LSO + * message as input and return head/tail pointers as output. This + * function should not be invoked directly but instead through + * mac_hw_emul(). + * + * The resulting chain is comprised of multiple (nsegs) MSS sized + * segments. Each segment will consist of two or more mblks joined by + * b_cont: a header and one or more data mblks. The header mblk is + * allocated anew for each message. The first segment's header is used + * as a template for the rest with adjustments made for things such as + * ID, sequence, length, TCP flags, etc. The data mblks reference into + * the existing LSO mblk (passed in as omp) by way of dupb(). Their + * b_rptr/b_wptr values are adjusted to reference only the fraction of + * the LSO message they are responsible for. At the successful + * completion of this function the original mblk (omp) is freed, + * leaving the newely created segment chain as the only remaining + * reference to the data. 
+ */ +static void +mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, + uint_t *count) +{ + uint32_t ocsum_flags, ocsum_start, ocsum_stuff; + uint32_t mss; + uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; + uint32_t oleft; + uint_t nsegs, seg; + int len; + + struct ether_vlan_header *oevh; + const ipha_t *oiph; + const tcph_t *otcph; + ipha_t *niph; + tcph_t *ntcph; + uint16_t ip_id; + uint32_t tcp_seq, tcp_sum, otcp_sum; + + uint32_t offset; + mblk_t *odatamp; + mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; + mblk_t *tmptail; + + ASSERT3P(head, !=, NULL); + ASSERT3P(tail, !=, NULL); + ASSERT3P(count, !=, NULL); + ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); + + /* Assume we are dealing with a single LSO message. */ + ASSERT3P(omp->b_next, ==, NULL); + + /* + * XXX: This is a hack to deal with mac_add_vlan_tag(). + * + * When VLANs are in play, mac_add_vlan_tag() creates a new + * mblk with just the ether_vlan_header and tacks it onto the + * front of 'omp'. This breaks the assumptions made below; + * namely that the TCP/IP headers are in the first mblk. In + * this case, since we already have to pay the cost of LSO + * emulation, we simply pull up everything. While this might + * seem irksome, keep in mind this will only apply in a couple + * of scenarios: a) an LSO-capable VLAN client sending to a + * non-LSO-capable client over the "MAC/bridge loopback" + * datapath or b) an LSO-capable VLAN client is sending to a + * client that, for whatever reason, doesn't have DLS-bypass + * enabled. Finally, we have to check for both a tagged and + * untagged sized mblk depending on if the mblk came via + * mac_promisc_dispatch() or mac_rx_deliver(). + * + * In the future, two things should be done: + * + * 1. This function should make use of some yet to be + * implemented "mblk helpers". These helper functions would + * perform all the b_cont walking for us and guarantee safe + * access to the mblk data. 
+ * + * 2. We should add some slop to the mblks so that + * mac_add_vlan_tag() can just edit the first mblk instead + * of allocating on the hot path. + */ + if (MBLKL(omp) == sizeof (struct ether_vlan_header) || + MBLKL(omp) == sizeof (struct ether_header)) { + mblk_t *tmp = msgpullup(omp, -1); + + if (tmp == NULL) { + mac_drop_pkt(omp, "failed to pull up"); + goto fail; + } + + mac_hcksum_clone(omp, tmp); + freemsg(omp); + omp = tmp; + } + + mss = DB_LSOMSS(omp); + ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + + sizeof (struct ether_vlan_header)); + opktlen = msgsize(omp); + + /* + * First, get references to the IP and TCP headers and + * determine the total TCP length (header + data). + * + * Thanks to mac_hw_emul() we know that the first mblk must + * contain (at minimum) the full L2 header. However, this + * function assumes more than that. It assumes the L2/L3/L4 + * headers are all contained in the first mblk of a message + * (i.e., no b_cont walking for headers). While this is a + * current reality (our native TCP stack and viona both + * enforce this) things may become more nuanced in the future + * (e.g. when introducing encap support or adding new + * clients). For now we guard against this case by dropping + * the packet. + */ + oevh = (struct ether_vlan_header *)omp->b_rptr; + if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) + oehlen = sizeof (struct ether_vlan_header); + else + oehlen = sizeof (struct ether_header); + + ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); + if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { + mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); + goto fail; + } + + oiph = (ipha_t *)(omp->b_rptr + oehlen); + oiphlen = IPH_HDR_LENGTH(oiph); + otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); + otcphlen = TCP_HDR_LENGTH(otcph); + + /* + * Currently we only support LSO for TCP/IPv4. 
+ */ + if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { + mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", + IPH_HDR_VERSION(oiph)); + goto fail; + } + + if (oiph->ipha_protocol != IPPROTO_TCP) { + mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", + oiph->ipha_protocol); + goto fail; + } + + if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { + mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); + goto fail; + } + + ohdrslen = oehlen + oiphlen + otcphlen; + if ((len = MBLKL(omp)) < ohdrslen) { + mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, + ohdrslen); + goto fail; + } + + /* + * Either we have data in the first mblk or it's just the + * header. In either case, we need to set rptr to the start of + * the TCP data. + */ + if (len > ohdrslen) { + odatamp = omp; + offset = ohdrslen; + } else { + ASSERT3U(len, ==, ohdrslen); + odatamp = omp->b_cont; + offset = 0; + } + + /* Make sure we still have enough data. */ + ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); + + /* + * If a MAC negotiated LSO then it must negotioate both + * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or + * HCKSUM_INET_PARTIAL; because both the IP and TCP headers + * change during LSO segmentation (only the 3 fields of the + * pseudo header checksum don't change: src, dst, proto). Thus + * we would expect these flags (HCK_IPV4_HDRCKSUM | + * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this + * function to emulate those checksums in software. However, + * that assumes a world where we only expose LSO if the + * underlying hardware exposes LSO. Moving forward the plan is + * to assume LSO in the upper layers and have MAC perform + * software LSO when the underlying provider doesn't support + * it. In such a world, if the provider doesn't support LSO + * but does support hardware checksum offload, then we could + * simply perform the segmentation and allow the hardware to + * calculate the checksums. To the hardware it's just another + * chain of non-LSO packets. 
+ */ + ASSERT3S(DB_TYPE(omp), ==, M_DATA); + ocsum_flags = DB_CKSUMFLAGS(omp); + ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); + ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); + + /* + * If hardware only provides partial checksum then software + * must supply the pseudo-header checksum. In the case of LSO + * we leave the TCP length at zero to be filled in by + * hardware. This function must handle two scenarios. + * + * 1. Being called by a MAC client on the Rx path to segment + * an LSO packet and calculate the checksum. + * + * 2. Being called by a MAC provider to segment an LSO packet. + * In this case the LSO segmentation is performed in + * software (by this routine) but the MAC provider should + * still calculate the TCP/IP checksums in hardware. + * + * To elaborate on the second case: we cannot have the + * scenario where IP sends LSO packets but the underlying HW + * doesn't support checksum offload -- because in that case + * TCP/IP would calculate the checksum in software (for the + * LSO packet) but then MAC would segment the packet and have + * to redo all the checksum work. So IP should never do LSO + * if HW doesn't support both IP and TCP checksum. + */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + ocsum_start = (uint32_t)DB_CKSUMSTART(omp); + ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); + } + + odatalen = opktlen - ohdrslen; + + /* + * Subtract one to account for the case where the data length + * is evenly divisble by the MSS. Add one to account for the + * fact that the division will always result in one less + * segment than needed. 
+ */ + nsegs = ((odatalen - 1) / mss) + 1; + if (nsegs < 2) { + mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); + goto fail; + } + + DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, + __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, + nsegs); + + seg_chain = NULL; + tmptail = seg_chain; + oleft = odatalen; + + for (uint_t i = 0; i < nsegs; i++) { + boolean_t last_seg = ((i + 1) == nsegs); + uint32_t seg_len; + + /* + * If we fail to allocate, then drop the partially + * allocated chain as well as the LSO packet. Let the + * sender deal with the fallout. + */ + if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "failed to alloc segment header"); + goto fail; + } + ASSERT3P(nhdrmp->b_cont, ==, NULL); + + if (seg_chain == NULL) { + seg_chain = nhdrmp; } else { - sap = ntohs(ehp->ether_type); - offset = sizeof (struct ether_header); + ASSERT3P(tmptail, !=, NULL); + tmptail->b_next = nhdrmp; } - if (MBLKL(mp) <= offset) { - offset -= MBLKL(mp); - if (mp->b_cont == NULL) { - /* corrupted packet, skip it */ - if (prev != NULL) - prev->b_next = mp->b_next; - else - new_chain = mp->b_next; - mp1 = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - mp = mp1; - continue; - } - mp = mp->b_cont; + tmptail = nhdrmp; + + /* + * Calculate this segment's lengh. It's either the MSS + * or whatever remains for the last segment. + */ + seg_len = last_seg ? oleft : mss; + ASSERT3U(seg_len, <=, mss); + ndatamp = build_data_seg(&odatamp, &offset, seg_len); + + if (ndatamp == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "LSO failed to segment data"); + goto fail; } - if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { - ipha_t *ipha = NULL; + /* Attach data mblk to header mblk. */ + nhdrmp->b_cont = ndatamp; + DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; + ASSERT3U(seg_len, <=, oleft); + oleft -= seg_len; + } + + /* We should have consumed entire LSO msg. 
*/ + ASSERT3S(oleft, ==, 0); + ASSERT3P(odatamp, ==, NULL); - /* - * In order to compute the full and header - * checksums, we need to find and parse - * the IP and/or ULP headers. - */ + /* + * All seg data mblks are referenced by the header mblks, null + * out this pointer to catch any bad derefs. + */ + ndatamp = NULL; + + /* + * Set headers and checksum for first segment. + */ + nhdrmp = seg_chain; + bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ip_id = ntohs(niph->ipha_ident); + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + tcp_seq = BE32_TO_U32(ntcph->th_seq); + tcp_seq += mss; + + /* + * The first segment shouldn't: + * + * o indicate end of data transmission (FIN), + * o indicate immediate handling of the data (PUSH). + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + + /* + * If the underlying HW provides partial checksum, then make + * sure to correct the pseudo header checksum before calling + * mac_sw_cksum(). The native TCP stack doesn't include the + * length field in the pseudo header when LSO is in play -- so + * we need to calculate it here. 
+ */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + tcp_sum = BE16_TO_U16(ntcph->th_sum); + otcp_sum = tcp_sum; + tcp_sum += mss + otcphlen; + tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); + U16_TO_BE16(tcp_sum, ntcph->th_sum); + } + + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; + + /* + * We may have freed the nhdrmp argument during + * checksum emulation, make sure that seg_chain + * references a valid mblk. + */ + seg_chain = nhdrmp; + } + + ASSERT3P(nhdrmp, !=, NULL); - sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; + seg = 1; + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, + uint_t, seg); + seg++; + /* There better be at least 2 segs. */ + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + + /* + * Now adjust the headers of the middle segments. For each + * header we need to adjust the following. + * + * o IP ID + * o IP length + * o TCP sequence + * o TCP flags + * o cksum flags + * o cksum values (if MAC_HWCKSUM_EMUL is set) + */ + for (; seg < nsegs; seg++) { + /* + * We use seg_chain as a reference to the first seg + * header mblk -- this first header is a template for + * the rest of the segments. This copy will include + * the now updated checksum values from the first + * header. We must reset these checksum values to + * their original to make sure we produce the correct + * value. 
+ */ + bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + niph->ipha_ident = htons(++ip_id); + ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + U32_TO_BE32(tcp_seq, ntcph->th_seq); + tcp_seq += mss; + /* + * Just like the first segment, the middle segments + * shouldn't have these flags set. + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + + if (ocsum_flags & HCK_PARTIALCKSUM) { /* - * IP header. + * First and middle segs have same + * pseudo-header checksum. */ - if (sap != ETHERTYPE_IP) - continue; + U16_TO_BE16(tcp_sum, ntcph->th_sum); + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + } - ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); - /* LINTED: improper alignment cast */ - ipha = (ipha_t *)(mp->b_rptr + offset); - - if (flags & HCK_FULLCKSUM) { - ipaddr_t src, dst; - uint32_t cksum; - uint16_t *up; - uint8_t proto; - - /* - * Pointer to checksum field in ULP header. - */ - proto = ipha->ipha_protocol; - ASSERT(ipha->ipha_version_and_hdr_length == - IP_SIMPLE_HDR_VERSION); - - switch (proto) { - case IPPROTO_TCP: - /* LINTED: improper alignment cast */ - up = IPH_TCPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - case IPPROTO_UDP: - /* LINTED: improper alignment cast */ - up = IPH_UDPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - default: - cmn_err(CE_WARN, "mac_fix_cksum: " - "unexpected protocol: %d", proto); - continue; - } - - /* - * Pseudo-header checksum. 
- */ - src = ipha->ipha_src; - dst = ipha->ipha_dst; - len = ntohs(ipha->ipha_length) - - IP_SIMPLE_HDR_LENGTH; - - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - cksum += htons(len); - - /* - * The checksum value stored in the packet needs - * to be correct. Compute it here. - */ - *up = 0; - cksum += (((proto) == IPPROTO_UDP) ? - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); - cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + - offset, cksum); - *(up) = (uint16_t)(cksum ? cksum : ~cksum); - - /* - * Flag the packet so that it appears - * that the checksum has already been - * verified by the hardware. - */ - flags &= ~HCK_FULLCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; - } + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; + /* We may have freed the original nhdrmp. */ + prev_nhdrmp->b_next = nhdrmp; + } - if (flags & HCK_IPV4_HDRCKSUM) { - ASSERT(ipha != NULL); - ipha->ipha_hdr_checksum = - (uint16_t)ip_csum_hdr(ipha); - flags &= ~HCK_IPV4_HDRCKSUM; - flags |= HCK_IPV4_HDRCKSUM_OK; + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), + uint_t, mss, uint_t, seg); - } + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + } + + /* Make sure we are on the last segment. */ + ASSERT3U(seg, ==, nsegs); + ASSERT3P(nhdrmp->b_next, ==, NULL); + + /* + * Now we set the last segment header. The difference being + * that FIN/PSH/RST flags are allowed. 
+ */ + bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + niph->ipha_ident = htons(++ip_id); + len = msgsize(nhdrmp->b_cont); + ASSERT3S(len, >, 0); + niph->ipha_length = htons(oiphlen + otcphlen + len); + niph->ipha_hdr_checksum = 0; + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + U32_TO_BE32(tcp_seq, ntcph->th_seq); + + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + if (ocsum_flags & HCK_PARTIALCKSUM) { + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + tcp_sum = otcp_sum; + tcp_sum += len + otcphlen; + tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); + U16_TO_BE16(tcp_sum, ntcph->th_sum); + } + + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + /* This should be the last mblk. */ + ASSERT3P(nhdrmp->b_next, ==, NULL); + nhdrmp = mac_sw_cksum(nhdrmp, emul); + prev_nhdrmp->b_next = nhdrmp; + } + + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len, + uint_t, seg); + + /* + * Free the reference to the original LSO message as it is + * being replaced by seg_cahin. + */ + freemsg(omp); + *head = seg_chain; + *tail = nhdrmp; + *count = nsegs; + return; + +fail: + *head = NULL; + *tail = NULL; + *count = 0; +} + +#define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM) + +/* + * Emulate various hardware offload features in software. Take a chain + * of packets as input and emulate the hardware features specified in + * 'emul'. The resulting chain's head pointer replaces the 'mp_chain' + * pointer given as input, and its tail pointer is written to + * '*otail'. The number of packets in the new chain is written to + * '*ocount'. 
The 'otail' and 'ocount' arguments are optional and thus + * may be NULL. The 'mp_chain' argument may point to a NULL chain; in + * which case 'mp_chain' will simply stay a NULL chain. + * + * While unlikely, it is technically possible that this function could + * receive a non-NULL chain as input and return a NULL chain as output + * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be + * zero). This could happen if all the packets in the chain are + * dropped or if we fail to allocate new mblks. In this case, there is + * nothing for the caller to free. In any event, the caller shouldn't + * assume that '*mp_chain' is non-NULL on return. + * + * This function was written with three main use cases in mind. + * + * 1. To emulate hardware offloads when traveling mac-loopback (two + * clients on the same mac). This is wired up in mac_tx_send(). + * + * 2. To provide hardware offloads to the client when the underlying + * provider cannot. This is currently wired up in mac_tx() but we + * still only negotiate offloads when the underlying provider + * supports them. + * + * 3. To emulate real hardware in simnet. + */ +void +mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) +{ + mblk_t *head = NULL, *tail = NULL; + uint_t count = 0; + + ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); + ASSERT3P(mp_chain, !=, NULL); + + for (mblk_t *mp = *mp_chain; mp != NULL; ) { + mblk_t *tmp, *next, *tmphead, *tmptail; + struct ether_header *ehp; + uint32_t flags; + uint_t len = MBLKL(mp), l2len; + + /* Perform LSO/cksum one message at a time. */ + next = mp->b_next; + mp->b_next = NULL; + + /* + * For our sanity the first mblk should contain at + * least the full L2 header. 
+ */ + if (len < sizeof (struct ether_header)) { + mac_drop_pkt(mp, "packet too short (A): %u", len); + mp = next; + continue; } - if (flags & HCK_PARTIALCKSUM) { - uint16_t *up, partial, cksum; - uchar_t *ipp; /* ptr to beginning of IP header */ - - if (mp->b_cont != NULL) { - mblk_t *mp1; - - mp1 = msgpullup(mp, offset + end); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; - } + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) + l2len = sizeof (struct ether_vlan_header); + else + l2len = sizeof (struct ether_header); - ipp = mp->b_rptr + offset; - /* LINTED: cast may result in improper alignment */ - up = (uint16_t *)((uchar_t *)ipp + stuff); - partial = *up; - *up = 0; + /* + * If the first mblk is solely the L2 header, then + * there better be more data. + */ + if (len < l2len || (len == l2len && mp->b_cont == NULL)) { + mac_drop_pkt(mp, "packet too short (C): %u", len); + mp = next; + continue; + } + + DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); + + /* + * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) + * because we don't want to mask-out the LSO flag. + */ + flags = DB_CKSUMFLAGS(mp); - cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, - end - start, partial); - cksum = ~cksum; - *up = cksum ? cksum : ~cksum; + if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { + uint_t tmpcount = 0; /* - * Since we already computed the whole checksum, - * indicate to the stack that it has already - * been verified by the hardware. + * LSO fix-up handles checksum emulation + * inline (if requested). It also frees mp. */ - flags &= ~HCK_PARTIALCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; + mac_sw_lso(mp, emul, &tmphead, &tmptail, + &tmpcount); + if (tmphead == NULL) { + /* mac_sw_lso() freed the mp. 
*/ + mp = next; + continue; + } + count += tmpcount; + } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { + tmp = mac_sw_cksum(mp, emul); + if (tmp == NULL) { + /* mac_sw_cksum() freed the mp. */ + mp = next; + continue; + } + tmphead = tmp; + tmptail = tmp; + count++; + } else { + /* There is nothing to emulate. */ + tmp = mp; + tmphead = tmp; + tmptail = tmp; + count++; + } + + /* + * The tmp mblk chain is either the start of the new + * chain or added to the tail of the new chain. + */ + if (head == NULL) { + head = tmphead; + tail = tmptail; + } else { + /* Attach the new mblk to the end of the new chain. */ + tail->b_next = tmphead; + tail = tmptail; } - mac_hcksum_set(mp, start, stuff, end, value, flags); + mp = next; } - return (new_chain); + *mp_chain = head; + + if (otail != NULL) + *otail = tail; + + if (ocount != NULL) + *ocount = count; } /* @@ -449,17 +1568,10 @@ mac_strip_vlan_tag_chain(mblk_t *mp_chain) */ /* ARGSUSED */ void -mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp, +mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain, boolean_t loopback) { - mblk_t *mp1 = mp; - - while (mp1 != NULL) { - mp1->b_prev = NULL; - mp1->b_queue = NULL; - mp1 = mp1->b_next; - } - freemsgchain(mp); + freemsgchain(mp_chain); } /* diff --git a/usr/src/uts/common/io/simnet/simnet.c b/usr/src/uts/common/io/simnet/simnet.c index 727fbbad8e..b215f6e94b 100644 --- a/usr/src/uts/common/io/simnet/simnet.c +++ b/usr/src/uts/common/io/simnet/simnet.c @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. 
*/ /* @@ -51,6 +53,7 @@ #include <sys/atomic.h> #include <sys/mac_wifi.h> #include <sys/mac_impl.h> +#include <sys/pattr.h> #include <inet/wifi_ioctl.h> #include <sys/thread.h> #include <sys/synch.h> @@ -107,14 +110,15 @@ static int simnet_m_stat(void *, uint_t, uint64_t *); static void simnet_m_ioctl(void *, queue_t *, mblk_t *); static mblk_t *simnet_m_tx(void *, mblk_t *); static int simnet_m_setprop(void *, const char *, mac_prop_id_t, - uint_t, const void *); + const uint_t, const void *); static int simnet_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); static void simnet_m_propinfo(void *, const char *, mac_prop_id_t, mac_prop_info_handle_t); +static boolean_t simnet_m_getcapab(void *, mac_capab_t, void *); static mac_callbacks_t simnet_m_callbacks = { - (MC_IOCTL | MC_SETPROP | MC_GETPROP | MC_PROPINFO), + (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO), simnet_m_stat, simnet_m_start, simnet_m_stop, @@ -124,7 +128,7 @@ static mac_callbacks_t simnet_m_callbacks = { simnet_m_tx, NULL, simnet_m_ioctl, - NULL, + simnet_m_getcapab, NULL, NULL, simnet_m_setprop, @@ -671,6 +675,12 @@ simnet_thread_unref(simnet_dev_t *sdev) mutex_exit(&sdev->sd_instlock); } +/* + * TODO: Add properties to set Rx checksum flag behavior. + * + * o HCK_PARTIALCKSUM. + * o HCK_FULLCKSUM_OK. + */ static void simnet_rx(void *arg) { @@ -683,7 +693,7 @@ simnet_rx(void *arg) /* Check for valid packet header */ if (mac_header_info(sdev->sd_mh, mp, &hdr_info) != 0) { - freemsg(mp); + mac_drop_pkt(mp, "invalid L2 header"); sdev->sd_stats.recv_errors++; goto rx_done; } @@ -712,6 +722,16 @@ simnet_rx(void *arg) } } + /* + * We don't actually calculate and verify the IP header + * checksum because the nature of simnet makes it redundant to + * do so. The point is to test the presence of the flags. The + * Tx side will have already populated the checksum field. 
+ */ + if ((sdev->sd_rx_cksum & HCKSUM_IPHDRCKSUM) != 0) { + mac_hcksum_set(mp, 0, 0, 0, 0, HCK_IPV4_HDRCKSUM_OK); + } + sdev->sd_stats.recv_count++; sdev->sd_stats.rbytes += msgdsize(mp); mac_rx(sdev->sd_mh, NULL, mp); @@ -719,19 +739,22 @@ rx_done: simnet_thread_unref(sdev); } +#define SIMNET_ULP_CKSUM (HCKSUM_INET_FULL_V4 | HCKSUM_INET_PARTIAL) + static mblk_t * simnet_m_tx(void *arg, mblk_t *mp_chain) { simnet_dev_t *sdev = arg; simnet_dev_t *sdev_rx; mblk_t *mpnext = mp_chain; - mblk_t *mp; + mblk_t *mp, *nmp; + mac_emul_t emul = 0; rw_enter(&simnet_dev_lock, RW_READER); if ((sdev_rx = sdev->sd_peer_dev) == NULL) { /* Discard packets when no peer exists */ rw_exit(&simnet_dev_lock); - freemsgchain(mp_chain); + mac_drop_chain(mp_chain, "no peer"); return (NULL); } @@ -748,20 +771,20 @@ simnet_m_tx(void *arg, mblk_t *mp_chain) */ if (!simnet_thread_ref(sdev_rx)) { rw_exit(&simnet_dev_lock); - freemsgchain(mp_chain); + mac_drop_chain(mp_chain, "simnet peer dev not ready"); return (NULL); } rw_exit(&simnet_dev_lock); if (!simnet_thread_ref(sdev)) { simnet_thread_unref(sdev_rx); - freemsgchain(mp_chain); + mac_drop_chain(mp_chain, "simnet dev not ready"); return (NULL); } while ((mp = mpnext) != NULL) { - int len; - int size; + size_t len; + size_t size; mblk_t *mp_new; mblk_t *mp_tmp; @@ -775,7 +798,7 @@ simnet_m_tx(void *arg, mblk_t *mp_chain) mp_new = allocb(size, BPRI_HI); if (mp_new == NULL) { sdev->sd_stats.xmit_errors++; - freemsg(mp); + mac_drop_pkt(mp, "allocb failed"); continue; } bzero(mp_new->b_wptr, size); @@ -789,25 +812,44 @@ simnet_m_tx(void *arg, mblk_t *mp_chain) } /* Pullup packet into a single mblk */ - if (!pullupmsg(mp, -1)) { - sdev->sd_stats.xmit_errors++; - freemsg(mp); - continue; - } - - /* Fix mblk checksum as the pkt dest is local */ - if ((mp = mac_fix_cksum(mp)) == NULL) { + if ((nmp = msgpullup(mp, -1)) == NULL) { sdev->sd_stats.xmit_errors++; + mac_drop_pkt(mp, "msgpullup failed"); continue; + } else { + mac_hcksum_clone(mp, nmp); + 
freemsg(mp); + mp = nmp; } /* Hold reference for taskq receive processing per-pkt */ if (!simnet_thread_ref(sdev_rx)) { - freemsg(mp); - freemsgchain(mpnext); + mac_drop_pkt(mp, "failed to get thread ref"); + mac_drop_chain(mpnext, "failed to get thread ref"); break; } + if ((sdev->sd_tx_cksum & HCKSUM_IPHDRCKSUM) != 0) + emul |= MAC_IPCKSUM_EMUL; + if ((sdev->sd_tx_cksum & SIMNET_ULP_CKSUM) != 0) + emul |= MAC_HWCKSUM_EMUL; + if (sdev->sd_lso) + emul |= MAC_LSO_EMUL; + + if (emul != 0) + mac_hw_emul(&mp, NULL, NULL, emul); + + if (mp == NULL) { + sdev->sd_stats.xmit_errors++; + continue; + } + + /* + * Remember, we are emulating a real NIC here; the + * checksum flags can't make the trip across the link. + */ + DB_CKSUMFLAGS(mp) = 0; + /* Use taskq for pkt receive to avoid kernel stack explosion */ mp->b_next = (mblk_t *)sdev_rx; if (ddi_taskq_dispatch(simnet_rxq, simnet_rx, mp, @@ -886,6 +928,43 @@ simnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp) miocack(q, mp, msgdsize(mp1), rc); } +static boolean_t +simnet_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + simnet_dev_t *sdev = arg; + const uint_t tcp_cksums = HCKSUM_INET_FULL_V4 | HCKSUM_INET_PARTIAL; + + switch (cap) { + case MAC_CAPAB_HCKSUM: { + uint32_t *tx_cksum_flags = cap_data; + *tx_cksum_flags = sdev->sd_tx_cksum; + break; + } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (sdev->sd_lso && + (sdev->sd_tx_cksum & HCKSUM_IPHDRCKSUM) != 0 && + (sdev->sd_tx_cksum & tcp_cksums) != 0) { + /* + * The LSO configuration is hardwried for now, + * but there's no reason we couldn't also make + * this configurable in the future. 
+ */ + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; + cap_lso->lso_basic_tcp_ipv4.lso_max = SD_LSO_MAXLEN; + break; + } else { + return (B_FALSE); + } + } + default: + return (B_FALSE); + } + + return (B_TRUE); +} + static int simnet_m_stat(void *arg, uint_t stat, uint64_t *val) { @@ -1142,20 +1221,20 @@ set_wl_esslist_priv_prop(simnet_wifidev_t *wdev, uint_t pr_valsize, } static int -simnet_set_priv_prop(simnet_dev_t *sdev, const char *pr_name, - uint_t pr_valsize, const void *pr_val) +simnet_set_priv_prop_wifi(simnet_dev_t *sdev, const char *name, + const uint_t len, const void *val) { simnet_wifidev_t *wdev = sdev->sd_wifidev; long result; - if (strcmp(pr_name, "_wl_esslist") == 0) { - if (pr_val == NULL) + if (strcmp(name, "_wl_esslist") == 0) { + if (val == NULL) return (EINVAL); - return (set_wl_esslist_priv_prop(wdev, pr_valsize, pr_val)); - } else if (strcmp(pr_name, "_wl_connected") == 0) { - if (pr_val == NULL) + return (set_wl_esslist_priv_prop(wdev, len, val)); + } else if (strcmp(name, "_wl_connected") == 0) { + if (val == NULL) return (EINVAL); - (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); + (void) ddi_strtol(val, (char **)NULL, 0, &result); wdev->swd_linkstatus = ((result == 1) ? 
WL_CONNECTED:WL_NOTCONNECTED); return (0); @@ -1164,37 +1243,89 @@ simnet_set_priv_prop(simnet_dev_t *sdev, const char *pr_name, return (EINVAL); } +/* ARGSUSED */ static int -simnet_m_setprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num, - uint_t wldp_length, const void *wldp_buf) +simnet_set_priv_prop_ether(simnet_dev_t *sdev, const char *name, + const uint_t len, const void *val) { - simnet_dev_t *sdev = arg; - simnet_wifidev_t *wdev = sdev->sd_wifidev; - int err = 0; - uint32_t mtu; + if (strcmp(name, SD_PROP_RX_IP_CKSUM) == 0) { + if (val == NULL) + return (EINVAL); - switch (wldp_pr_num) { - case MAC_PROP_MTU: - (void) memcpy(&mtu, wldp_buf, sizeof (mtu)); - if (mtu > ETHERMIN && mtu < SIMNET_MAX_MTU) - return (mac_maxsdu_update(sdev->sd_mh, mtu)); - else + if (strcmp(val, "off") == 0) { + sdev->sd_rx_cksum &= ~HCKSUM_IPHDRCKSUM; + } else if (strcmp(val, "on") == 0) { + sdev->sd_rx_cksum |= HCKSUM_IPHDRCKSUM; + } else { return (EINVAL); - default: - break; + } + + return (0); + } else if (strcmp(name, SD_PROP_TX_ULP_CKSUM) == 0) { + if (val == NULL) + return (EINVAL); + + /* + * Remember, full and partial checksum are mutually + * exclusive. 
+ */ + if (strcmp(val, "none") == 0) { + sdev->sd_tx_cksum &= ~HCKSUM_INET_FULL_V4; + } else if (strcmp(val, "fullv4") == 0) { + sdev->sd_tx_cksum &= ~HCKSUM_INET_PARTIAL; + sdev->sd_tx_cksum |= HCKSUM_INET_FULL_V4; + } else if (strcmp(val, "partial") == 0) { + sdev->sd_tx_cksum &= ~HCKSUM_INET_FULL_V4; + sdev->sd_tx_cksum |= HCKSUM_INET_PARTIAL; + } else { + return (EINVAL); + } + + return (0); + } else if (strcmp(name, SD_PROP_TX_IP_CKSUM) == 0) { + if (val == NULL) + return (EINVAL); + + if (strcmp(val, "off") == 0) { + sdev->sd_tx_cksum &= ~HCKSUM_IPHDRCKSUM; + } else if (strcmp(val, "on") == 0) { + sdev->sd_tx_cksum |= HCKSUM_IPHDRCKSUM; + } else { + return (EINVAL); + } + + return (0); + } else if (strcmp(name, SD_PROP_LSO) == 0) { + if (val == NULL) + return (EINVAL); + + if (strcmp(val, "off") == 0) { + sdev->sd_lso = B_FALSE; + } else if (strcmp(val, "on") == 0) { + sdev->sd_lso = B_TRUE; + } else { + return (EINVAL); + } + + return (0); } - if (sdev->sd_type == DL_ETHER) - return (ENOTSUP); + return (ENOTSUP); +} + +static int +simnet_setprop_wifi(simnet_dev_t *sdev, const char *name, + const mac_prop_id_t num, const uint_t len, const void *val) +{ + int err = 0; + simnet_wifidev_t *wdev = sdev->sd_wifidev; - /* mac_prop_id */ - switch (wldp_pr_num) { + switch (num) { case MAC_PROP_WL_ESSID: { int i; wl_ess_conf_t *wls; - (void) memcpy(&wdev->swd_essid, wldp_buf, - sizeof (wl_essid_t)); + (void) memcpy(&wdev->swd_essid, val, sizeof (wl_essid_t)); wdev->swd_linkstatus = WL_CONNECTED; /* Lookup the signal strength of the connected ESSID */ @@ -1209,8 +1340,7 @@ simnet_m_setprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num, break; } case MAC_PROP_WL_BSSID: { - (void) memcpy(&wdev->swd_bssid, wldp_buf, - sizeof (wl_bssid_t)); + (void) memcpy(&wdev->swd_bssid, val, sizeof (wl_bssid_t)); break; } case MAC_PROP_WL_PHY_CONFIG: @@ -1221,10 +1351,10 @@ simnet_m_setprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num, case 
MAC_PROP_WL_DESIRED_RATES: break; case MAC_PROP_PRIVATE: - err = simnet_set_priv_prop(sdev, pr_name, - wldp_length, wldp_buf); + err = simnet_set_priv_prop_wifi(sdev, name, len, val); break; default: + err = EINVAL; break; } @@ -1232,66 +1362,159 @@ simnet_m_setprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num, } static int -simnet_get_priv_prop(simnet_dev_t *sdev, const char *pr_name, - uint_t pr_valsize, void *pr_val) +simnet_setprop_ether(simnet_dev_t *sdev, const char *name, + const mac_prop_id_t num, const uint_t len, const void *val) { - simnet_wifidev_t *wdev = sdev->sd_wifidev; int err = 0; - int value; - if (strcmp(pr_name, "_wl_esslist") == 0) { + switch (num) { + case MAC_PROP_PRIVATE: + err = simnet_set_priv_prop_ether(sdev, name, len, val); + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +static int +simnet_m_setprop(void *arg, const char *name, mac_prop_id_t num, + const uint_t len, const void *val) +{ + simnet_dev_t *sdev = arg; + int err = 0; + uint32_t mtu; + + switch (num) { + case MAC_PROP_MTU: + (void) memcpy(&mtu, val, sizeof (mtu)); + if (mtu > ETHERMIN && mtu < SIMNET_MAX_MTU) + return (mac_maxsdu_update(sdev->sd_mh, mtu)); + else + return (EINVAL); + default: + break; + } + + switch (sdev->sd_type) { + case DL_ETHER: + err = simnet_setprop_ether(sdev, name, num, len, val); + break; + case DL_WIFI: + err = simnet_setprop_wifi(sdev, name, num, len, val); + break; + default: + err = EINVAL; + break; + } + + /* + * We may have modified the configuration of hardware + * offloads. Make sure to renegotiate capabilities with the + * upstream clients. 
+ */ + mac_capab_update(sdev->sd_mh); + return (err); +} + +static int +simnet_get_priv_prop_wifi(const simnet_dev_t *sdev, const char *name, + const uint_t len, void *val) +{ + simnet_wifidev_t *wdev = sdev->sd_wifidev; + int ret, value; + + if (strcmp(name, "_wl_esslist") == 0) { /* Returns num of _wl_ess_conf_t that have been set */ value = wdev->swd_esslist_num; - } else if (strcmp(pr_name, "_wl_connected") == 0) { + } else if (strcmp(name, "_wl_connected") == 0) { value = ((wdev->swd_linkstatus == WL_CONNECTED) ? 1:0); } else { - err = ENOTSUP; + return (ENOTSUP); } - if (err == 0) - (void) snprintf(pr_val, pr_valsize, "%d", value); - return (err); + ret = snprintf(val, len, "%d", value); + + if (ret < 0 || ret >= len) + return (EOVERFLOW); + + return (0); } static int -simnet_m_getprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num, - uint_t wldp_length, void *wldp_buf) +simnet_get_priv_prop_ether(const simnet_dev_t *sdev, const char *name, + const uint_t len, void *val) { - simnet_dev_t *sdev = arg; - simnet_wifidev_t *wdev = sdev->sd_wifidev; - int err = 0; - int i; + int ret; + char *value; - if (sdev->sd_type == DL_ETHER) + if (strcmp(name, SD_PROP_RX_IP_CKSUM) == 0) { + if ((sdev->sd_rx_cksum & HCKSUM_IPHDRCKSUM) != 0) { + value = "on"; + } else { + value = "off"; + } + } else if (strcmp(name, SD_PROP_TX_ULP_CKSUM) == 0) { + if ((sdev->sd_tx_cksum & HCKSUM_INET_FULL_V4) != 0) { + value = "fullv4"; + } else if ((sdev->sd_tx_cksum & HCKSUM_INET_PARTIAL) != 0) { + value = "partial"; + } else { + value = "none"; + } + } else if (strcmp(name, SD_PROP_TX_IP_CKSUM) == 0) { + if ((sdev->sd_tx_cksum & HCKSUM_IPHDRCKSUM) != 0) { + value = "on"; + } else { + value = "off"; + } + } else if (strcmp(name, SD_PROP_LSO) == 0) { + value = sdev->sd_lso ? 
"on" : "off"; + } else { return (ENOTSUP); + } - /* mac_prop_id */ - switch (wldp_pr_num) { + ret = snprintf(val, len, "%s", value); + + if (ret < 0 || ret >= len) { + return (EOVERFLOW); + } + + return (0); +} + +static int +simnet_getprop_wifi(const simnet_dev_t *sdev, const char *name, + const mac_prop_id_t num, const uint_t len, void *val) +{ + const simnet_wifidev_t *wdev = sdev->sd_wifidev; + int err = 0; + + switch (num) { case MAC_PROP_WL_ESSID: - (void) memcpy(wldp_buf, &wdev->swd_essid, - sizeof (wl_essid_t)); + (void) memcpy(val, &wdev->swd_essid, sizeof (wl_essid_t)); break; case MAC_PROP_WL_BSSID: - (void) memcpy(wldp_buf, &wdev->swd_bssid, - sizeof (wl_bssid_t)); + (void) memcpy(val, &wdev->swd_bssid, sizeof (wl_bssid_t)); break; case MAC_PROP_WL_PHY_CONFIG: case MAC_PROP_WL_AUTH_MODE: case MAC_PROP_WL_ENCRYPTION: break; case MAC_PROP_WL_LINKSTATUS: - (void) memcpy(wldp_buf, &wdev->swd_linkstatus, + (void) memcpy(val, &wdev->swd_linkstatus, sizeof (wdev->swd_linkstatus)); break; case MAC_PROP_WL_ESS_LIST: { wl_ess_conf_t *w_ess_conf; - ((wl_ess_list_t *)wldp_buf)->wl_ess_list_num = - wdev->swd_esslist_num; + ((wl_ess_list_t *)val)->wl_ess_list_num = wdev->swd_esslist_num; /* LINTED E_BAD_PTR_CAST_ALIGN */ - w_ess_conf = (wl_ess_conf_t *)((char *)wldp_buf + + w_ess_conf = (wl_ess_conf_t *)((char *)val + offsetof(wl_ess_list_t, wl_ess_list_ess)); - for (i = 0; i < wdev->swd_esslist_num; i++) { + for (uint_t i = 0; i < wdev->swd_esslist_num; i++) { (void) memcpy(w_ess_conf, wdev->swd_esslist[i], sizeof (wl_ess_conf_t)); w_ess_conf++; @@ -1299,18 +1522,35 @@ simnet_m_getprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num, break; } case MAC_PROP_WL_RSSI: - *(wl_rssi_t *)wldp_buf = wdev->swd_rssi; + *(wl_rssi_t *)val = wdev->swd_rssi; break; case MAC_PROP_WL_RADIO: - *(wl_radio_t *)wldp_buf = B_TRUE; + *(wl_radio_t *)val = B_TRUE; break; case MAC_PROP_WL_POWER_MODE: break; case MAC_PROP_WL_DESIRED_RATES: break; case MAC_PROP_PRIVATE: - err = 
simnet_get_priv_prop(sdev, pr_name, wldp_length, - wldp_buf); + err = simnet_get_priv_prop_wifi(sdev, name, len, val); + break; + default: + err = ENOTSUP; + break; + } + + return (err); +} + +static int +simnet_getprop_ether(const simnet_dev_t *sdev, const char *name, + const mac_prop_id_t num, const uint_t len, void *val) +{ + int err = 0; + + switch (num) { + case MAC_PROP_PRIVATE: + err = simnet_get_priv_prop_ether(sdev, name, len, val); break; default: err = ENOTSUP; @@ -1320,14 +1560,36 @@ simnet_m_getprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num, return (err); } +static int +simnet_m_getprop(void *arg, const char *name, const mac_prop_id_t num, + const uint_t len, void *val) +{ + const simnet_dev_t *sdev = arg; + int err = 0; + + switch (sdev->sd_type) { + case DL_ETHER: + err = simnet_getprop_ether(sdev, name, num, len, val); + break; + case DL_WIFI: + err = simnet_getprop_wifi(sdev, name, num, len, val); + break; + default: + err = EINVAL; + break; + } + + return (err); +} + static void -simnet_priv_propinfo(const char *pr_name, mac_prop_info_handle_t prh) +simnet_priv_propinfo_wifi(const char *name, mac_prop_info_handle_t prh) { char valstr[MAXNAMELEN]; bzero(valstr, sizeof (valstr)); - if (strcmp(pr_name, "_wl_esslist") == 0) { + if (strcmp(name, "_wl_esslist") == 0) { (void) snprintf(valstr, sizeof (valstr), "%d", 0); } @@ -1336,15 +1598,10 @@ simnet_priv_propinfo(const char *pr_name, mac_prop_info_handle_t prh) } static void -simnet_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num, +simnet_propinfo_wifi(const char *name, const mac_prop_id_t num, mac_prop_info_handle_t prh) { - simnet_dev_t *sdev = arg; - - if (sdev->sd_type == DL_ETHER) - return; - - switch (wldp_pr_num) { + switch (num) { case MAC_PROP_WL_BSSTYPE: case MAC_PROP_WL_ESS_LIST: case MAC_PROP_WL_SUPPORTED_RATES: @@ -1352,7 +1609,55 @@ simnet_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num, mac_prop_info_set_perm(prh, 
MAC_PROP_PERM_READ); break; case MAC_PROP_PRIVATE: - simnet_priv_propinfo(pr_name, prh); + simnet_priv_propinfo_wifi(name, prh); + break; + } +} + +static void +simnet_priv_propinfo_ether(const char *name, mac_prop_info_handle_t prh) +{ + if (strcmp(name, SD_PROP_RX_IP_CKSUM) == 0 || + strcmp(name, SD_PROP_TX_ULP_CKSUM) == 0 || + strcmp(name, SD_PROP_TX_IP_CKSUM) == 0 || + strcmp(name, SD_PROP_LSO) == 0) { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + } + + if (strcmp(name, SD_PROP_TX_ULP_CKSUM) == 0) { + mac_prop_info_set_default_str(prh, "none"); + } + + if (strcmp(name, SD_PROP_RX_IP_CKSUM) == 0 || + strcmp(name, SD_PROP_TX_IP_CKSUM) == 0 || + strcmp(name, SD_PROP_LSO) == 0) { + mac_prop_info_set_default_str(prh, "off"); + } +} + +static void +simnet_propinfo_ether(const char *name, const mac_prop_id_t num, + mac_prop_info_handle_t prh) +{ + switch (num) { + case MAC_PROP_PRIVATE: + simnet_priv_propinfo_ether(name, prh); + break; + } +} + +static void +simnet_m_propinfo(void *arg, const char *name, const mac_prop_id_t num, + const mac_prop_info_handle_t prh) +{ + simnet_dev_t *sdev = arg; + + switch (sdev->sd_type) { + case DL_ETHER: + simnet_propinfo_ether(name, num, prh); + break; + case DL_WIFI: + simnet_propinfo_wifi(name, num, prh); break; } } diff --git a/usr/src/uts/common/io/simnet/simnet_impl.h b/usr/src/uts/common/io/simnet/simnet_impl.h index 74dcba5113..5d6f16f113 100644 --- a/usr/src/uts/common/io/simnet/simnet_impl.h +++ b/usr/src/uts/common/io/simnet/simnet_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_SIMNET_IMPL_H @@ -84,13 +85,25 @@ typedef struct simnet_dev { uint_t sd_mac_len; uchar_t sd_mac_addr[MAXMACADDRLEN]; simnet_stats_t sd_stats; + + /* Capabilities */ + uint_t sd_rx_cksum; + uint_t sd_tx_cksum; + boolean_t sd_lso; } simnet_dev_t; +/* Simnet dladm private properties. 
*/ +#define SD_PROP_RX_IP_CKSUM "_rx_ipv4_cksum" +#define SD_PROP_TX_ULP_CKSUM "_tx_ulp_cksum" +#define SD_PROP_TX_IP_CKSUM "_tx_ipv4_cksum" +#define SD_PROP_LSO "_lso" + /* Simnet device flags */ #define SDF_SHUTDOWN 0x00000001 /* Device shutdown, no new ops */ #define SDF_STARTED 0x00000002 /* Device started, allow ops */ #define SIMNET_MAX_MTU 9000 /* Max MTU supported by simnet driver */ +#define SD_LSO_MAXLEN 65535 /* Max LSO supported by simnet driver */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index ec76c6e2b9..288f77ae47 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -839,7 +839,7 @@ frnop_func(void *arg) */ static mblk_t * gesballoc(unsigned char *base, size_t size, uint32_t db_rtfu, frtn_t *frp, - void (*lastfree)(mblk_t *, dblk_t *), int kmflags) + void (*lastfree)(mblk_t *, dblk_t *), int kmflags) { dblk_t *dbp; mblk_t *mp; @@ -1451,6 +1451,16 @@ copyb(mblk_t *bp) ndp = nbp->b_datap; /* + * Copy the various checksum information that came in + * originally. + */ + ndp->db_cksumstart = dp->db_cksumstart; + ndp->db_cksumend = dp->db_cksumend; + ndp->db_cksumstuff = dp->db_cksumstuff; + bcopy(dp->db_struioun.data, ndp->db_struioun.data, + sizeof (dp->db_struioun.data)); + + /* * Well, here is a potential issue. If we are trying to * trace a flow, and we copy the message, we might lose * information about where this message might have been. diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c index bbbd9b46bd..d75db5f258 100644 --- a/usr/src/uts/common/io/vnic/vnic_dev.c +++ b/usr/src/uts/common/io/vnic/vnic_dev.c @@ -457,6 +457,20 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, } else { vnic->vn_hcksum_txflags = 0; } + + /* + * Check for LSO capabilities. LSO implementations + * depend on hardware checksumming, so the same + * requirement is enforced here. 
+ */ + if (vnic->vn_hcksum_txflags != 0) { + if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_LSO, + &vnic->vn_cap_lso)) { + vnic->vn_cap_lso.lso_flags = 0; + } + } else { + vnic->vn_cap_lso.lso_flags = 0; + } } /* register with the MAC module */ @@ -827,6 +841,15 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) HCKSUM_INET_PARTIAL); break; } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (vnic->vn_cap_lso.lso_flags == 0) { + return (B_FALSE); + } + *cap_lso = vnic->vn_cap_lso; + break; + } case MAC_CAPAB_VNIC: { mac_capab_vnic_t *vnic_capab = cap_data; diff --git a/usr/src/uts/common/os/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c index 1fa1c9425b..0a237e86ec 100644 --- a/usr/src/uts/common/os/ip_cksum.c +++ b/usr/src/uts/common/os/ip_cksum.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -34,6 +35,7 @@ #include <sys/vtrace.h> #include <inet/sctp_crc32.h> #include <inet/ip.h> +#include <inet/ip6.h> #include <sys/multidata.h> #include <sys/multidata_impl.h> @@ -556,3 +558,109 @@ ip_csum_hdr(ipha_t *ipha) sum = 0; return ((uint16_t)sum); } + +/* + * This function takes an mblk and IPv6 header as input and returns + * three pieces of information. + * + * 'hdr_length_ptr': The IPv6 header length including extension headers. + * + * 'nexthdrpp': A pointer to the "next header" value, aka the + * transport header. This argument may be set to NULL if + * only the length is desired. + * + * return: B_FALSE if the header was malformed, B_TRUE otherwise. + * + * This function assumes the IPv6 header along with all extensions are + * contained solely in this mblk: i.e., there is no b_cont walking. 
+ */ +boolean_t +ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr, + uint8_t **nexthdrpp) +{ + uint16_t length; + uint_t ehdrlen; + uint8_t *nexthdrp; + uint8_t *whereptr; + uint8_t *endptr; + ip6_dest_t *desthdr; + ip6_rthdr_t *rthdr; + ip6_frag_t *fraghdr; + + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + length = IPV6_HDR_LEN; + whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ + endptr = mp->b_wptr; + + nexthdrp = &ip6h->ip6_nxt; + while (whereptr < endptr) { + /* Is there enough left for len + nexthdr? */ + if (whereptr + MIN_EHDR_LEN > endptr) + break; + + switch (*nexthdrp) { + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: + /* Assumes the headers are identical for hbh and dst */ + desthdr = (ip6_dest_t *)whereptr; + ehdrlen = 8 * (desthdr->ip6d_len + 1); + if ((uchar_t *)desthdr + ehdrlen > endptr) + return (B_FALSE); + nexthdrp = &desthdr->ip6d_nxt; + break; + case IPPROTO_ROUTING: + rthdr = (ip6_rthdr_t *)whereptr; + ehdrlen = 8 * (rthdr->ip6r_len + 1); + if ((uchar_t *)rthdr + ehdrlen > endptr) + return (B_FALSE); + nexthdrp = &rthdr->ip6r_nxt; + break; + case IPPROTO_FRAGMENT: + fraghdr = (ip6_frag_t *)whereptr; + ehdrlen = sizeof (ip6_frag_t); + if ((uchar_t *)&fraghdr[1] > endptr) + return (B_FALSE); + nexthdrp = &fraghdr->ip6f_nxt; + break; + case IPPROTO_NONE: + /* No next header means we're finished */ + default: + *hdr_length_ptr = length; + + if (nexthdrpp != NULL) + *nexthdrpp = nexthdrp; + + return (B_TRUE); + } + length += ehdrlen; + whereptr += ehdrlen; + *hdr_length_ptr = length; + + if (nexthdrpp != NULL) + *nexthdrpp = nexthdrp; + } + switch (*nexthdrp) { + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_FRAGMENT: + /* + * If any known extension headers are still to be processed, + * the packet's malformed (or at least all the IP header(s) are + * not in the same mblk - and that should never happen). 
+ */ + return (B_FALSE); + + default: + /* + * If we get here, we know that all of the IP headers were in + * the same mblk, even if the ULP header is in the next mblk. + */ + *hdr_length_ptr = length; + + if (nexthdrpp != NULL) + *nexthdrpp = nexthdrp; + + return (B_TRUE); + } +} diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 0907d6deff..2ce448fc3d 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright (c) 2015 Garrett D'Amore <garrett@damore.org> */ @@ -614,6 +614,38 @@ typedef struct mactype_register_s { } mactype_register_t; /* + * Flags to describe the hardware emulation desired from a client when + * calling mac_hw_emul(). + * + * MAC_HWCKSUM_EMUL + * + * If an mblk is marked with HCK_* flags, then calculate those + * checksums and update the checksum flags. + * + * MAC_IPCKSUM_EMUL + * + * Like MAC_HWCKSUM_EMUL, except only calculate the IPv4 header + * checksum. We still update both the IPv4 and ULP checksum + * flags. + * + * MAC_LSO_EMUL + * + * If an mblk is marked with HW_LSO, then segment the LSO mblk + * into a new chain of mblks which reference the original data + * block. This flag DOES NOT imply MAC_HWCKSUM_EMUL. If the + * caller needs both then it must set both. + */ +typedef enum mac_emul { + MAC_HWCKSUM_EMUL = (1 << 0), + MAC_IPCKSUM_EMUL = (1 << 1), + MAC_LSO_EMUL = (1 << 2) +} mac_emul_t; + +#define MAC_HWCKSUM_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL) +#define MAC_ALL_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL | \ + MAC_LSO_EMUL) + +/* * Driver interface functions. 
*/ extern int mac_open_by_linkid(datalink_id_t, diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h index 88ab5f4756..1d1915a816 100644 --- a/usr/src/uts/common/sys/mac_client.h +++ b/usr/src/uts/common/sys/mac_client.h @@ -200,6 +200,8 @@ extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *); extern void mac_client_set_rings(mac_client_handle_t, int, int); +extern void mac_hw_emul(mblk_t **, mblk_t **, uint_t *, mac_emul_t); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index d5c66684d0..0e3a6306e0 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -410,8 +410,8 @@ extern int mac_tx_percpu_cnt; extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); extern void mac_client_init(void); extern void mac_client_fini(void); -extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, - mac_client_impl_t *); +extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, mac_client_impl_t *, + boolean_t); extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *); diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 4625417828..da645ad382 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -35,6 +35,7 @@ #include <net/if.h> #include <sys/mac_flow_impl.h> #include <netinet/ip6.h> +#include <sys/pattr.h> #ifdef __cplusplus extern "C" { @@ -289,54 +290,6 @@ struct mac_group_s { #define GROUP_INTR_ENABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_enable #define GROUP_INTR_DISABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_disable -#define MAC_RING_TX(mhp, rh, mp, rest) { \ - mac_ring_handle_t mrh = rh; \ - mac_impl_t *mimpl = (mac_impl_t *)mhp; \ - /* \ - * Send packets through a selected tx ring, or through the \ - * default handler if there is no selected ring. 
\ - */ \ - if (mrh == NULL) \ - mrh = mimpl->mi_default_tx_ring; \ - if (mrh == NULL) { \ - rest = mimpl->mi_tx(mimpl->mi_driver, mp); \ - } else { \ - rest = mac_hwring_tx(mrh, mp); \ - } \ -} - -/* - * This is the final stop before reaching the underlying driver - * or aggregation, so this is where the bridging hook is implemented. - * Packets that are bridged will return through mac_bridge_tx(), with - * rh nulled out if the bridge chooses to send output on a different - * link due to forwarding. - */ -#define MAC_TX(mip, rh, mp, src_mcip) { \ - mac_ring_handle_t rhandle = (rh); \ - /* \ - * If there is a bound Hybrid I/O share, send packets through \ - * the default tx ring. (When there's a bound Hybrid I/O share, \ - * the tx rings of this client are mapped in the guest domain \ - * and not accessible from here.) \ - */ \ - _NOTE(CONSTANTCONDITION) \ - if ((src_mcip)->mci_state_flags & MCIS_SHARE_BOUND) \ - rhandle = (mip)->mi_default_tx_ring; \ - if (mip->mi_promisc_list != NULL) \ - mac_promisc_dispatch(mip, mp, src_mcip); \ - /* \ - * Grab the proper transmit pointer and handle. Special \ - * optimization: we can test mi_bridge_link itself atomically, \ - * and if that indicates no bridge send packets through tx ring.\ - */ \ - if (mip->mi_bridge_link == NULL) { \ - MAC_RING_TX(mip, rhandle, mp, mp); \ - } else { \ - mp = mac_bridge_tx(mip, rhandle, mp); \ - } \ -} - /* mci_tx_flag */ #define MCI_TX_QUIESCE 0x1 @@ -485,6 +438,9 @@ struct mac_impl_s { mac_led_mode_t mi_led_modes; mac_capab_led_t mi_led; + /* Cache of the Tx DB_CKSUMFLAGS that this MAC supports. */ + uint16_t mi_tx_cksum_flags; /* SL */ + /* * MAC address and VLAN lists. SL protected. */ @@ -721,16 +677,30 @@ typedef struct mac_client_impl_s mac_client_impl_t; extern void mac_init(void); extern int mac_fini(void); +/* + * MAC packet/chain drop functions to aggregate all dropped-packet + * debugging to a single surface. 
+ */ +/*PRINTFLIKE2*/ +extern void mac_drop_pkt(mblk_t *, const char *, ...) + __KPRINTFLIKE(2); + +/*PRINTFLIKE2*/ +extern void mac_drop_chain(mblk_t *, const char *, ...) + __KPRINTFLIKE(2); + extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *); extern boolean_t mac_ip_hdr_length_v6(ip6_t *, uint8_t *, uint16_t *, uint8_t *, ip6_frag_t **); extern mblk_t *mac_copymsgchain_cksum(mblk_t *); -extern mblk_t *mac_fix_cksum(mblk_t *); extern void mac_packet_print(mac_handle_t, mblk_t *); extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *, mac_header_info_t *); extern void mac_tx_notify(mac_impl_t *); +extern mblk_t *mac_ring_tx(mac_handle_t, mac_ring_handle_t, mblk_t *); +extern mblk_t *mac_provider_tx(mac_impl_t *, mac_ring_handle_t, mblk_t *, + mac_client_impl_t *); extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); @@ -832,7 +802,7 @@ extern void mac_flow_set_name(flow_entry_t *, const char *); extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t); extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t); extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *); -extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern void mac_rx_def(void *, mac_resource_handle_t, mblk_t *, boolean_t); extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *); extern void i_mac_share_alloc(mac_client_impl_t *); diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h index 1269aeca10..a1fb21ad21 100644 --- a/usr/src/uts/common/sys/pattr.h +++ b/usr/src/uts/common/sys/pattr.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _SYS_PATTR_H @@ -97,6 +98,8 @@ typedef struct pattr_hcksum_s { #define HCK_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \ HCK_FULLCKSUM | HCK_FULLCKSUM_OK) +#define HCK_TX_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \ + HCK_FULLCKSUM) /* * Extended hardware offloading flags that also use hcksum_flags */ diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h index 1a91158da6..4c8d49c621 100644 --- a/usr/src/uts/common/sys/vnic_impl.h +++ b/usr/src/uts/common/sys/vnic_impl.h @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_VNIC_IMPL_H @@ -64,6 +64,7 @@ typedef struct vnic_s { mac_notify_handle_t vn_mnh; uint32_t vn_hcksum_txflags; + mac_capab_lso_t vn_cap_lso; uint32_t vn_mtu; link_state_t vn_ls; } vnic_t; diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c index 4bf424c44e..23e1d971cb 100644 --- a/usr/src/uts/common/xen/io/xnb.c +++ b/usr/src/uts/common/xen/io/xnb.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #ifdef DEBUG @@ -251,8 +252,8 @@ xnb_software_csum(xnb_t *xnbp, mblk_t *mp) * because it doesn't cover all of the interesting cases :-( */ mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM); - - return (mac_fix_cksum(mp)); + mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); + return (mp); } mblk_t * |