summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRyan Zezeski <rpz@joyent.com>2020-05-04 17:50:44 +0000
committerPatrick Mooney <pmooney@pfmooney.com>2020-05-18 18:37:51 +0000
commitc61a1653a4d73dbc950dac7d96350fd6cb517486 (patch)
treea3050405d36b98afd4e056de8c295d7d47d3e6df
parentf13f199891d2a0440db0361743dd73527f565e89 (diff)
downloadillumos-joyent-c61a1653a4d73dbc950dac7d96350fd6cb517486.tar.gz
12676 want better offloads for vnics
12677 simnet has bogus mi_tx_cksum_flags 12678 mac_tx() is too eager to emulate hardware offloads Portions contributed by: Patrick Mooney <patrick.mooney@joyent.com> Portions contributed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Patrick Mooney <pmooney@oxide.computer> Reviewed by: Andy Fiddaman <andy@omniosce.org> Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r--usr/src/pkg/manifests/system-test-nettest.mf57
-rw-r--r--usr/src/test/Makefile2
-rw-r--r--usr/src/test/net-tests/Makefile20
-rw-r--r--usr/src/test/net-tests/cmd/Makefile36
-rw-r--r--usr/src/test/net-tests/cmd/nettest.ksh52
-rw-r--r--usr/src/test/net-tests/config/Makefile38
-rw-r--r--usr/src/test/net-tests/config/ip_forwarding.config22
-rw-r--r--usr/src/test/net-tests/runfiles/Makefile38
-rw-r--r--usr/src/test/net-tests/runfiles/default.run44
-rw-r--r--usr/src/test/net-tests/tests/Makefile42
-rw-r--r--usr/src/test/net-tests/tests/forwarding/Makefile67
-rw-r--r--usr/src/test/net-tests/tests/forwarding/README177
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_forwarding.ksh496
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_001.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_002.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_003.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_004.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_005.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_006.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_007.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_008.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_009.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_010.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_011.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_012.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_013.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_014.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_015.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_016.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_017.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_018.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_019.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_020.ksh22
-rw-r--r--usr/src/test/net-tests/tests/forwarding/ip_fwd_suite.ksh115
-rw-r--r--usr/src/test/net-tests/tests/net_common.ksh650
-rw-r--r--usr/src/uts/common/inet/ip/ip6.c98
-rw-r--r--usr/src/uts/common/inet/ip/ip6_input.c7
-rw-r--r--usr/src/uts/common/inet/ip/ip_input.c22
-rw-r--r--usr/src/uts/common/inet/ip6.h3
-rw-r--r--usr/src/uts/common/inet/ip_impl.h20
-rw-r--r--usr/src/uts/common/io/bridge.c51
-rw-r--r--usr/src/uts/common/io/dls/dls_link.c8
-rw-r--r--usr/src/uts/common/io/fcoe/fcoe_fc.c5
-rw-r--r--usr/src/uts/common/io/mac/mac.c88
-rw-r--r--usr/src/uts/common/io/mac/mac_bcast.c13
-rw-r--r--usr/src/uts/common/io/mac/mac_client.c134
-rw-r--r--usr/src/uts/common/io/mac/mac_datapath_setup.c2
-rw-r--r--usr/src/uts/common/io/mac/mac_flow.c3
-rw-r--r--usr/src/uts/common/io/mac/mac_provider.c96
-rw-r--r--usr/src/uts/common/io/mac/mac_sched.c91
-rw-r--r--usr/src/uts/common/io/mac/mac_soft_ring.c2
-rw-r--r--usr/src/uts/common/io/mac/mac_util.c1490
-rw-r--r--usr/src/uts/common/io/simnet/simnet.c495
-rw-r--r--usr/src/uts/common/io/simnet/simnet_impl.h13
-rw-r--r--usr/src/uts/common/io/stream.c12
-rw-r--r--usr/src/uts/common/io/vnic/vnic_dev.c23
-rw-r--r--usr/src/uts/common/os/ip_cksum.c108
-rw-r--r--usr/src/uts/common/sys/mac.h34
-rw-r--r--usr/src/uts/common/sys/mac_client.h2
-rw-r--r--usr/src/uts/common/sys/mac_client_impl.h4
-rw-r--r--usr/src/uts/common/sys/mac_impl.h70
-rw-r--r--usr/src/uts/common/sys/pattr.h3
-rw-r--r--usr/src/uts/common/sys/vnic_impl.h3
-rw-r--r--usr/src/uts/common/xen/io/xnb.c5
64 files changed, 4619 insertions, 582 deletions
diff --git a/usr/src/pkg/manifests/system-test-nettest.mf b/usr/src/pkg/manifests/system-test-nettest.mf
new file mode 100644
index 0000000000..b313b0cc1c
--- /dev/null
+++ b/usr/src/pkg/manifests/system-test-nettest.mf
@@ -0,0 +1,57 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2020 Oxide Computer Company
+#
+
+set name=pkg.fmri value=pkg:/system/test/nettest@$(PKGVERS)
+set name=pkg.description value="Miscellaneous Network Unit Tests"
+set name=pkg.summary value="Network Unit Test Suite"
+set name=info.classification \
+ value=org.opensolaris.category.2008:Development/System
+set name=variant.arch value=$(ARCH)
+dir path=opt/net-tests
+dir path=opt/net-tests/bin
+dir path=opt/net-tests/config
+dir path=opt/net-tests/runfiles
+dir path=opt/net-tests/tests
+dir path=opt/net-tests/tests/forwarding
+file path=opt/net-tests/bin/nettest mode=0555
+file path=opt/net-tests/config/ip_forwarding.config mode=0644 \
+ preserve=renamenew
+file path=opt/net-tests/runfiles/default.run mode=0444
+file path=opt/net-tests/tests/forwarding/README mode=0444
+file path=opt/net-tests/tests/forwarding/ip_forwarding mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_001 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_002 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_003 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_004 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_005 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_006 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_007 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_008 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_009 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_010 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_011 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_012 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_013 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_014 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_015 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_016 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_017 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_018 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_019 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_020 mode=0555
+file path=opt/net-tests/tests/forwarding/ip_fwd_suite mode=0555
+file path=opt/net-tests/tests/net_common mode=0555
+license lic_CDDL license=lic_CDDL
+depend fmri=system/test/testrunner type=require
diff --git a/usr/src/test/Makefile b/usr/src/test/Makefile
index fa57d36772..9756f02ef7 100644
--- a/usr/src/test/Makefile
+++ b/usr/src/test/Makefile
@@ -12,6 +12,7 @@
#
# Copyright (c) 2012 by Delphix. All rights reserved.
# Copyright 2014 Garrett D'Amore <garrett@damore.org>
+# Copyright 2019 Joyent, Inc.
#
.PARALLEL: $(SUBDIRS)
@@ -20,6 +21,7 @@ SUBDIRS = \
crypto-tests \
elf-tests \
libc-tests \
+ net-tests \
os-tests \
smbclient-tests \
test-runner \
diff --git a/usr/src/test/net-tests/Makefile b/usr/src/test/net-tests/Makefile
new file mode 100644
index 0000000000..6536e70c59
--- /dev/null
+++ b/usr/src/test/net-tests/Makefile
@@ -0,0 +1,20 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019, Joyent Inc.
+#
+
+.PARALLEL: $(SUBDIRS)
+
+SUBDIRS = cmd config runfiles tests
+
+include $(SRC)/test/Makefile.com
diff --git a/usr/src/test/net-tests/cmd/Makefile b/usr/src/test/net-tests/cmd/Makefile
new file mode 100644
index 0000000000..b2770c84c6
--- /dev/null
+++ b/usr/src/test/net-tests/cmd/Makefile
@@ -0,0 +1,36 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+include $(SRC)/Makefile.master
+
+ROOTOPTPKG = $(ROOT)/opt/net-tests
+ROOTBIN = $(ROOTOPTPKG)/bin
+PROGS = nettest
+CMDS = $(PROGS:%=$(ROOTBIN)/%)
+$(CMDS) := FILEMODE = 0555
+
+include $(SRC)/test/Makefile.com
+
+install: $(CMDS)
+
+clobber: clean
+ $(RM) $(CMDS)
+
+$(CMDS): $(ROOTBIN)
+
+$(ROOTBIN):
+ $(INS.dir)
+
+$(ROOTBIN)/%: %.ksh
+ $(INS.rename)
diff --git a/usr/src/test/net-tests/cmd/nettest.ksh b/usr/src/test/net-tests/cmd/nettest.ksh
new file mode 100644
index 0000000000..e7d0e78865
--- /dev/null
+++ b/usr/src/test/net-tests/cmd/nettest.ksh
@@ -0,0 +1,52 @@
+#!/usr/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+export NET_TESTS="/opt/net-tests"
+runner="/opt/test-runner/bin/run"
+
+function fail
+{
+ echo "$1" >&2	# $1: message; quoted so it is not word-split or globbed
+ exit ${2:-1}	# $2: optional exit status, defaults to 1
+}
+
+function find_runfile
+{
+ typeset distro=
+ if [[ -f $NET_TESTS/runfiles/default.run ]]; then
+ distro=default
+ fi
+
+ [[ -n $distro ]] && echo $NET_TESTS/runfiles/$distro.run
+}
+
+while getopts c: c; do
+ case $c in
+ 'c')
+ runfile=$OPTARG
+ [[ -f $runfile ]] || fail "Cannot read file: $runfile"
+ ;;
+ esac
+done
+shift $((OPTIND - 1))
+
+[[ -z $runfile ]] && runfile=$(find_runfile)
+[[ -z $runfile ]] && fail "Couldn't determine distro"
+
+$runner -c $runfile
+
+exit $?
diff --git a/usr/src/test/net-tests/config/Makefile b/usr/src/test/net-tests/config/Makefile
new file mode 100644
index 0000000000..7151577083
--- /dev/null
+++ b/usr/src/test/net-tests/config/Makefile
@@ -0,0 +1,38 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+include $(SRC)/Makefile.master
+
+CFGS = ip_forwarding.config
+ROOTOPTPKG = $(ROOT)/opt/net-tests
+ROOTOPTPKGCFG = $(ROOT)/opt/net-tests/config
+ROOTOPTPKGDIRS = $(ROOTOPTPKG) $(ROOTOPTPKGCFG)
+FILES = $(CFGS:%=$(ROOTOPTPKGCFG)/%)
+$(FILES) := FILEMODE = 0644
+
+include $(SRC)/test/Makefile.com
+
+all: $(CFGS)
+
+install: $(ROOTOPTPKG) $(ROOTOPTPKGCFG) $(FILES)
+
+clobber: clean
+ $(RM) $(FILES)
+
+$(ROOTOPTPKGDIRS):
+ $(INS.dir)
+
+$(ROOTOPTPKGCFG)/%: % $(ROOTOPTPKGDIRS)
+ $(INS.file)
diff --git a/usr/src/test/net-tests/config/ip_forwarding.config b/usr/src/test/net-tests/config/ip_forwarding.config
new file mode 100644
index 0000000000..4a839cd49d
--- /dev/null
+++ b/usr/src/test/net-tests/config/ip_forwarding.config
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+#
+# See the tests/forwarding/README file for information about how to
+# configure and run the tests.
+#
+export NT_CLIENT=client_zone_name
+export NT_ROUTER=router_zone_name
+export NT_SERVER=server_zone_name
diff --git a/usr/src/test/net-tests/runfiles/Makefile b/usr/src/test/net-tests/runfiles/Makefile
new file mode 100644
index 0000000000..d50a8deebf
--- /dev/null
+++ b/usr/src/test/net-tests/runfiles/Makefile
@@ -0,0 +1,38 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+include $(SRC)/Makefile.master
+
+SRCS = default.run
+ROOTOPTPKG = $(ROOT)/opt/net-tests
+RUNFILES = $(ROOTOPTPKG)/runfiles
+CMDS = $(SRCS:%=$(RUNFILES)/%)
+$(CMDS) := FILEMODE = 0444
+
+include $(SRC)/test/Makefile.com
+
+all: $(SRCS)
+
+install: $(CMDS)
+
+clobber: clean
+ $(RM) $(CMDS)
+
+$(CMDS): $(RUNFILES) $(SRCS)
+
+$(RUNFILES):
+ $(INS.dir)
+
+$(RUNFILES)/%: %
+ $(INS.file)
diff --git a/usr/src/test/net-tests/runfiles/default.run b/usr/src/test/net-tests/runfiles/default.run
new file mode 100644
index 0000000000..cfc1a3df8d
--- /dev/null
+++ b/usr/src/test/net-tests/runfiles/default.run
@@ -0,0 +1,44 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+[DEFAULT]
+outputdir = /var/tmp/test_results
+quiet = False
+timeout = 300
+
+[/opt/net-tests/tests/forwarding]
+tests = [
+ 'ip_fwd_001',
+ 'ip_fwd_002',
+ 'ip_fwd_003',
+ 'ip_fwd_004',
+ 'ip_fwd_005',
+ 'ip_fwd_006',
+ 'ip_fwd_007',
+ 'ip_fwd_008',
+ 'ip_fwd_009',
+ 'ip_fwd_010',
+ 'ip_fwd_011',
+ 'ip_fwd_012',
+ 'ip_fwd_013',
+ 'ip_fwd_014',
+ 'ip_fwd_015',
+ 'ip_fwd_016',
+ 'ip_fwd_017',
+ 'ip_fwd_018',
+ 'ip_fwd_019',
+ 'ip_fwd_020'
+ ]
+user = root
diff --git a/usr/src/test/net-tests/tests/Makefile b/usr/src/test/net-tests/tests/Makefile
new file mode 100644
index 0000000000..2712d62751
--- /dev/null
+++ b/usr/src/test/net-tests/tests/Makefile
@@ -0,0 +1,42 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+include $(SRC)/Makefile.master
+include $(SRC)/cmd/Makefile.cmd
+
+SUBDIRS = forwarding
+SCRIPTS = net_common
+ROOTOPTPKG = $(ROOT)/opt/net-tests
+TESTDIR = $(ROOTOPTPKG)/tests
+CMDS = $(SCRIPTS:%=$(TESTDIR)/%)
+FILEMODE=0444
+$(CMDS) := FILEMODE = 0555
+
+include $(SRC)/test/Makefile.com
+
+install: $(CMDS)
+
+clobber: clean
+ $(RM) $(CMDS)
+
+$(CMDS): $(TESTDIR)
+
+$(TESTDIR):
+ $(INS.dir)
+
+$(TESTDIR)/%: %
+ $(INS.file)
+
+$(TESTDIR)/%: %.ksh
+ $(INS.rename)
diff --git a/usr/src/test/net-tests/tests/forwarding/Makefile b/usr/src/test/net-tests/tests/forwarding/Makefile
new file mode 100644
index 0000000000..566db8c86d
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/Makefile
@@ -0,0 +1,67 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+include $(SRC)/Makefile.master
+include $(SRC)/cmd/Makefile.cmd
+
+ROOTOPTPKG = $(ROOT)/opt/net-tests
+TESTDIR = $(ROOTOPTPKG)/tests/forwarding
+
+PROG = \
+ ip_forwarding \
+ ip_fwd_suite \
+ ip_fwd_001 \
+ ip_fwd_002 \
+ ip_fwd_003 \
+ ip_fwd_004 \
+ ip_fwd_005 \
+ ip_fwd_006 \
+ ip_fwd_007 \
+ ip_fwd_008 \
+ ip_fwd_009 \
+ ip_fwd_010 \
+ ip_fwd_011 \
+ ip_fwd_012 \
+ ip_fwd_013 \
+ ip_fwd_014 \
+ ip_fwd_015 \
+ ip_fwd_016 \
+ ip_fwd_017 \
+ ip_fwd_018 \
+ ip_fwd_019 \
+ ip_fwd_020
+
+DOC = $(TESTDIR)/README
+
+CMDS = $(PROG:%=$(TESTDIR)/%)
+FILEMODE=0444
+$(CMDS) := FILEMODE = 0555
+
+include $(SRC)/test/Makefile.com
+
+install: $(CMDS) $(DOC)
+
+clobber: clean
+ $(RM) $(CMDS) $(DOC)
+
+$(CMDS) $(DOC): $(TESTDIR)
+
+$(TESTDIR):
+ $(INS.dir)
+
+$(TESTDIR)/%: %
+ $(INS.file)
+
+$(TESTDIR)/%: %.ksh
+ $(INS.rename)
diff --git a/usr/src/test/net-tests/tests/forwarding/README b/usr/src/test/net-tests/tests/forwarding/README
new file mode 100644
index 0000000000..dbe8774a22
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/README
@@ -0,0 +1,177 @@
+Running
+-------
+
+* Create three native zones and start them.
+
+* Edit config/ip_forwarding.config, entering the names of the zones
+ you created.
+
+* Run /opt/net-tests/bin/nettest.
+
+Overview
+--------
+
+The tests in this directory test the IP forwarding path under several
+different variations. All tests require three zones. The tests use
+these three zones, along with the simnet driver, to emulate a real IP
+forwarding scenario involving multiple hosts. All tests verify that
+TCP, UDP, ICMP, IPv4/IPv6, and fragmented IPv4/IPv6 traffic can cross
+the IP forwarding datapath. Each test differs in its emulation of
+various hardware offload features (which would typically be presented
+by real NICs). The diagrams below give a visual representation of the
+situations we are testing and shows how the test components relate to
+each other.
+
+no mac-loopback
+---------------
+
+In this configuration we make sure that the packet travels from server
+to router via "the wire".
+
+ +----------------------------+
++----------------------------+ |router zone |
+|client zone | | +-------------------------+|
+|(ipft_client_nic0) | | |ipft_router_nic0 ||
+| +----------------------+ | | |+----------------------+ ||
+| |ipft_client0 | | | ||ipft_client_r0 | ||
+| |192.168.77.2 |<-+-- Wire --+->|192.168.77.1 | ||
+| |fd00:0:1:4d::2 | | | ||fd00:0:1:4d::1 | ||
+| +----------------------+ | | |+----------------------+ ||
++----------------------------+ | +-------------------------+|
+ | ^ |
+ | | |
+ | | |
+ | | |
+ | | |
+ | IP | |
+ | forwarding | |
+ | | |
+ | | |
+ | | |
++----------------------------+ | v |
+|server zone | |+-------------------------+ |
+|(ipft_server_nic0) | ||ipft_router_nic1 | |
+| +----------------------+ | || +----------------------+| |
+| |ipft_server0 | | || |ipft_server_r0 || |
+| |VLAN 5 | | Wire || |VLAN 5 || |
+| |192.168.88.2 |<-+----------++>|192.168.88.1 || |
+| |fd00:0:1:58::2 | | || |fd00:0:1:58::1 || |
+| +----------------------+ | || +----------------------+| |
++----------------------------+ |+-------------------------+ |
+ +----------------------------+
+
+mac-loopback
+------------
+
+In this configuration we make sure that the packet travels from server
+to router via mac-loopback.
+
+ +----------------------------+
++----------------------------+ |router zone |
+|client zone | | +-------------------------+|
+|(ipft_nic0) | | |ipft_nic1 ||
+| +----------------------+ | | |+----------------------+ ||
+| |ipft_client0 | | | ||ipft_client_r0 | ||
+| |192.168.77.2 |<-+-- Wire --+->|192.168.77.1 | ||
+| |fd00:0:1:4d::2 | | | ||fd00:0:1:4d::1 | ||
+| +----------------------+ | | |+----------------------+ ||
++----------------------------+ | +-------------------------+|
+ | ^ |
+ | | |
+ | | |
+ | | |
+ | | |
+ | IP | |
+ | forwarding | |
+ | | |
+ | | |
+ | | |
++----------------------------+ | v |
+|server zone | |+-------------------------+ |
+|(ipft_nic1) | ||ipft_nic1 | |
+| +----------------------+ | || +----------------------+| |
+| |ipft_server0 | | MAC || |ipft_server_r0 || |
+| |VLAN 5 | | loopback || |VLAN 5 || |
+| |192.168.88.2 |<-+----------++>|192.168.88.1 || |
+| |fd00:0:1:58::2 | | || |fd00:0:1:58::1 || |
+| +----------------------+ | || +----------------------+| |
++----------------------------+ |+-------------------------+ |
+ +----------------------------+
+
+Requirements
+------------
+
+* The client and server zones must provide `/usr/bin/socat`. It would
+ be nice to use netcat but our native version is missing features
+ like connection timeout.
+
+* The user must both create and start the three required zones.
+
+* All three zones should be native zones.
+
+* You must edit the ip_forwarding.config file; providing it with the
+ names of the zones you have created.
+
+Files
+-----
+
+ip_forwarding
+
+ The main test script; it provides the logic for all the tests
+ below. The different test variations are controlled by options
+ and it takes the three zones as arguments. This script may be
+ run by hand but it's easier to use ip_fwd_suite for that
+ purpose.
+
+ip_fwd_suite
+
+ This script runs the various configurations of the IP
+ forwarding test suite. You can run the entire suite or just a
+ single test via the '-n' option. The "Test Matrix" section
+ below gives an overview of all the tests in the suite.
+
+ip_fwd_XXX
+
+ These scripts are mostly here to work around the fact that the
+ test-runner cannot pass arguments to individual tests. In
+ order to avoid running everything as the "ip_fwd_suite" test,
+ we create a file for each configuration. This gives individual
+ reporting of each test and steers us clear of tripping the
+ timeout. You can also run these scripts by hand like so:
+
+ NET_TESTS=/opt/net-tests /opt/net-tests/tests/forwarding/ip_fwd_001
+
+config/ip_forwarding.config
+
+ This file must be modified to contain the names of the zones
+ the user created for running these tests.
+
+Test Matrix
+-----------
+
+This is a breakdown of all the tests in the IP forwarding test suite.
+If a given offload is enabled or disabled, it is done so for all
+interfaces involved in the test.
+
+NAME Tx IP Tx ULP LSO Rx IP mac-loopback
+001 off none off off no
+002 on partial off off no
+003 on partial on off no
+004 on fullv4 off off no
+005 on fullv4 on off no
+006 off none off on no
+007 on partial off on no
+008 on partial on on no
+009 on fullv4 off on no
+010 on fullv4 on on no
+
+011 off none off off yes
+012 on partial off off yes
+013 on partial on off yes
+014 on fullv4 off off yes
+015 on fullv4 on off yes
+016 off none off on yes
+017 on partial off on yes
+018 on partial on on yes
+019 on fullv4 off on yes
+020 on fullv4 on on yes
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_forwarding.ksh b/usr/src/test/net-tests/tests/forwarding/ip_forwarding.ksh
new file mode 100644
index 0000000000..bf7a2255af
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_forwarding.ksh
@@ -0,0 +1,496 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+#
+# Usage:
+#
+# ip_forwarding.ksh -bcflnpruv <client> <router> <server>
+#
+# Where client, router, and server are the names of three native
+# zones. The user must create and start these zones; but other
+# than that there is no special configuration required for them.
+#
+# -b Place server and router on same underlying simnet, causing
+# them to talk via MAC-loopback.
+#
+# -c Run cleanup only.
+#
+# -f Enable full (fullv4) Tx ULP hardware checksum.
+#
+# -l Enable TCP LSO.
+#
+# -n No cleanup: the various artifacts created by this script will
+# remain after execution.
+#
+# -p Enable partial Tx ULP hardware checksum.
+#
+# -r Enable Rx IPv4 header checksum offload.
+#
+# -u Run UDP tests.
+#
+# -v Verbose mode.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+. $NET_TESTS/tests/net_common
+
+function cleanup
+{
+ if ((nt_cleanup == 0)); then
+ dbg "skipping cleanup"
+ return 0
+ fi
+
+ rm -rf ${nt_tdirprefix}*
+ zlogin $nt_client rm -rf ${nt_tdirprefix}*
+ zlogin $nt_server rm -rf ${nt_tdirprefix}*
+
+ rm_route $nt_client $nt_server_ip $nt_server_subnet $nt_client_router_ip
+ rm_route $nt_server $nt_client_ip $nt_client_subnet $nt_server_router_ip
+ rm_route6 $nt_client $nt_server_ip6 $nt_server_subnet6 \
+ $nt_client_router_ip6
+ rm_route6 $nt_server $nt_client_ip6 $nt_client_subnet6 \
+ $nt_server_router_ip6
+
+ ip_fwd_disable $nt_router
+
+ delete_addr $nt_client ipft_client0 v4
+ delete_addr $nt_router ipft_client_r0 v4
+ delete_addr $nt_router ipft_server_r0 v4
+ delete_addr $nt_server ipft_server0 v4
+
+ delete_addr $nt_client ipft_client0 v6
+ delete_addr $nt_router ipft_client_r0 v6
+ delete_addr $nt_router ipft_server_r0 v6
+ delete_addr $nt_server ipft_server0 v6
+
+ delete_if $nt_client ipft_client0
+ delete_if $nt_router ipft_client_r0
+ delete_if $nt_router ipft_server_r0
+ delete_if $nt_server ipft_server0
+
+ delete_vnic ipft_client0 0 $nt_client
+ delete_vnic ipft_client_r0 0 $nt_router
+ delete_vnic ipft_server_r0 5 $nt_router
+ delete_vnic ipft_server0 5 $nt_server
+
+ for nt_name in ${nt_nics[@]}; do
+ delete_simnet $nt_name
+ done
+}
+
+function usage
+{
+ echo "$nt_tname -bcflnpruv <client> <router> <server>" >&2
+}
+
+#
+# Set test defaults.
+#
+nt_tname=${NT_TNAME:-$(basename $0)}
+nt_loopback=0
+nt_ulp_full=0
+nt_ulp_partial=0
+nt_tcp_lso=0
+nt_udp=0
+nt_rx_ip_cksum=0
+nt_cleanup=1
+nt_cleanup_only=0
+
+nt_tdirprefix=/var/tmp/${nt_tname}
+nt_tdir=${nt_tdirprefix}.$$
+nt_dfile=${nt_tdir}/${nt_tname}.data
+nt_efile=${nt_tdir}/${nt_tname}-expected-sha1
+nt_rfile=${nt_tdir}/${nt_tname}-received-sha1
+nt_ofile=${nt_tdir}/${nt_tname}-received
+nt_client_subnet=192.168.77.0/24
+nt_client_ip=192.168.77.2
+nt_client_router_ip=192.168.77.1
+nt_server_subnet=192.168.88.0/24
+nt_server_ip=192.168.88.2
+nt_server_router_ip=192.168.88.1
+nt_port=7774
+nt_client_subnet6=fd00:0:1:4d::/64	# network prefix (no host bits), like nt_server_subnet6
+nt_client_ip6=fd00:0:1:4d::2
+nt_client_router_ip6=fd00:0:1:4d::1
+nt_server_subnet6=fd00:0:1:58::/64
+nt_server_router_ip6=fd00:0:1:58::1
+nt_server_ip6=fd00:0:1:58::2
+nt_port6=7776
+nt_bridge=ipft_switch
+typeset -A nt_nics
+
+while getopts "bcflnpruv" opt; do
+ case $opt in
+ b)
+ nt_loopback=1
+ ;;
+ c)
+ nt_cleanup_only=1
+ ;;
+ f)
+ nt_ulp_full=1
+ ;;
+ l)
+ nt_tcp_lso=1
+ ;;
+ n)
+ nt_cleanup=0
+ ;;
+ p)
+ nt_ulp_partial=1
+ ;;
+ r)
+ nt_rx_ip_cksum=1
+ ;;
+ u)
+ nt_udp=1
+ ;;
+ v)
+ DEBUG=1
+ ;;
+ esac
+done
+
+shift $((OPTIND - 1))
+
+if ((nt_ulp_partial == 1)) && ((nt_ulp_full == 1)); then
+ fail "both partial and full checksum enabled"
+fi
+
+if (( $# != 3 )); then
+ usage
+ fail "wrong number of arguments"
+fi
+
+nt_client=$1
+nt_router=$2
+nt_server=$3
+
+if [[ "$nt_client" == "$nt_router" || "$nt_router" == "$nt_server" ||
+ "$nt_client" == "$nt_server" ]]; then
+ fail "all zones must be unique"
+fi
+
+dbg "client zone: $nt_client"
+dbg "router zone: $nt_router"
+dbg "server zone: $nt_server"
+
+BAIL=1
+zone_exists $nt_client || fail "zone $nt_client not found"
+zone_exists $nt_router || fail "zone $nt_router not found"
+zone_exists $nt_server || fail "zone $nt_server not found"
+
+zone_running $nt_client
+zone_running $nt_router
+zone_running $nt_server
+
+if ! zlogin $nt_client ls /usr/bin/socat > /dev/null; then
+ fail "zone $nt_client missing socat"
+fi
+
+if ! zlogin $nt_server ls /usr/bin/socat > /dev/null; then
+ fail "zone $nt_server missing socat"	# name the zone that was actually probed
+fi
+
+if ((nt_loopback == 0)); then
+ nt_nics[0]=ipft_client_nic0
+ nt_nics[1]=ipft_router_nic0
+ nt_nics[2]=ipft_router_nic1
+ nt_nics[3]=ipft_server_nic0
+else
+ nt_nics[0]=ipft_nic0
+ nt_nics[1]=ipft_nic1
+fi
+
+#
+# Make a best effort to cleanup artifacts from a previous run.
+#
+if ((nt_cleanup_only == 1)); then
+ dbg "performing cleanup only"
+ BAIL=0
+ cleanup
+ BAIL=1
+ exit 0
+fi
+
+if ! mkdir $nt_tdir; then
+ fail "failed to mkdir $nt_tdir in GZ"
+fi
+dbg "created dir $nt_tdir in GZ"
+if ! zlogin $nt_client mkdir $nt_tdir; then
+ fail "failed to mkdir $nt_tdir in $nt_client"
+fi
+dbg "created dir $nt_tdir in $nt_client"
+if ! zlogin $nt_server mkdir $nt_tdir; then
+ fail "failed to mkdir $nt_tdir in $nt_server"
+fi
+dbg "created dir $nt_tdir in $nt_server"
+
+trap cleanup ERR
+
+for nt_name in ${nt_nics[@]}; do
+ create_simnet $nt_name
+done
+
+if ((nt_loopback == 0)); then
+ link_simnets ${nt_nics[0]} ${nt_nics[1]}
+ link_simnets ${nt_nics[2]} ${nt_nics[3]}
+else
+ link_simnets ${nt_nics[0]} ${nt_nics[1]}
+fi
+
+for nt_name in ${nt_nics[@]}; do
+ if ((nt_ulp_partial == 1)); then
+ set_linkprop $nt_name _tx_ulp_cksum partial
+ fi
+
+ if ((nt_ulp_full == 1)); then
+ set_linkprop $nt_name _tx_ulp_cksum fullv4
+ fi
+
+ if ((nt_ulp_full == 1)) || ((nt_ulp_partial == 1)); then
+ set_linkprop $nt_name _tx_ipv4_cksum on
+ fi
+
+ if ((nt_tcp_lso == 1)); then
+ set_linkprop $nt_name _lso on
+ fi
+
+ if ((nt_rx_ip_cksum == 1)); then
+ set_linkprop $nt_name _rx_ipv4_cksum on
+ fi
+done
+
+if ((nt_loopback == 0)); then
+ create_vnic ipft_client0 ipft_client_nic0 0 $nt_client
+ create_vnic ipft_client_r0 ipft_router_nic0 0 $nt_router
+ create_vnic ipft_server_r0 ipft_router_nic1 5 $nt_router
+ create_vnic ipft_server0 ipft_server_nic0 5 $nt_server
+else
+ create_vnic ipft_client0 ipft_nic0 0 $nt_client
+ create_vnic ipft_client_r0 ipft_nic1 0 $nt_router
+ create_vnic ipft_server_r0 ipft_nic1 5 $nt_router
+ create_vnic ipft_server0 ipft_nic1 5 $nt_server
+fi
+
+ip_fwd_enable $nt_router
+
+create_addr $nt_client ipft_client0 $nt_client_ip/24
+create_addr $nt_router ipft_client_r0 $nt_client_router_ip/24
+create_addr $nt_router ipft_server_r0 $nt_server_router_ip/24
+create_addr $nt_server ipft_server0 $nt_server_ip/24
+
+add_route $nt_client $nt_server_ip $nt_server_subnet $nt_client_router_ip
+add_route $nt_server $nt_client_ip $nt_client_subnet $nt_server_router_ip
+
+create_addr6 $nt_client ipft_client0 $nt_client_ip6
+create_addr6 $nt_router ipft_client_r0 $nt_client_router_ip6
+create_addr6 $nt_router ipft_server_r0 $nt_server_router_ip6
+create_addr6 $nt_server ipft_server0 $nt_server_ip6
+
+add_route6 $nt_client $nt_server_ip6 $nt_server_subnet6 $nt_client_router_ip6
+add_route6 $nt_server $nt_client_ip6 $nt_client_subnet6 $nt_server_router_ip6
+
+dd if=/dev/urandom of=$nt_dfile bs=1024 count=1024 > /dev/null 2>&1
+if (($? != 0)); then
+ fail "failed to create data file: $nt_dfile"
+else
+ dbg "created data file: $nt_dfile"
+fi
+
+digest -a sha1 $nt_dfile > $nt_efile
+
+# ================================================================
+# client -> server
+# ================================================================
+ping $nt_client $nt_client_ip $nt_server_ip
+ping $nt_client $nt_client_ip6 $nt_server_ip6
+
+start_server $nt_server TCP4 $nt_server_ip $nt_port $nt_ofile
+nt_listener_ppid=$!
+
+# Give the server time to start.
+sleep 1
+
+dbg "sending 1M $nt_client ($nt_client_ip) -> $nt_server ($nt_server_ip)"
+zlogin $nt_client /usr/bin/socat -b 8192 STDIN \
+ TCP4:$nt_server_ip:$nt_port,connect-timeout=5 < $nt_dfile
+
+if (($? != 0)); then
+ pkill -TERM -P $nt_listener_ppid
+ fail "failed to run socat client"
+else
+ dbg "sent 1M $nt_client ($nt_client_ip) -> $nt_server ($nt_server_ip)"
+fi
+
+#
+# The client may have exited but make sure to give the server time to
+# exit and finish computing the SHA1.
+#
+dbg "waiting for listener $nt_listener_ppid"
+wait_for_pid $nt_listener_ppid 5
+dbg "listener $nt_listener_ppid exited"
+
+digest -a sha1 /zones/$nt_server/root/$nt_ofile > $nt_rfile
+
+if ! diff $nt_efile $nt_rfile; then
+ fail "SHA1 comparison failed"
+else
+ dbg "SHA1 comparison passed"
+fi
+
+start_server $nt_server TCP6 $nt_server_ip6 $nt_port6 $nt_ofile # digested below
+nt_listener_ppid=$! # must be the name used by the pkill/wait_for_pid calls below
+
+# Give the server time to start.
+sleep 1
+
+zlogin $nt_client /usr/bin/socat -b 8192 STDIN \
+ TCP6:[${nt_server_ip6}]:$nt_port6,connect-timeout=5 < $nt_dfile
+
+if (($? != 0)); then
+ pkill -TERM -P $nt_listener_ppid
+ fail "failed to run socat client IPv6"
+else
+ dbg "sent 1M $nt_client ($nt_client_ip6)" \
+ "-> $nt_server ($nt_server_ip6) IPv6"
+fi
+
+#
+# The client may have exited but make sure to give the server time to
+# exit and finish computing the SHA1.
+#
+dbg "waiting for listener $nt_listener_ppid"
+wait_for_pid $nt_listener_ppid 5
+dbg "listener $nt_listener_ppid exited"
+
+digest -a sha1 /zones/$nt_server/root/$nt_ofile > $nt_rfile
+
+if ! diff $nt_efile $nt_rfile; then
+ fail "SHA1 comparison failed"
+else
+ dbg "SHA1 comparison passed"
+fi
+
+if ((nt_udp == 1)); then
+ ping_udp $nt_client $nt_client_ip $nt_server_ip 256 3
+ ping_udp $nt_client $nt_client_ip6 $nt_server_ip6 256 3
+
+ #
+ # Test IP fragmentation by sending a larger-than-MTU datagram.
+ # You can verify fragmentation is happening by dtracing the
+ # various "Frag" and "Reasm" mibs.
+ #
+ dbg "test IP fragmentation $nt_client_ip -> $nt_server_ip"
+ ping_udp $nt_client $nt_client_ip $nt_server_ip $((1024 * 16)) 3
+
+ dbg "test IPv6 fragmentation $nt_client_ip6 -> $nt_server_ip6"
+ ping_udp $nt_client $nt_client_ip6 $nt_server_ip6 $((1024 * 16)) 3
+fi
+
+# ================================================================
+# server -> client
+# ================================================================
+ping $nt_server $nt_server_ip $nt_client_ip
+ping $nt_server $nt_server_ip6 $nt_client_ip6
+
+start_server $nt_client TCP4 $nt_client_ip $nt_port $nt_ofile
+nt_listener_ppid=$!
+
+# Give the listener time to start.
+sleep 1
+
+zlogin $nt_server /usr/bin/socat -b 8192 STDIN \
+ TCP4:$nt_client_ip:$nt_port,bind=$nt_server_ip,connect-timeout=5 \
+ < $nt_dfile
+
+if (($? != 0)); then
+ pkill -TERM -P $nt_listener_ppid
+ fail "failed to run socat client"
+else
+ dbg "sent 1M $nt_server ($nt_server_ip) -> $nt_client ($nt_client_ip)"
+fi
+
+#
+# The client may have exited but make sure to give the server time to
+# exit and finish computing the SHA1.
+#
+dbg "waiting for listener $nt_listener_ppid"
+wait_for_pid $nt_listener_ppid 5
+dbg "listener $nt_listener_ppid exited"
+
+digest -a sha1 /zones/$nt_client/root/$nt_ofile > $nt_rfile
+
+if ! diff $nt_efile $nt_rfile; then
+ fail "SHA1 comparison failed"
+else
+ dbg "SHA1 comparison passed"
+fi
+
+start_server $nt_client TCP6 $nt_client_ip6 $nt_port6 $nt_ofile
+nt_listener_ppid=$!
+
+# Give the listener time to start.
+sleep 1
+
+zlogin $nt_server /usr/bin/socat -b 8192 STDIN \
+ TCP6:[$nt_client_ip6]:$nt_port6,connect-timeout=5 < $nt_dfile
+
+if (($? != 0)); then
+ pkill -TERM -P $nt_listener_ppid
+ fail "failed to run socat client IPv6"
+else
+ dbg "sent 1M $nt_server ($nt_server_ip6) -> $nt_client ($nt_client_ip6)"
+fi
+
+#
+# The client may have exited but make sure to give the server time to
+# exit and finish computing the SHA1.
+#
+dbg "waiting for listener $nt_listener_ppid"
+wait_for_pid $nt_listener_ppid 5
+dbg "server $nt_listener_ppid exited"
+
+digest -a sha1 /zones/$nt_client/root/$nt_ofile > $nt_rfile
+
+if ! diff $nt_efile $nt_rfile; then
+ fail "SHA1 comparison failed"
+else
+ dbg "SHA1 comparison passed"
+fi
+
+if ((nt_udp == 1)); then
+ ping_udp $nt_server $nt_server_ip $nt_client_ip 256 3
+ ping_udp $nt_server $nt_server_ip6 $nt_client_ip6 256 3
+
+ #
+ # Test IP fragmentation by sending a larger-than-MTU datagram.
+ # You can verify fragmentation is happening by dtracing the
+ # various "Frag" and "Reasm" mibs.
+ #
+ dbg "test IP fragmentation $nt_server_ip -> $nt_client_ip"
+ ping_udp $nt_server $nt_server_ip $nt_client_ip $((1024 * 16)) 3
+
+ dbg "test IPv6 fragmentation $nt_server_ip6 -> $nt_client_ip6"
+ ping_udp $nt_server $nt_server_ip6 $nt_client_ip6 $((1024 * 16)) 3
+fi
+
+cleanup
+echo "PASS [$nt_tname]"
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_001.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_001.ksh
new file mode 100644
index 0000000000..9f6c98d1b3
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_001.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 001
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_002.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_002.ksh
new file mode 100644
index 0000000000..06e5ec53ed
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_002.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 002
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_003.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_003.ksh
new file mode 100644
index 0000000000..ce84bc0866
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_003.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 003
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_004.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_004.ksh
new file mode 100644
index 0000000000..b5fa65ccd1
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_004.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 004
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_005.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_005.ksh
new file mode 100644
index 0000000000..9bbd536e19
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_005.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 005
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_006.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_006.ksh
new file mode 100644
index 0000000000..2267072a3d
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_006.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 006
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_007.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_007.ksh
new file mode 100644
index 0000000000..a0380eb92e
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_007.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 007
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_008.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_008.ksh
new file mode 100644
index 0000000000..aed5438f63
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_008.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 008
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_009.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_009.ksh
new file mode 100644
index 0000000000..8a0fa9674c
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_009.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 009
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_010.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_010.ksh
new file mode 100644
index 0000000000..3c45225597
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_010.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 010
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_011.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_011.ksh
new file mode 100644
index 0000000000..62785ff33e
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_011.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 011
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_012.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_012.ksh
new file mode 100644
index 0000000000..c09cd77258
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_012.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 012
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_013.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_013.ksh
new file mode 100644
index 0000000000..e3cc833f53
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_013.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 013
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_014.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_014.ksh
new file mode 100644
index 0000000000..6bd76de190
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_014.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 014
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_015.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_015.ksh
new file mode 100644
index 0000000000..d3b1e2fe1d
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_015.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 015
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_016.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_016.ksh
new file mode 100644
index 0000000000..aa5903cbe4
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_016.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 016
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_017.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_017.ksh
new file mode 100644
index 0000000000..38615b9f94
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_017.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 017
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_018.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_018.ksh
new file mode 100644
index 0000000000..e010141458
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_018.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 018
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_019.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_019.ksh
new file mode 100644
index 0000000000..e3b16bad43
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_019.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 019
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_020.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_020.ksh
new file mode 100644
index 0000000000..9710bae3c1
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_020.ksh
@@ -0,0 +1,22 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+$NET_TESTS/tests/forwarding/ip_fwd_suite -n 020
+exit $?
diff --git a/usr/src/test/net-tests/tests/forwarding/ip_fwd_suite.ksh b/usr/src/test/net-tests/tests/forwarding/ip_fwd_suite.ksh
new file mode 100644
index 0000000000..a1fdc444e3
--- /dev/null
+++ b/usr/src/test/net-tests/tests/forwarding/ip_fwd_suite.ksh
@@ -0,0 +1,115 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+#
+# Run the IP forwarding test suite.
+#
+# Usage
+#
+# ip_fwd_suite [-n <name>] [-a <args>]
+#
+# To run all tests:
+#
+# NET_TESTS=/opt/net-tests ip_fwd_suite
+#
+# To run one test:
+#
+# NET_TESTS=/opt/net-tests ip_fwd_suite -n 001
+#
+# To run one test with additional arguments passed to 'ip_forwarding':
+#
+# NET_TESTS=/opt/net-tests ip_fwd_suite -n 001 -a n
+#
+
+if [[ -z $NET_TESTS ]]; then
+ echo "NET_TESTS not set" >&2
+ exit 1
+fi
+
+. $NET_TESTS/tests/net_common
+. $NET_TESTS/config/ip_forwarding.config
+
+if [[ -z "$NT_CLIENT" ]]; then
+ fail "NT_CLIENT must be set"
+fi
+
+if [[ -z "$NT_ROUTER" ]]; then
+ fail "NT_ROUTER must be set"
+fi
+
+if [[ -z "$NT_SERVER" ]]; then
+ fail "NT_SERVER must be set"
+fi
+
+while getopts "a:n:" opt; do
+ case $opt in
+ a)
+ nt_args=$OPTARG
+ ;;
+ n)
+ nt_name=$OPTARG
+ ;;
+ esac
+done
+
+shift $((OPTIND - 1))
+
+nt_script=$NET_TESTS/tests/forwarding/ip_forwarding
+
+#
+# See the "Test Matrix" section of the README for a description of
+# each test.
+#
+typeset -A nt_name_args
+nt_name_args["001"]="uv"
+nt_name_args["002"]="puv"
+nt_name_args["003"]="lpuv"
+nt_name_args["004"]="fuv"
+nt_name_args["005"]="fluv"
+nt_name_args["006"]="ruv"
+nt_name_args["007"]="pruv"
+nt_name_args["008"]="lpruv"
+nt_name_args["009"]="fruv"
+nt_name_args["010"]="flruv"
+
+nt_name_args["011"]="buv"
+nt_name_args["012"]="bpuv"
+nt_name_args["013"]="blpuv"
+nt_name_args["014"]="bfuv"
+nt_name_args["015"]="bfluv"
+nt_name_args["016"]="bruv"
+nt_name_args["017"]="bpruv"
+nt_name_args["018"]="blpruv"
+nt_name_args["019"]="bfruv"
+nt_name_args["020"]="bflruv"
+
+if [[ -n $nt_name ]]; then
+ if [[ -z ${nt_name_args[$nt_name]} ]]; then
+ fail "invalid test name: $nt_name"
+ fi
+
+ export NT_TNAME="ip_fwd_$nt_name"
+ nt_args="-${nt_name_args[$nt_name]}${nt_args}"
+ $nt_script $nt_args $NT_CLIENT $NT_ROUTER $NT_SERVER
+ exit $?
+fi
+
+for nt_name in ${!nt_name_args[@]}; do
+ export NT_TNAME="ip_fwd_$nt_name"
+ nt_targs="-${nt_name_args[$nt_name]}${nt_args}" # per test; nt_args stays as given
+ $nt_script $nt_targs $NT_CLIENT $NT_ROUTER $NT_SERVER || exit $?
+done
+
+exit 0
diff --git a/usr/src/test/net-tests/tests/net_common.ksh b/usr/src/test/net-tests/tests/net_common.ksh
new file mode 100644
index 0000000000..b83cda8c97
--- /dev/null
+++ b/usr/src/test/net-tests/tests/net_common.ksh
@@ -0,0 +1,650 @@
+#!/usr/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+
+#
+# Copyright 2019 Joyent, Inc.
+#
+
+#
+# Functions shared across the network tests.
+#
+
+DEBUG=0
+
+function dbg
+{
+ typeset msg="$*"
+ if (($DEBUG == 1)); then
+ echo "DBG [$nt_tname]: $msg"
+ fi
+}
+
+function fail
+{
+ typeset msg="$*"
+ echo "FAIL [$nt_tname]: $msg" >&2
+ exit 1
+}
+
+function maybe_fail
+{
+ typeset msg=$1
+
+ if ((BAIL == 1)); then
+ fail "$msg"
+ else
+ dbg "$msg"
+ return 1
+ fi
+}
+
+function zone_exists
+{
+ typeset name=$1
+
+ if (($# != 1)); then
+ fail "$0: incorrect number of args provided"
+ fi
+
+ dbg "checking for existence of zone: $name"
+ if zoneadm -z $name list > /dev/null 2>&1; then
+ dbg "found zone: $name"
+ return 0
+ else
+ dbg "zone not found: $name"
+ return 1
+ fi
+}
+
+function zone_running
+{
+ typeset name=$1
+ typeset state
+ typeset err="zone $name is not running"
+ # Return 0 if zone $name reports "running"; otherwise defer to maybe_fail.
+ if (($# != 1)); then
+ fail "$0: incorrect number of args provided"
+ fi
+ # Query zoneadm only after the arg check so a bad call never runs it.
+ dbg "check if zone $name is running"
+ state=$(zoneadm -z $name list -p | awk -F: '{ print $3 }')
+ dbg "state of zone $name: $state"
+ if [[ "$state" == "running" ]]; then
+ dbg "zone $name is running"
+ return 0
+ fi
+ maybe_fail "$err"
+}
+
+function simnet_exists
+{
+ typeset name=$1
+
+ if (($# != 1)); then
+ fail "$0: incorrect number of args provided"
+ fi
+
+ if dladm show-simnet $name > /dev/null 2>&1; then
+ dbg "simnet $name found"
+ return 0
+ else
+ dbg "simnet $name not found"
+ return 1
+ fi
+}
+
+function create_simnet
+{
+ # Create simnet $1; an already-existing simnet is reported as a failure.
+ typeset name=$1
+ typeset err="failed to create simnet $name"
+
+ if (($# != 1)); then
+ fail "$0: incorrect number of args provided"
+ fi
+ dbg "creating simnet $name"
+ if simnet_exists $name; then
+ dbg "simnet $name already exists"
+ maybe_fail "$err"
+ return 1
+ fi
+
+ if ! dladm create-simnet $name > /dev/null; then
+ maybe_fail "$err"
+ return 1
+ fi
+ dbg "created simnet $name"
+ return 0
+}
+
+function delete_simnet
+{
+ typeset name=$1
+ typeset err="failed to delete simnet $name"
+
+ if (($# != 1)); then
+ fail "$0: incorrect number of args provided"
+ fi
+
+ dbg "deleting simnet $name"
+ if ! simnet_exists $name; then
+ dbg "simnet $name doesn't exist"
+ return 1
+ fi
+
+ if dladm delete-simnet $name; then
+ dbg "simnet $name deleted"
+ return 0
+ fi
+
+ maybe_fail "$err"
+}
+
+function link_simnets
+{
+ typeset sim1=$1
+ typeset sim2=$2
+ typeset err="failed to link simnet $sim1 to $sim2"
+
+ if (($# != 2)); then
+ fail "$0: incorrect number of args provided"
+ fi
+
+ dbg "linking simnet $sim1 to $sim2"
+ if dladm modify-simnet -p $sim2 $sim1 > /dev/null; then
+ dbg "linked simnet $sim1 to $sim2"
+ return 0
+ fi
+
+ maybe_fail "$err"
+}
+
+function vnic_exists
+{
+ # Return 0 only if VNIC $1 exists and dladm reports vid $2, over $3, zone $4.
+ typeset name=$1
+ typeset vid=$2
+ typeset over=$3
+ typeset zone=$4
+
+ if (($# != 4)); then
+ fail "$0: incorrect number of args provided"
+ fi
+
+ if ! dladm show-vnic $name > /dev/null 2>&1; then
+ return 1
+ fi
+
+ typeset avid=$(dladm show-vnic -p -o vid $name)
+ typeset aover=$(dladm show-vnic -p -o over $name)
+ typeset azone=$(dladm show-linkprop -cp zone -o value $name)
+
+ #
+ # Compare as quoted strings so that an empty value from dladm is a
+ # plain mismatch instead of a test/arithmetic syntax error.
+ #
+ [[ "$avid" == "$vid" && "$aover" == "$over" && "$azone" == "$zone" ]]
+}
+
+function create_vnic
+{
+ typeset name=$1
+ typeset over=$2
+ typeset vid=$3
+ typeset zone=$4
+ typeset r=1
+ typeset vid_opt=""
+ typeset vnic_info="$name, vid: $vid, over: $over, zone: $zone"
+ typeset err="failed to create VNIC: $vnic_info"
+
+ if (($# != 4)); then
+ fail "$0: incorrect number of args provided"
+ fi
+
+ if ((vid != 0)); then
+ vid_opt="-v $vid"
+ fi
+
+ dbg "creating VNIC: $vnic_info"
+ if ! dladm create-vnic -t -l $over $vid_opt $name > /dev/null 2>&1
+ then
+ maybe_fail "$err"
+ return 1
+ fi
+
+ dbg "created VNIC: $vnic_info"
+ if ! zonecfg -z $zone "add net; set physical=$name; end"; then
+ maybe_fail "failed to assign $name to $zone"
+ return 1
+ fi
+
+ dbg "assigned VNIC $name to $zone"
+ if zoneadm -z $zone reboot; then
+ dbg "rebooted $zone"
+ #
+ # Make sure the vnic is visible before returning. Without this
+ # a create_addr command following immediately afterwards could
+ # fail because the zone is up but the vnic isn't visible yet.
+ #
+ sleep 1
+ return 0
+ fi
+
+ maybe_fail "failed to reboot $zone"
+}
+
+function delete_vnic
+{
+ # Remove VNIC $1 from zone $3's config, reboot the zone, delete the link.
+ typeset name=$1
+ typeset vid=$2
+ typeset zone=$3
+ typeset vnic_info="$name, vid: $vid, zone: $zone"
+ typeset err="failed to delete VNIC: $vnic_info"
+
+ if (($# != 3)); then
+ fail "$0: incorrect number of args provided"
+ fi
+
+ dbg "assigning VNIC $name from $zone to GZ"
+
+ if ! zonecfg -z $zone "remove net physical=$name"; then
+ maybe_fail "failed to remove $name from $zone"
+ return 1
+ fi
+ if ! zoneadm -z $zone reboot; then
+ maybe_fail "failed to reboot $zone"
+ return 1
+ fi
+
+ dbg "deleting VNIC: $vnic_info"
+ if dladm delete-vnic $name > /dev/null; then
+ dbg "deleted VNIC: $vnic_info"
+ return 0
+ fi
+
+ maybe_fail "$err"
+}
+
+function create_addr
+{
+ typeset zone=$1
+ typeset vnic=$2
+ typeset ip=$3
+ typeset ipname=${vnic}/v4
+
+ if (($# != 3)); then
+ fail "$0: incorrect number of args provided"
+ fi
+
+ if zlogin $zone ipadm create-addr -t -T static -a $ip \
+ $ipname > /dev/null
+ then
+ dbg "created addr $ipname ($ip) in zone $zone"
+ return 0
+ fi
+
+ maybe_fail "failed to create addr $ipname ($ip) in zone $zone"
+}
+
+function create_addr6
+{
+ typeset zone=$1
+ typeset vnic=$2
+ typeset ip=$3
+ typeset ll_name=${vnic}/v6
+ typeset uni_name=${vnic}/v6add
+ typeset err1="failed to create link-local addr $ll_name in zone $zone"
+ typeset err2="failed to create unicast addr $uni_name in zone $zone"
+
+ if (($# != 3)); then
+ fail "$0: incorrect number of args provided"
+ fi
+
+ if zlogin $zone ipadm create-addr -t -T addrconf $ll_name; then
+ dbg "created link-local addr $ll_name in zone $zone"
+ else
+ maybe_fail "$err1"
+ return 1
+ fi
+
+ if zlogin $zone ipadm create-addr -t -T static -a $ip/64 $uni_name; then
+ dbg "created unicast addr $uni_name in zone $zone"
+ else
+ maybe_fail "$err2"
+ fi
+}
+
+#
+# Delete the address object <ifname>/<version> from a zone if it
+# exists. When the version is "v6" the companion <ifname>/v6add
+# object is deleted too. A missing address object is not an error.
+#
+#	$1 - zone to run ipadm in
+#	$2 - interface name
+#	$3 - address object suffix (compared against "v6")
+#
+# Returns 1 if the first object exists but can't be deleted; a
+# failure to delete the v6add object is reported through maybe_fail.
+#
+function delete_addr
+{
+	typeset zone=$1
+	typeset ifname=$2
+	typeset version=$3
+	typeset addr=$ifname/$version
+
+	if (($# != 3)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	if ! zlogin $zone ipadm show-addr $addr > /dev/null 2>&1; then
+		dbg "addr $addr doesn't exist in zone $zone"
+	elif zlogin $zone ipadm delete-addr $addr > /dev/null; then
+		dbg "deleted addr $addr in zone $zone"
+	else
+		maybe_fail "failed to delete addr $addr in zone $zone"
+		return 1
+	fi
+
+	# Only IPv6 interfaces carry the additional v6add object.
+	if [[ "$version" != "v6" ]]; then
+		return 0
+	fi
+
+	typeset extra=$ifname/v6add
+	typeset err="failed to delete addr $extra in zone $zone"
+
+	if ! zlogin $zone ipadm show-addr $extra > /dev/null 2>&1; then
+		dbg "addr $extra doesn't exist in zone $zone"
+	elif zlogin $zone ipadm delete-addr $extra > /dev/null; then
+		dbg "deleted addr $extra in zone $zone"
+	else
+		maybe_fail "$err"
+	fi
+}
+
+#
+# Delete an IP interface from a zone if it exists. A missing
+# interface is not an error.
+#
+#	$1 - zone to run ipadm in
+#	$2 - interface name
+#
+# A failed delete is reported through maybe_fail.
+#
+function delete_if
+{
+	typeset zone=$1
+	typeset ifname=$2
+	typeset err="failed to delete interface $ifname in zone $zone"
+
+	if (($# != 2)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	if ! zlogin $zone ipadm show-if $ifname > /dev/null 2>&1; then
+		dbg "interface $ifname doesn't exist in zone $zone"
+	elif zlogin $zone ipadm delete-if $ifname > /dev/null; then
+		dbg "deleted interface $ifname in zone $zone"
+	else
+		maybe_fail "$err"
+	fi
+}
+
+#
+# Make sure IPv4 and IPv6 forwarding are enabled in a zone. Each
+# protocol is only touched when routeadm does not already report
+# "current=enabled" for it.
+#
+#	$1 - zone to run routeadm in
+#
+# Returns 1 if IPv4 forwarding can't be enabled; an IPv6 failure is
+# reported through maybe_fail.
+#
+function ip_fwd_enable
+{
+	typeset zone=$1
+
+	if (($# != 1)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	if zlogin $zone routeadm -p ipv4-forwarding | \
+	    egrep 'current=enabled' > /dev/null
+	then
+		dbg "IPv4 forwarding already enabled for $zone"
+	elif zlogin $zone routeadm -ue ipv4-forwarding; then
+		dbg "enabled IPv4 forwarding for $zone"
+	else
+		maybe_fail "failed to enable IPv4 forwarding for $zone"
+		return 1
+	fi
+
+	if zlogin $zone routeadm -p ipv6-forwarding | \
+	    egrep 'current=enabled' > /dev/null
+	then
+		dbg "IPv6 forwarding already enabled for $zone"
+	elif zlogin $zone routeadm -ue ipv6-forwarding; then
+		dbg "enabled IPv6 forwarding for $zone"
+	else
+		maybe_fail "failed to enable IPv6 forwarding for $zone"
+	fi
+}
+
+#
+# Make sure IPv4 and IPv6 forwarding are disabled in a zone. Each
+# protocol is only touched when routeadm does not already report
+# "current=disabled" for it.
+#
+#	$1 - zone to run routeadm in
+#
+# Returns 1 if IPv4 forwarding can't be disabled; an IPv6 failure is
+# reported through maybe_fail.
+#
+function ip_fwd_disable
+{
+	typeset zone=$1
+
+	if (($# != 1)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	if zlogin $zone routeadm -p ipv4-forwarding | \
+	    egrep 'current=disabled' > /dev/null
+	then
+		dbg "IPv4 forwarding already disabled for $zone"
+	elif zlogin $zone routeadm -ud ipv4-forwarding; then
+		dbg "disabled IPv4 forwarding in $zone"
+	else
+		maybe_fail "failed to disable IPv4 forwarding in $zone"
+		return 1
+	fi
+
+	if zlogin $zone routeadm -p ipv6-forwarding | \
+	    egrep 'current=disabled' > /dev/null
+	then
+		dbg "IPv6 forwarding already disabled for $zone"
+	elif zlogin $zone routeadm -ud ipv6-forwarding; then
+		dbg "disabled IPv6 forwarding in $zone"
+	else
+		maybe_fail "failed to disable IPv6 forwarding in $zone"
+	fi
+}
+
+#
+# Add an IPv4 route to a zone's routing table.
+#
+#	$1 - zone to run route(8) in
+#	$2 - dest; accepted but not used by this function
+#	$3 - destination network of the route
+#	$4 - gateway for that network
+#
+# Returns 0 on success; a failure is reported through maybe_fail.
+#
+function add_route
+{
+	typeset zone=$1
+	typeset dest=$2
+	typeset net=$3
+	typeset gateway=$4
+
+	if (($# != 4)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	if ! zlogin $zone route -n add $net $gateway > /dev/null; then
+		maybe_fail "failed to add route $gateway => $net to $zone"
+		return $?
+	fi
+
+	dbg "added route $gateway => $net to $zone"
+	return 0
+}
+
+#
+# Add an IPv6 route to a zone's routing table.
+#
+#	$1 - zone to run route(8) in
+#	$2 - dest; accepted but not used by this function
+#	$3 - destination network of the route
+#	$4 - gateway for that network
+#
+# Returns 0 on success; a failure is reported through maybe_fail.
+#
+function add_route6
+{
+	typeset zone=$1
+	typeset dest=$2
+	typeset net=$3
+	typeset gateway=$4
+
+	if (($# != 4)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	if ! zlogin $zone route -n add -inet6 $net $gateway > /dev/null
+	then
+		maybe_fail "failed to add route $gateway => $net to $zone"
+		return $?
+	fi
+
+	dbg "added route $gateway => $net to $zone"
+	return 0
+}
+
+#
+# Remove an IPv4 route from a zone, but only if the zone currently
+# routes to $dest through the given gateway. Anything else is treated
+# as "route already gone".
+#
+#	$1 - zone to run route(8) in
+#	$2 - address used to look up the zone's current gateway
+#	$3 - destination network of the route
+#	$4 - gateway the route is expected to use
+#
+# A failed delete is reported through maybe_fail.
+#
+function rm_route
+{
+	typeset zone=$1
+	typeset dest=$2
+	typeset net=$3
+	typeset gateway=$4
+	typeset gw
+	typeset err="failed to remove route $gateway => $net from $zone"
+
+	if (($# != 4)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	#
+	# Query the zone only after the arguments have been validated,
+	# so a bad call never runs zlogin with empty operands.
+	#
+	gw=$(zlogin $zone route -n get $dest | \
+	    grep gateway | awk '{ print $2 }')
+
+	if [[ "$gw" == "$gateway" ]]; then
+		if zlogin $zone route -n delete $net $gateway > /dev/null
+		then
+			dbg "removed route $gateway => $net from $zone"
+		else
+			maybe_fail "$err"
+		fi
+	else
+		dbg "$zone already lacked route $gateway => $net"
+	fi
+}
+
+#
+# Remove an IPv6 route from a zone, but only if the zone currently
+# routes to $dest through the given gateway. Anything else is treated
+# as "route already gone".
+#
+#	$1 - zone to run route(8) in
+#	$2 - address used to look up the zone's current gateway
+#	$3 - destination network of the route
+#	$4 - gateway the route is expected to use
+#
+# A failed delete is reported through maybe_fail.
+#
+function rm_route6
+{
+	typeset zone=$1
+	typeset dest=$2
+	typeset net=$3
+	typeset gateway=$4
+	typeset gw
+	typeset err="failed to remove route $gateway => $net from $zone"
+
+	if (($# != 4)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	#
+	# Query the zone only after the arguments have been validated,
+	# so a bad call never runs zlogin with empty operands.
+	#
+	gw=$(zlogin $zone route -n get -inet6 $dest | \
+	    grep gateway | awk '{ print $2 }')
+
+	if [[ "$gw" == "$gateway" ]]; then
+		if zlogin $zone route -n delete -inet6 $net $gateway > /dev/null
+		then
+			dbg "removed route $gateway => $net from $zone"
+		else
+			maybe_fail "$err"
+		fi
+	else
+		dbg "$zone already lacked route $gateway => $net"
+	fi
+}
+
+#
+# Set a single datalink property with dladm.
+#
+#	$1 - link name
+#	$2 - property name
+#	$3 - property value
+#
+# Returns 0 on success; a failure is reported through maybe_fail.
+#
+function set_linkprop
+{
+	typeset link=$1
+	typeset prop=$2
+	typeset val=$3
+	typeset setting="$link prop: $prop=$val"
+
+	if (($# != 3)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	dbg "attempt to set $setting"
+	if ! dladm set-linkprop -p $prop=$val $link; then
+		maybe_fail "failed to set $setting"
+		return $?
+	fi
+
+	dbg "set $setting"
+	return 0
+}
+
+#
+# Run ping from inside a zone against a destination address.
+#
+#	$1 - zone to run ping in
+#	$2 - source address, used only in log messages
+#	$3 - destination address
+#
+# Returns 0 if the ping succeeds; a failure is reported through
+# maybe_fail.
+#
+function ping
+{
+	typeset zone=$1
+	typeset src=$2
+	typeset dst=$3
+	typeset path="$src -> $dst"
+
+	if (($# != 3)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	dbg "ping: $path"
+	if ! zlogin $zone ping $dst > /dev/null 2>&1; then
+		maybe_fail "could not ping: $path"
+		return $?
+	fi
+
+	dbg "successful ping: $path"
+	return 0
+}
+
+#
+# Run "ping -ns -U" from inside a zone against a server address.
+#
+#	$1 - zone to run ping in
+#	$2 - client address, used only in log messages
+#	$3 - server address to ping
+#	$4 - size operand passed to ping
+#	$5 - count operand passed to ping
+#
+# Returns 0 if the ping succeeds; a failure is reported through
+# maybe_fail.
+#
+function ping_udp
+{
+	typeset client=$1
+	typeset client_ip=$2
+	typeset server_ip=$3
+	typeset size=$4
+	typeset num=$5
+	typeset desc="$client_ip -> $server_ip (size: $size)"
+
+	if (($# != 5)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	dbg "UDP ping: $desc"
+	if ! zlogin $client ping -ns -U $server_ip $size $num > /dev/null
+	then
+		maybe_fail "UDP ping failed: $desc"
+		return $?
+	fi
+
+	dbg "UDP ping passed: $desc"
+	return 0
+}
+
+#
+# Start a background socat listener inside a zone. Data received on
+# the listening socket is written to a newly created file.
+#
+#	$1 - zone to run socat in
+#	$2 - socat address type prefix; "-LISTEN" is appended to it
+#	$3 - address to bind to
+#	$4 - port to listen on
+#	$5 - file, inside the zone, that receives the data
+#
+# The PID of the backgrounded zlogin is stored in the global variable
+# listener_ppid.
+#
+function start_server
+{
+	typeset zone=$1
+	typeset type=$2
+	typeset ip=$3
+	typeset port=$4
+	typeset ofile=$5
+
+	if (($# != 5)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	dbg "start server $ofile"
+
+	#
+	# The address is quoted so that the brackets around the bind
+	# address are never treated as a glob pattern by this shell.
+	#
+	zlogin $zone \
+	    /usr/bin/socat -u "${type}-LISTEN:$port,bind=[$ip],reuseaddr" \
+	    CREATE:$ofile &
+	listener_ppid=$!
+	dbg "listener PPID: $listener_ppid, zone $zone"
+}
+
+#
+# Wait for a process to go away, polling once a second with kill -0.
+#
+#	$1 - PID to wait for
+#	$2 - number of seconds to wait before giving up
+#
+# Returns 0 once the process is gone and 1 on timeout (which is also
+# reported through maybe_fail).
+#
+function wait_for_pid
+{
+	typeset pid=$1
+	typeset seconds=$2
+	typeset elapsed=0
+
+	if (($# != 2)); then
+		fail "$0: incorrect number of args provided"
+	fi
+
+	while kill -0 $pid > /dev/null 2>&1; do
+		if ((seconds == elapsed)); then
+			maybe_fail "timed out waiting for pid $pid"
+			return 1
+		fi
+
+		dbg "waiting for pid $pid"
+		sleep 1
+		((elapsed++))
+	done
+
+	return 0
+}
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index 659eda42f3..26e7be2fe8 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -22,6 +22,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1990 Mentat Inc.
* Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
@@ -2730,108 +2731,15 @@ done:
}
/*
- * Try to determine where and what are the IPv6 header length and
- * pointer to nexthdr value for the upper layer protocol (or an
- * unknown next hdr).
- *
- * Parameters returns a pointer to the nexthdr value;
- * Must handle malformed packets of various sorts.
- * Function returns failure for malformed cases.
- */
-boolean_t
-ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr,
- uint8_t **nexthdrpp)
-{
- uint16_t length;
- uint_t ehdrlen;
- uint8_t *nexthdrp;
- uint8_t *whereptr;
- uint8_t *endptr;
- ip6_dest_t *desthdr;
- ip6_rthdr_t *rthdr;
- ip6_frag_t *fraghdr;
-
- ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
- length = IPV6_HDR_LEN;
- whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
- endptr = mp->b_wptr;
-
- nexthdrp = &ip6h->ip6_nxt;
- while (whereptr < endptr) {
- /* Is there enough left for len + nexthdr? */
- if (whereptr + MIN_EHDR_LEN > endptr)
- break;
-
- switch (*nexthdrp) {
- case IPPROTO_HOPOPTS:
- case IPPROTO_DSTOPTS:
- /* Assumes the headers are identical for hbh and dst */
- desthdr = (ip6_dest_t *)whereptr;
- ehdrlen = 8 * (desthdr->ip6d_len + 1);
- if ((uchar_t *)desthdr + ehdrlen > endptr)
- return (B_FALSE);
- nexthdrp = &desthdr->ip6d_nxt;
- break;
- case IPPROTO_ROUTING:
- rthdr = (ip6_rthdr_t *)whereptr;
- ehdrlen = 8 * (rthdr->ip6r_len + 1);
- if ((uchar_t *)rthdr + ehdrlen > endptr)
- return (B_FALSE);
- nexthdrp = &rthdr->ip6r_nxt;
- break;
- case IPPROTO_FRAGMENT:
- fraghdr = (ip6_frag_t *)whereptr;
- ehdrlen = sizeof (ip6_frag_t);
- if ((uchar_t *)&fraghdr[1] > endptr)
- return (B_FALSE);
- nexthdrp = &fraghdr->ip6f_nxt;
- break;
- case IPPROTO_NONE:
- /* No next header means we're finished */
- default:
- *hdr_length_ptr = length;
- *nexthdrpp = nexthdrp;
- return (B_TRUE);
- }
- length += ehdrlen;
- whereptr += ehdrlen;
- *hdr_length_ptr = length;
- *nexthdrpp = nexthdrp;
- }
- switch (*nexthdrp) {
- case IPPROTO_HOPOPTS:
- case IPPROTO_DSTOPTS:
- case IPPROTO_ROUTING:
- case IPPROTO_FRAGMENT:
- /*
- * If any know extension headers are still to be processed,
- * the packet's malformed (or at least all the IP header(s) are
- * not in the same mblk - and that should never happen.
- */
- return (B_FALSE);
-
- default:
- /*
- * If we get here, we know that all of the IP headers were in
- * the same mblk, even if the ULP header is in the next mblk.
- */
- *hdr_length_ptr = length;
- *nexthdrpp = nexthdrp;
- return (B_TRUE);
- }
-}
-
-/*
* Return the length of the IPv6 related headers (including extension headers)
* Returns a length even if the packet is malformed.
*/
-int
+uint16_t
ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h)
{
uint16_t hdr_len;
- uint8_t *nexthdrp;
- (void) ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, &nexthdrp);
+ (void) ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, NULL);
return (hdr_len);
}
diff --git a/usr/src/uts/common/inet/ip/ip6_input.c b/usr/src/uts/common/inet/ip/ip6_input.c
index cdff35273e..066b5c3f56 100644
--- a/usr/src/uts/common/inet/ip/ip6_input.c
+++ b/usr/src/uts/common/inet/ip/ip6_input.c
@@ -23,7 +23,7 @@
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -1903,13 +1903,12 @@ ip_input_cksum_v6(iaflags_t iraflags, mblk_t *mp, ip6_t *ip6h,
return (ip_input_sw_cksum_v6(mp, ip6h, ira));
}
+ hck_flags = DB_CKSUMFLAGS(mp);
+
/*
* We apply this for all ULP protocols. Does the HW know to
* not set the flags for SCTP and other protocols.
*/
-
- hck_flags = DB_CKSUMFLAGS(mp);
-
if (hck_flags & HCK_FULLCKSUM_OK) {
/*
* Hardware has already verified the checksum.
diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c
index aea49c19d3..cd6c50c446 100644
--- a/usr/src/uts/common/inet/ip/ip_input.c
+++ b/usr/src/uts/common/inet/ip/ip_input.c
@@ -23,7 +23,7 @@
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -57,6 +57,7 @@
#include <sys/vtrace.h>
#include <sys/isa_defs.h>
#include <sys/mac.h>
+#include <sys/mac_client.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
@@ -659,11 +660,12 @@ ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
}
/*
- * If there is a good HW IP header checksum we clear the need
+ * If the packet originated from a same-machine sender or
+ * there is a good HW IP header checksum, we clear the need to
* look at the IP header checksum.
*/
- if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
- ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
+ if (((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
+ ILL_HCKSUM_CAPABLE(ill) && dohwcksum)) {
/* Header checksum was ok. Clear the flag */
DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
@@ -1134,8 +1136,12 @@ ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha,
icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira);
return;
}
+
+ /*
+ * Count the forward as a hop and update the checksum
+ * accordingly.
+ */
ipha->ipha_ttl--;
- /* Adjust the checksum to reflect the ttl decrement. */
sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
@@ -2240,6 +2246,7 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
/* No ULP checksum to verify. */
return (B_TRUE);
}
+
/*
* Revert to software checksum calculation if the interface
* isn't capable of checksum offload.
@@ -2252,13 +2259,12 @@ ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
return (ip_input_sw_cksum_v4(mp, ipha, ira));
}
+ hck_flags = DB_CKSUMFLAGS(mp);
+
/*
* We apply this for all ULP protocols. Does the HW know to
* not set the flags for SCTP and other protocols.
*/
-
- hck_flags = DB_CKSUMFLAGS(mp);
-
if (hck_flags & HCK_FULLCKSUM_OK) {
/*
* Hardware has already verified the checksum.
diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h
index 4f5b81c12f..01c25b52b5 100644
--- a/usr/src/uts/common/inet/ip6.h
+++ b/usr/src/uts/common/inet/ip6.h
@@ -23,6 +23,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _INET_IP6_H
@@ -255,7 +256,7 @@ extern in6_addr_t ip_get_dst_v6(ip6_t *, const mblk_t *, boolean_t *);
extern ip6_rthdr_t *ip_find_rthdr_v6(ip6_t *, uint8_t *);
extern boolean_t ip_hdr_length_nexthdr_v6(mblk_t *, ip6_t *,
uint16_t *, uint8_t **);
-extern int ip_hdr_length_v6(mblk_t *, ip6_t *);
+extern uint16_t ip_hdr_length_v6(mblk_t *, ip6_t *);
extern uint32_t ip_massage_options_v6(ip6_t *, ip6_rthdr_t *, netstack_t *);
extern void ip_forward_xmit_v6(nce_t *, mblk_t *, ip6_t *, ip_recv_attr_t *,
uint32_t, uint32_t);
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index 2b37528eb9..87086b4c17 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _INET_IP_IMPL_H
@@ -159,9 +160,24 @@ extern "C" {
#define ILL_DIRECT_CAPABLE(ill) \
(((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0)
-/* This macro is used by the mac layer */
+/*
+ * Determine if a mblk needs to take the "slow path", aka OTH
+ * softring. There are multiple reasons why a mblk might take the slow
+ * path.
+ *
+ * o The mblk is not a data message.
+ *
+ * o There is more than one outstanding reference to the mblk.
+ *
+ * o The IP header is not aligned (we assume alignment in the checksum
+ * routine).
+ *
+ * o The mblk doesn't contain enough data to populate a simple IP header.
+ */
#define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \
- (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \
+ (DB_TYPE(mp) != M_DATA || \
+ (DB_REF(mp) != 1) || \
+ !OK_32PTR(ipha) || \
(((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr))
/*
diff --git a/usr/src/uts/common/io/bridge.c b/usr/src/uts/common/io/bridge.c
index bc54527515..389948e295 100644
--- a/usr/src/uts/common/io/bridge.c
+++ b/usr/src/uts/common/io/bridge.c
@@ -23,6 +23,7 @@
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -41,6 +42,7 @@
#include <sys/modctl.h>
#include <sys/note.h>
#include <sys/param.h>
+#include <sys/pattr.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/stat.h>
@@ -1705,7 +1707,12 @@ reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid)
if (mp == NULL)
return (mp);
- /* No forwarded packet can have hardware checksum enabled */
+ /*
+ * A forwarded packet cannot have hardware offloads enabled
+ * because we don't know if the destination can handle them.
+ * By this point, any hardware offloads present should have
+ * been emulated.
+ */
DB_CKSUMFLAGS(mp) = 0;
/* Get the no-modification cases out of the way first */
@@ -1907,17 +1914,22 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
blp->bl_trillthreads++;
mutex_exit(&blp->bl_trilllock);
update_header(mp, hdr_info, B_FALSE);
- if (is_xmit)
- mp = mac_fix_cksum(mp);
- /* all trill data frames have Inner.VLAN */
+
+ /*
+ * All trill data frames have
+ * Inner.VLAN.
+ */
mp = reform_vlan_header(mp, vlanid, tci, 0);
+
if (mp == NULL) {
KIINCR(bki_drops);
- fwd_unref(bfp);
- return (NULL);
+ goto done;
}
+
trill_encap_fn(tdp, blp, hdr_info, mp,
bfp->bf_trill_nick);
+
+done:
mutex_enter(&blp->bl_trilllock);
if (--blp->bl_trillthreads == 0 &&
blp->bl_trilldata == NULL)
@@ -1959,17 +1971,16 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
mpsend = copymsg(mp);
}
- if (!from_trill && is_xmit)
- mpsend = mac_fix_cksum(mpsend);
-
mpsend = reform_vlan_header(mpsend, vlanid, tci,
blpsend->bl_pvid);
+
if (mpsend == NULL) {
KIINCR(bki_drops);
continue;
}
KIINCR(bki_forwards);
+
/*
* No need to bump up the link reference count, as
* the forwarding entry itself holds a reference to
@@ -1979,11 +1990,12 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
mac_rx_common(blpsend->bl_mh, NULL, mpsend);
} else {
KLPINCR(blpsend, bkl_xmit);
- MAC_RING_TX(blpsend->bl_mh, NULL, mpsend,
+ mpsend = mac_ring_tx(blpsend->bl_mh, NULL,
mpsend);
freemsg(mpsend);
}
}
+
/*
* Handle a special case: if we're transmitting to the original
* link, then check whether the localaddr flag is set. If it
@@ -2070,11 +2082,9 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
mpsend = copymsg(mp);
}
- if (!from_trill && is_xmit)
- mpsend = mac_fix_cksum(mpsend);
-
mpsend = reform_vlan_header(mpsend, vlanid, tci,
blpsend->bl_pvid);
+
if (mpsend == NULL) {
KIINCR(bki_drops);
continue;
@@ -2084,10 +2094,13 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp,
KIINCR(bki_unknown);
else
KIINCR(bki_mbcast);
+
KLPINCR(blpsend, bkl_xmit);
- if ((mpcopy = copymsg(mpsend)) != NULL)
+ if ((mpcopy = copymsg(mpsend)) != NULL) {
mac_rx_common(blpsend->bl_mh, NULL, mpcopy);
- MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend);
+ }
+
+ mpsend = mac_ring_tx(blpsend->bl_mh, NULL, mpsend);
freemsg(mpsend);
link_unref(blpsend);
}
@@ -2465,7 +2478,7 @@ bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext)
(blp->bl_flags & BLF_SDUFAIL)))) {
KIINCR(bki_sent);
KLINCR(bkl_xmit);
- MAC_RING_TX(blp->bl_mh, rh, mpnext, mp);
+ mp = mac_ring_tx(blp->bl_mh, rh, mpnext);
return (mp);
}
@@ -2523,7 +2536,7 @@ bridge_xmit_cb(mac_handle_t mh, mac_ring_handle_t rh, mblk_t *mpnext)
B_FALSE, B_TRUE);
}
if (mp != NULL) {
- MAC_RING_TX(blp->bl_mh, rh, mp, mp);
+ mp = mac_ring_tx(blp->bl_mh, rh, mp);
if (mp == NULL) {
KIINCR(bki_sent);
KLINCR(bkl_xmit);
@@ -2589,7 +2602,7 @@ bridge_trill_decaps(bridge_link_t *blp, mblk_t *mp, uint16_t ingress_nick)
/* Deliver a copy locally as well */
if ((mpcopy = copymsg(mp)) != NULL)
mac_rx_common(blp->bl_mh, NULL, mpcopy);
- MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
+ mp = mac_ring_tx(blp->bl_mh, NULL, mp);
}
if (mp == NULL) {
KIINCR(bki_sent);
@@ -2610,7 +2623,7 @@ bridge_trill_output(bridge_link_t *blp, mblk_t *mp)
bridge_inst_t *bip = blp->bl_inst; /* used by macros */
mac_trill_snoop(blp->bl_mh, mp);
- MAC_RING_TX(blp->bl_mh, NULL, mp, mp);
+ mp = mac_ring_tx(blp->bl_mh, NULL, mp);
if (mp == NULL) {
KIINCR(bki_sent);
KLINCR(bkl_xmit);
diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c
index 6f9049b724..4099d0b801 100644
--- a/usr/src/uts/common/io/dls/dls_link.c
+++ b/usr/src/uts/common/io/dls/dls_link.c
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -566,7 +566,13 @@ dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
dls_head_t *dhp;
mod_hash_key_t key;
+ /*
+ * We expect to deal with only a single packet.
+ */
+ ASSERT3P(mp->b_next, ==, NULL);
+
DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err);
+
if (err != 0)
goto drop;
diff --git a/usr/src/uts/common/io/fcoe/fcoe_fc.c b/usr/src/uts/common/io/fcoe/fcoe_fc.c
index 42764e48d6..54402b027f 100644
--- a/usr/src/uts/common/io/fcoe/fcoe_fc.c
+++ b/usr/src/uts/common/io/fcoe/fcoe_fc.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -39,6 +40,7 @@
#include <sys/fcntl.h>
#include <sys/unistd.h>
#include <sys/mac_client.h>
+#include <sys/strsubr.h>
/*
* FCoE header files
@@ -209,6 +211,7 @@ tx_frame:
ret_cookie = mac_tx(mac->fm_cli_handle, FRM2MBLK(frm), 0,
MAC_TX_NO_ENQUEUE, &ret_mblk);
if (ret_cookie != (mac_tx_cookie_t)NULL) {
+ frm->frm_netb = ret_mblk;
mutex_enter(&mac->fm_mutex);
(void) cv_reltimedwait(&mac->fm_tx_cv, &mac->fm_mutex,
drv_usectohz(100000), TR_CLOCK_TICK);
@@ -265,7 +268,7 @@ fcoe_alloc_netb(fcoe_port_t *eport, uint32_t fc_frame_size, uint8_t **ppfc)
static void
fcoe_free_netb(void *netb)
{
- freeb((mblk_t *)netb);
+ freemsgchain((mblk_t *)netb);
}
fcoe_frame_t *
diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c
index 76b4765de6..0a52043a15 100644
--- a/usr/src/uts/common/io/mac/mac.c
+++ b/usr/src/uts/common/io/mac/mac.c
@@ -1753,7 +1753,7 @@ mac_client_clear_flow_cb(mac_client_handle_t mch)
flow_entry_t *flent = mcip->mci_flent;
mutex_enter(&flent->fe_lock);
- flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
flent->fe_cb_arg1 = NULL;
flent->fe_cb_arg2 = NULL;
flent->fe_flags |= FE_MC_NO_DATAPATH;
@@ -1936,8 +1936,7 @@ mac_hwring_send_priv(mac_client_handle_t mch, mac_ring_handle_t rh, mblk_t *mp)
mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
mac_impl_t *mip = mcip->mci_mip;
- MAC_TX(mip, rh, mp, mcip);
- return (mp);
+ return (mac_provider_tx(mip, rh, mp, mcip));
}
/*
@@ -4712,9 +4711,9 @@ mac_group_remmac(mac_group_t *group, const uint8_t *addr)
}
/*
- * This is the entry point for packets transmitted through the bridging code.
- * If no bridge is in place, MAC_RING_TX transmits using tx ring. The 'rh'
- * pointer may be NULL to select the default ring.
+ * This is the entry point for packets transmitted through the bridge
+ * code. If no bridge is in place, mac_ring_tx() transmits via the tx
+ * ring. The 'rh' pointer may be NULL to select the default ring.
*/
mblk_t *
mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
@@ -4731,8 +4730,34 @@ mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
mac_bridge_ref_cb(mh, B_TRUE);
mutex_exit(&mip->mi_bridge_lock);
if (mh == NULL) {
- MAC_RING_TX(mip, rh, mp, mp);
+ mp = mac_ring_tx((mac_handle_t)mip, rh, mp);
} else {
+ /*
+ * The bridge may place this mblk on a provider's Tx
+ * path, a mac's Rx path, or both. Since we don't have
+ * enough information at this point, we can't be sure
+ * that the destination(s) are capable of handling the
+ * hardware offloads requested by the mblk. We emulate
+ * them here as it is the safest choice. In the
+ * future, if bridge performance becomes a priority,
+ * we can elide the emulation here and leave the
+ * choice up to bridge.
+ *
+ * We don't clear the DB_CKSUMFLAGS here because
+ * HCK_IPV4_HDRCKSUM (Tx) and HCK_IPV4_HDRCKSUM_OK
+ * (Rx) still have the same value. If the bridge
+ * receives a packet from a HCKSUM_IPHDRCKSUM NIC then
+ * the mac(s) it is forwarded on may calculate the
+ * checksum again, but incorrectly (because the
+ * checksum field is not zero). Until the
+ * HCK_IPV4_HDRCKSUM/HCK_IPV4_HDRCKSUM_OK issue is
+ * resolved, we leave the flag clearing in bridge
+ * itself.
+ */
+ if ((DB_CKSUMFLAGS(mp) & (HCK_TX_FLAGS | HW_LSO_FLAGS)) != 0) {
+ mac_hw_emul(&mp, NULL, NULL, MAC_ALL_EMULS);
+ }
+
mp = mac_bridge_tx_cb(mh, rh, mp);
mac_bridge_ref_cb(mh, B_FALSE);
}
@@ -8804,3 +8829,52 @@ mac_led_set(mac_handle_t mh, mac_led_mode_t desired)
return (ret);
}
+
+/*
+ * Transmit 'mp' on the Tx ring 'mrh'. A NULL 'mrh' selects the MAC's
+ * default Tx ring; if the MAC has no default ring either, the packet
+ * is handed directly to the provider's mi_tx entry point. The return
+ * value is whatever the selected transmit routine returns.
+ *
+ * NOTE(review): no hardware offload emulation is performed in this
+ * function itself; presumably callers emulate any offloads the
+ * provider lacks before calling in -- confirm.
+ */
+mblk_t *
+mac_ring_tx(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp)
+{
+	mac_impl_t *mip = (mac_impl_t *)mh;
+	mac_ring_handle_t ring = mrh;
+
+	/* Fall back to the default Tx ring when none was requested. */
+	if (ring == NULL)
+		ring = mip->mi_default_tx_ring;
+
+	if (ring != NULL)
+		return (mac_hwring_tx(ring, mp));
+
+	return (mip->mi_tx(mip->mi_driver, mp));
+}
+
+/*
+ * Last stop before a packet reaches the underlying MAC provider, and
+ * the point at which the bridging hook is applied. When the MAC is
+ * part of a bridge the packet is diverted to mac_bridge_tx();
+ * bridged packets come back through there, with rh nulled out if the
+ * bridge forwards them out of a different link. Otherwise the packet
+ * goes straight to mac_ring_tx().
+ */
+mblk_t *
+mac_provider_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp,
+    mac_client_impl_t *mcip)
+{
+	/*
+	 * A client with a bound Hybrid I/O share has its Tx rings
+	 * mapped into the guest domain, where they can't be reached
+	 * from here, so its packets must use the default Tx ring.
+	 */
+	if ((mcip->mci_state_flags & MCIS_SHARE_BOUND) != 0)
+		rh = mip->mi_default_tx_ring;
+
+	/* Hand the packet to any promiscuous callbacks on this MAC. */
+	if (mip->mi_promisc_list != NULL)
+		mac_promisc_dispatch(mip, mp, mcip, B_FALSE);
+
+	if (mip->mi_bridge_link != NULL)
+		return (mac_bridge_tx(mip, rh, mp));
+
+	return (mac_ring_tx((mac_handle_t)mip, rh, mp));
+}
diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c
index 1ff33c3578..5302b89196 100644
--- a/usr/src/uts/common/io/mac/mac_bcast.c
+++ b/usr/src/uts/common/io/mac/mac_bcast.c
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -146,7 +147,7 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
uint64_t gen;
uint_t i;
mblk_t *mp_chain1;
- flow_entry_t *flent;
+ flow_entry_t *flent;
int err;
rw_enter(&mip->mi_rw_lock, RW_READER);
@@ -182,13 +183,6 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
*/
if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL)
break;
- /*
- * Fix the checksum for packets originating
- * from the local machine.
- */
- if ((src_mcip != NULL) &&
- (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL)
- break;
FLOW_TRY_REFHOLD(flent, err);
if (err != 0) {
@@ -246,7 +240,8 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
MCIP_STAT_UPDATE(src_mcip, brdcstxmt, 1);
MCIP_STAT_UPDATE(src_mcip, brdcstxmtbytes, msgdsize(mp_chain));
- MAC_TX(mip, mip->mi_default_tx_ring, mp_chain, src_mcip);
+ mp_chain = mac_provider_tx(mip, mip->mi_default_tx_ring,
+ mp_chain, src_mcip);
if (mp_chain != NULL)
freemsgchain(mp_chain);
} else {
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
index 7ff05f2ab6..605cb51bf7 100644
--- a/usr/src/uts/common/io/mac/mac_client.c
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -115,6 +115,7 @@
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
+#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <sys/modhash.h>
#include <sys/mac_impl.h>
@@ -1357,7 +1358,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name,
mcip->mci_mip = mip;
mcip->mci_upper_mip = NULL;
- mcip->mci_rx_fn = mac_pkt_drop;
+ mcip->mci_rx_fn = mac_rx_def;
mcip->mci_rx_arg = NULL;
mcip->mci_rx_p_fn = NULL;
mcip->mci_rx_p_arg = NULL;
@@ -1629,7 +1630,7 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg)
void
mac_rx_clear(mac_client_handle_t mch)
{
- mac_rx_set(mch, mac_pkt_drop, NULL);
+ mac_rx_set(mch, mac_rx_def, NULL);
}
void
@@ -1641,7 +1642,7 @@ mac_rx_barrier(mac_client_handle_t mch)
i_mac_perim_enter(mip);
/* If a RX callback is set, quiesce and restart that datapath */
- if (mcip->mci_rx_fn != mac_pkt_drop) {
+ if (mcip->mci_rx_fn != mac_rx_def) {
mac_rx_client_quiesce(mch);
mac_rx_client_restart(mch);
}
@@ -2998,7 +2999,7 @@ mac_client_datapath_teardown(mac_client_handle_t mch, mac_unicast_impl_t *muip,
mac_misc_stat_delete(flent);
/* Initialize the receiver function to a safe routine */
- flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
flent->fe_cb_arg1 = NULL;
flent->fe_cb_arg2 = NULL;
@@ -3578,7 +3579,9 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint,
srs_tx = &srs->srs_tx;
if (srs_tx->st_mode == SRS_TX_DEFAULT &&
(srs->srs_state & SRS_ENQUEUED) == 0 &&
- mip->mi_nactiveclients == 1 && mp_chain->b_next == NULL) {
+ mip->mi_nactiveclients == 1 &&
+ mp_chain->b_next == NULL &&
+ (DB_CKSUMFLAGS(mp_chain) & HW_LSO) == 0) {
uint64_t obytes;
/*
@@ -3613,7 +3616,9 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint,
obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) :
msgdsize(mp_chain));
- MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip);
+ mp_chain = mac_provider_tx(mip, srs_tx->st_arg2, mp_chain,
+ mcip);
+
if (mp_chain == NULL) {
cookie = 0;
SRS_TX_STAT_UPDATE(srs, opackets, 1);
@@ -3625,7 +3630,74 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint,
mutex_exit(&srs->srs_lock);
}
} else {
- cookie = srs_tx->st_func(srs, mp_chain, hint, flag, ret_mp);
+ mblk_t *mp = mp_chain;
+ mblk_t *new_head = NULL;
+ mblk_t *new_tail = NULL;
+
+ /*
+ * There are occasions where the packets arriving here
+ * may request hardware offloads that are not
+ * available from the underlying MAC provider. This
+ * currently only happens when a packet is sent across
+ * the MAC-loopback path of one MAC and then forwarded
+ * (via IP) to another MAC that lacks one or more of
+ * the hardware offloads provided by the first one.
+ * However, in the future, we may choose to pretend
+ * all MAC providers support all offloads, performing
+ * emulation on Tx as needed.
+ *
+ * We iterate each mblk in-turn, emulating hardware
+ * offloads as required. From this process, we create
+ * a new chain. The new chain may be the same as the
+ * original chain (no hardware emulation needed), a
+ * collection of new mblks (hardware emulation
+ * needed), or a mix. At this point, the chain is safe
+ * for consumption by the underlying MAC provider and
+ * is passed down to the SRS.
+ */
+ while (mp != NULL) {
+ mblk_t *next = mp->b_next;
+ mblk_t *tail = NULL;
+ const uint16_t needed =
+ (DB_CKSUMFLAGS(mp) ^ mip->mi_tx_cksum_flags) &
+ DB_CKSUMFLAGS(mp);
+
+ mp->b_next = NULL;
+
+ if ((needed & (HCK_TX_FLAGS | HW_LSO_FLAGS)) != 0) {
+ mac_emul_t emul = 0;
+
+ if (needed & HCK_IPV4_HDRCKSUM)
+ emul |= MAC_IPCKSUM_EMUL;
+ if (needed & (HCK_PARTIALCKSUM | HCK_FULLCKSUM))
+ emul |= MAC_HWCKSUM_EMUL;
+ if (needed & HW_LSO)
+ emul = MAC_LSO_EMUL;
+
+ mac_hw_emul(&mp, &tail, NULL, emul);
+
+ if (mp == NULL) {
+ mp = next;
+ continue;
+ }
+ }
+
+ if (new_head == NULL) {
+ new_head = mp;
+ } else {
+ new_tail->b_next = mp;
+ }
+
+ new_tail = (tail == NULL) ? mp : tail;
+ mp = next;
+ }
+
+ if (new_head == NULL) {
+ cookie = 0;
+ goto done;
+ }
+
+ cookie = srs_tx->st_func(srs, new_head, hint, flag, ret_mp);
}
done:
@@ -4026,14 +4098,15 @@ mac_client_get_effective_resources(mac_client_handle_t mch,
* The unicast packets of MAC_CLIENT_PROMISC_FILTER callbacks are dispatched
* after classification by mac_rx_deliver().
*/
-
static void
mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
- boolean_t loopback)
+ boolean_t loopback, boolean_t local)
{
- mblk_t *mp_copy, *mp_next;
+ mblk_t *mp_next;
if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) {
+ mblk_t *mp_copy;
+
mp_copy = copymsg(mp);
if (mp_copy == NULL)
return;
@@ -4043,16 +4116,24 @@ mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
if (mp_copy == NULL)
return;
}
- mp_next = NULL;
- } else {
- mp_copy = mp;
- mp_next = mp->b_next;
+
+ /*
+ * There is code upstack that can't deal with message
+ * chains.
+ */
+ for (mblk_t *tmp = mp_copy; tmp != NULL; tmp = mp_next) {
+ mp_next = tmp->b_next;
+ tmp->b_next = NULL;
+ mpip->mpi_fn(mpip->mpi_arg, NULL, tmp, loopback);
+ }
+
+ return;
}
- mp_copy->b_next = NULL;
- mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback);
- if (mp_copy == mp)
- mp->b_next = mp_next;
+ mp_next = mp->b_next;
+ mp->b_next = NULL;
+ mpip->mpi_fn(mpip->mpi_arg, NULL, mp, loopback);
+ mp->b_next = mp_next;
}
/*
@@ -4094,7 +4175,7 @@ mac_is_mcast(mac_impl_t *mip, mblk_t *mp)
*/
void
mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
- mac_client_impl_t *sender)
+ mac_client_impl_t *sender, boolean_t local)
{
mac_promisc_impl_t *mpip;
mac_cb_t *mcb;
@@ -4134,8 +4215,10 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
if (is_sender ||
mpip->mpi_type == MAC_CLIENT_PROMISC_ALL ||
- is_mcast)
- mac_promisc_dispatch_one(mpip, mp, is_sender);
+ is_mcast) {
+ mac_promisc_dispatch_one(mpip, mp, is_sender,
+ local);
+ }
}
}
MAC_PROMISC_WALKER_DCR(mip);
@@ -4164,7 +4247,8 @@ mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain)
mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED &&
!is_mcast) {
- mac_promisc_dispatch_one(mpip, mp, B_FALSE);
+ mac_promisc_dispatch_one(mpip, mp, B_FALSE,
+ B_FALSE);
}
}
}
@@ -4278,8 +4362,9 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
mac_impl_t *mip = (mac_impl_t *)mh;
/*
- * if mi_nactiveclients > 1, only MAC_CAPAB_LEGACY, MAC_CAPAB_HCKSUM,
- * MAC_CAPAB_NO_NATIVEVLAN and MAC_CAPAB_NO_ZCOPY can be advertised.
+ * Some capabilities are restricted when there is more than one active
+ * client on the MAC resource. The ones noted below are safe,
+ * independent of that count.
*/
if (mip->mi_nactiveclients > 1) {
switch (cap) {
@@ -4287,6 +4372,7 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data)
return (B_TRUE);
case MAC_CAPAB_LEGACY:
case MAC_CAPAB_HCKSUM:
+ case MAC_CAPAB_LSO:
case MAC_CAPAB_NO_NATIVEVLAN:
break;
default:
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index e3b660c3b3..9a5f94e7d2 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -3476,7 +3476,7 @@ mac_srs_free(mac_soft_ring_set_t *mac_srs)
ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE |
SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE));
- mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE);
+ mac_drop_chain(mac_srs->srs_first, "SRS free");
mac_srs_ring_free(mac_srs);
mac_srs_soft_rings_free(mac_srs);
mac_srs_fanout_list_free(mac_srs);
diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c
index aa4985fe4c..62612122d6 100644
--- a/usr/src/uts/common/io/mac/mac_flow.c
+++ b/usr/src/uts/common/io/mac/mac_flow.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/strsun.h>
@@ -229,7 +230,7 @@ mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
/* Initialize the receiver function to a safe routine */
- flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
+ flent->fe_cb_fn = (flow_fn_t)mac_rx_def;
flent->fe_index = -1;
}
(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c
index fbeef1fd2f..ce986fd4bf 100644
--- a/usr/src/uts/common/io/mac/mac_provider.c
+++ b/usr/src/uts/common/io/mac/mac_provider.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
*/
@@ -116,6 +116,37 @@ mac_free(mac_register_t *mregp)
}
/*
+ * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS
+ * value.
+ */
+static uint16_t
+mac_features_to_flags(mac_handle_t mh)
+{
+ uint16_t flags = 0;
+ uint32_t cap_sum = 0;
+ mac_capab_lso_t cap_lso;
+
+ if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) {
+ if (cap_sum & HCKSUM_IPHDRCKSUM)
+ flags |= HCK_IPV4_HDRCKSUM;
+
+ if (cap_sum & HCKSUM_INET_PARTIAL)
+ flags |= HCK_PARTIALCKSUM;
+ else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
+ flags |= HCK_FULLCKSUM;
+ }
+
+ /*
+ * We don't need the information stored in 'cap_lso', but we
+ * need to pass a non-NULL pointer to appease the driver.
+ */
+ if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso))
+ flags |= HW_LSO;
+
+ return (flags);
+}
+
+/*
* mac_register() is how drivers register new MACs with the GLDv3
* framework. The mregp argument is allocated by drivers using the
* mac_alloc() function, and can be freed using mac_free() immediately upon
@@ -345,9 +376,13 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp)
mip, 0, &p0, TS_RUN, minclsyspri);
/*
- * Initialize the capabilities
+ * Cache the DB_CKSUMFLAGS that this MAC supports.
*/
+ mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip);
+ /*
+ * Initialize the capabilities
+ */
bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t));
bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t));
@@ -689,7 +724,7 @@ mac_trill_snoop(mac_handle_t mh, mblk_t *mp)
mac_impl_t *mip = (mac_impl_t *)mh;
if (mip->mi_promisc_list != NULL)
- mac_promisc_dispatch(mip, mp, NULL);
+ mac_promisc_dispatch(mip, mp, NULL, B_FALSE);
}
/*
@@ -709,7 +744,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
* this MAC, pass them a copy if appropriate.
*/
if (mip->mi_promisc_list != NULL)
- mac_promisc_dispatch(mip, mp_chain, NULL);
+ mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE);
if (mr != NULL) {
/*
@@ -969,12 +1004,33 @@ mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
}
/*
- * Invoked by driver as well as the framework to notify its capability change.
+ * The mac provider or mac framework calls this function when it wants
+ * to notify upstream consumers that the capabilities have changed and
+ * that they should modify their own internal state accordingly.
+ *
+ * We currently have no regard for the fact that a provider could
+ * decide to drop capabilities which would invalidate pending traffic.
+ * For example, if one was to disable the Tx checksum offload while
+ * TCP/IP traffic was being sent by mac clients relying on that
+ * feature, then those packets would hit the write with missing or
+ * partial checksums. A proper solution involves not only providing
+ * notfication, but also performing client quiescing. That is, a capab
+ * change should be treated as an atomic transaction that forms a
+ * barrier between traffic relying on the current capabs and traffic
+ * relying on the new capabs. In practice, simnet is currently the
+ * only provider that could hit this, and it's an easily avoidable
+ * situation (and at worst it should only lead to some dropped
+ * packets). But if we ever want better on-the-fly capab change to
+ * actual hardware providers, then we should give this update
+ * mechanism a proper implementation.
*/
void
mac_capab_update(mac_handle_t mh)
{
- /* Send MAC_NOTE_CAPAB_CHG notification */
+ /*
+ * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream
+ * clients to renegotiate capabilities.
+ */
i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
}
@@ -1277,6 +1333,19 @@ i_mac_notify_thread(void *arg)
}
/*
+ * Depending on which capabs have changed, the Tx
+ * checksum flags may also need to be updated.
+ */
+ if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) {
+ mac_perim_handle_t mph;
+ mac_handle_t mh = (mac_handle_t)mip;
+
+ mac_perim_enter_by_mh(mh, &mph);
+ mip->mi_tx_cksum_flags = mac_features_to_flags(mh);
+ mac_perim_exit(mph);
+ }
+
+ /*
* Do notification callbacks for each notification type.
*/
for (type = 0; type < MAC_NNOTE; type++) {
@@ -1542,15 +1611,22 @@ mac_hcksum_clone(const mblk_t *src, mblk_t *dst)
ASSERT3U(DB_TYPE(dst), ==, M_DATA);
/*
- * Do these assignments unconditionally, rather than only when flags is
- * non-zero. This protects a situation where zeroed hcksum data does
- * not make the jump onto an mblk_t with stale data in those fields.
+ * Do these assignments unconditionally, rather than only when
+ * flags is non-zero. This protects a situation where zeroed
+ * hcksum data does not make the jump onto an mblk_t with
+ * stale data in those fields. It's important to copy all
+ * possible flags (HCK_* as well as HW_*) and not just the
+ * checksum specific flags. Dropping flags during a clone
+ * could result in dropped packets. If the caller has good
+ * reason to drop those flags then it should do it manually,
+ * after the clone.
*/
- DB_CKSUMFLAGS(dst) = (DB_CKSUMFLAGS(src) & HCK_FLAGS);
+ DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src);
DB_CKSUMSTART(dst) = DB_CKSUMSTART(src);
DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src);
DB_CKSUMEND(dst) = DB_CKSUMEND(src);
DB_CKSUM16(dst) = DB_CKSUM16(src);
+ DB_LSOMSS(dst) = DB_LSOMSS(src);
}
void
diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c
index cbd5ce1e19..5b3e87dfd1 100644
--- a/usr/src/uts/common/io/mac/mac_sched.c
+++ b/usr/src/uts/common/io/mac/mac_sched.c
@@ -968,6 +968,7 @@
#include <sys/types.h>
#include <sys/callb.h>
+#include <sys/pattr.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
@@ -1327,7 +1328,7 @@ int mac_srs_worker_wakeup_ticks = 0;
* b_prev may be set to the fanout hint \
* hence can't use freemsg directly \
*/ \
- mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \
+ mac_drop_chain(mp_chain, "SRS Tx max queue"); \
DTRACE_PROBE1(tx_queued_hiwat, \
mac_soft_ring_set_t *, srs); \
enqueue = 0; \
@@ -1346,11 +1347,11 @@ int mac_srs_worker_wakeup_ticks = 0;
if (!(srs->srs_type & SRST_TX)) \
mutex_exit(&srs->srs_bw->mac_bw_lock);
-#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
- mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
+#define MAC_TX_SRS_DROP_MESSAGE(srs, chain, cookie, s) { \
+ mac_drop_chain((chain), (s)); \
/* increment freed stats */ \
- mac_srs->srs_tx.st_stat.mts_sdrops++; \
- cookie = (mac_tx_cookie_t)srs; \
+ (srs)->srs_tx.st_stat.mts_sdrops++; \
+ (cookie) = (mac_tx_cookie_t)(srs); \
}
#define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
@@ -2321,7 +2322,7 @@ check_again:
if (smcip->mci_mip->mi_promisc_list != NULL) {
mutex_exit(lock);
mac_promisc_dispatch(smcip->mci_mip,
- head, NULL);
+ head, NULL, B_FALSE);
mutex_enter(lock);
}
}
@@ -2893,7 +2894,7 @@ again:
mac_srs->srs_bw->mac_bw_sz -= sz;
mac_srs->srs_bw->mac_bw_drop_bytes += sz;
mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
- mac_pkt_drop(NULL, NULL, head, B_FALSE);
+ mac_drop_chain(head, "Rx no bandwidth");
goto leave_poll;
} else {
mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
@@ -3275,9 +3276,10 @@ mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
}
/*
- * mac_rx_srs_process
- *
- * Receive side routine called from the interrupt path.
+ * MAC SRS receive side routine. If the data is coming from the
+ * network (i.e. from a NIC) then this is called in interrupt context.
+ * If the data is coming from a local sender (e.g. mac_tx_send() or
+ * bridge_forward()) then this is not called in interrupt context.
*
* loopback is set to force a context switch on the loopback
* path between MAC clients.
@@ -3337,7 +3339,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
mac_bw->mac_bw_drop_bytes += sz;
mutex_exit(&mac_bw->mac_bw_lock);
mutex_exit(&mac_srs->srs_lock);
- mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ mac_drop_chain(mp_chain, "Rx no bandwidth");
return;
} else {
if ((mac_bw->mac_bw_sz + sz) <=
@@ -3459,7 +3461,8 @@ mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
if (flag & MAC_DROP_ON_NO_DESC) {
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie,
+ "Tx no desc");
} else {
if (mac_srs->srs_first != NULL)
wakeup_worker = B_FALSE;
@@ -3522,7 +3525,8 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
if (flag & MAC_DROP_ON_NO_DESC) {
if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie,
+ "Tx SRS hiwat");
} else {
MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
mp_chain, tail, cnt, sz);
@@ -3895,7 +3899,8 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
cookie = (mac_tx_cookie_t)mac_srs;
*ret_mp = mp_chain;
} else {
- MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie,
+ "Tx no bandwidth");
}
mutex_exit(&mac_srs->srs_lock);
return (cookie);
@@ -4342,7 +4347,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
msgdsize(mp));
CHECK_VID_AND_ADD_TAG(mp);
- MAC_TX(mip, ring, mp, src_mcip);
+ mp = mac_provider_tx(mip, ring, mp, src_mcip);
/*
* If the driver is out of descriptors and does a
@@ -4373,7 +4378,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
flow_entry_t *dst_flow_ent;
void *flow_cookie;
size_t pkt_size;
- mblk_t *mp1;
next = mp->b_next;
mp->b_next = NULL;
@@ -4388,44 +4392,12 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
dst_flow_ent = mac_tx_classify(mip, mp);
if (dst_flow_ent != NULL) {
- size_t hdrsize;
- int err = 0;
-
- if (mip->mi_info.mi_nativemedia == DL_ETHER) {
- struct ether_vlan_header *evhp =
- (struct ether_vlan_header *)mp->b_rptr;
-
- if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
- hdrsize = sizeof (*evhp);
- else
- hdrsize = sizeof (struct ether_header);
- } else {
- mac_header_info_t mhi;
-
- err = mac_header_info((mac_handle_t)mip,
- mp, &mhi);
- if (err == 0)
- hdrsize = mhi.mhi_hdrsize;
- }
-
/*
* Got a matching flow. It's either another
* MAC client, or a broadcast/multicast flow.
- * Make sure the packet size is within the
- * allowed size. If not drop the packet and
- * move to next packet.
*/
- if (err != 0 ||
- (pkt_size - hdrsize) > mip->mi_sdu_max) {
- oerrors++;
- DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
- mblk_t *, mp);
- freemsg(mp);
- mp = next;
- FLOW_REFRELE(dst_flow_ent);
- continue;
- }
flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
+
if (flow_cookie != NULL) {
/*
* The vnic_bcast_send function expects
@@ -4443,6 +4415,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
* bypass is set.
*/
boolean_t do_switch;
+
mac_client_impl_t *dst_mcip =
dst_flow_ent->fe_mcip;
@@ -4458,19 +4431,23 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
* check is done inside the MAC_TX()
* macro.
*/
- if (mip->mi_promisc_list != NULL)
- mac_promisc_dispatch(mip, mp, src_mcip);
+ if (mip->mi_promisc_list != NULL) {
+ mac_promisc_dispatch(mip, mp, src_mcip,
+ B_TRUE);
+ }
do_switch = ((src_mcip->mci_state_flags &
dst_mcip->mci_state_flags &
MCIS_CLIENT_POLL_CAPABLE) != 0);
- if ((mp1 = mac_fix_cksum(mp)) != NULL) {
+ mac_hw_emul(&mp, NULL, NULL, MAC_ALL_EMULS);
+ if (mp != NULL) {
(dst_flow_ent->fe_cb_fn)(
dst_flow_ent->fe_cb_arg1,
dst_flow_ent->fe_cb_arg2,
- mp1, do_switch);
+ mp, do_switch);
}
+
}
FLOW_REFRELE(dst_flow_ent);
} else {
@@ -4478,7 +4455,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
* Unknown destination, send via the underlying
* NIC.
*/
- MAC_TX(mip, ring, mp, src_mcip);
+ mp = mac_provider_tx(mip, ring, mp, src_mcip);
if (mp != NULL) {
/*
* Adjust for the last packet that
@@ -4827,7 +4804,7 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
if (flag & MAC_DROP_ON_NO_DESC) {
- mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ mac_drop_chain(mp_chain, "Tx softring no desc");
/* increment freed stats */
ringp->s_ring_drops += cnt;
cookie = (mac_tx_cookie_t)ringp;
@@ -4871,8 +4848,8 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
* b_prev may be set to the fanout hint
* hence can't use freemsg directly
*/
- mac_pkt_drop(NULL, NULL,
- mp_chain, B_FALSE);
+ mac_drop_chain(mp_chain,
+ "Tx softring max queue");
DTRACE_PROBE1(tx_queued_hiwat,
mac_soft_ring_t *, ringp);
enqueue = B_FALSE;
diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c
index f4d2a5ee81..c8a16e6fd3 100644
--- a/usr/src/uts/common/io/mac/mac_soft_ring.c
+++ b/usr/src/uts/common/io/mac/mac_soft_ring.c
@@ -242,7 +242,7 @@ mac_soft_ring_free(mac_soft_ring_t *softring)
ASSERT((softring->s_ring_state &
(S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) ==
(S_RING_CONDEMNED | S_RING_CONDEMNED_DONE));
- mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE);
+ mac_drop_chain(softring->s_ring_first, "softring free");
softring->s_ring_tx_arg2 = NULL;
mac_soft_ring_stat_delete(softring);
mac_callback_free(softring->s_ring_notify_cb_list);
diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c
index 924d018ad0..03da3a3504 100644
--- a/usr/src/uts/common/io/mac/mac_util.c
+++ b/usr/src/uts/common/io/mac/mac_util.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -48,6 +48,75 @@
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
+#include <inet/tcp.h>
+#include <inet/udp_impl.h>
+#include <inet/sctp_ip.h>
+
+/*
+ * The next two functions are used for dropping packets or chains of
+ * packets, respectively. We could use one function for both but
+ * separating the use cases allows us to specify intent and prevent
+ * dropping more data than intended.
+ *
+ * The purpose of these functions is to aid the debugging effort,
+ * especially in production. Rather than use freemsg()/freemsgchain(),
+ * it's preferable to use these functions when dropping a packet in
+ * the MAC layer. These functions should only be used during
+ * unexpected conditions. That is, any time a packet is dropped
+ * outside of the regular, successful datapath. Consolidating all
+ * drops on these functions allows the user to trace one location and
+ * determine why the packet was dropped based on the msg. It also
+ * allows the user to inspect the packet before it is freed. Finally,
+ * it allows the user to avoid tracing freemsg()/freemsgchain() thus
+ * keeping the hot path running as efficiently as possible.
+ *
+ * NOTE: At this time not all MAC drops are aggregated on these
+ * functions; but that is the plan. This comment should be erased once
+ * completed.
+ */
+
+/*PRINTFLIKE2*/
+void
+mac_drop_pkt(mblk_t *mp, const char *fmt, ...)
+{
+ va_list adx;
+ char msg[128];
+ char *msgp = msg;
+
+ ASSERT3P(mp->b_next, ==, NULL);
+
+ va_start(adx, fmt);
+ (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
+ va_end(adx);
+
+ DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
+ freemsg(mp);
+}
+
+/*PRINTFLIKE2*/
+void
+mac_drop_chain(mblk_t *chain, const char *fmt, ...)
+{
+ va_list adx;
+ char msg[128];
+ char *msgp = msg;
+
+ va_start(adx, fmt);
+ (void) vsnprintf(msgp, sizeof (msg), fmt, adx);
+ va_end(adx);
+
+ /*
+ * We could use freemsgchain() for the actual freeing but
+ * since we are already walking the chain to fire the dtrace
+ * probe we might as well free the msg here too.
+ */
+ for (mblk_t *mp = chain, *next; mp != NULL; ) {
+ next = mp->b_next;
+ DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp);
+ freemsg(mp);
+ mp = next;
+ }
+}
/*
* Copy an mblk, preserving its hardware checksum flags.
@@ -89,222 +158,1272 @@ mac_copymsgchain_cksum(mblk_t *mp)
}
/*
- * Process the specified mblk chain for proper handling of hardware
- * checksum offload. This routine is invoked for loopback traffic
- * between MAC clients.
- * The function handles a NULL mblk chain passed as argument.
+ * Calculate the ULP checksum for IPv4. Return true if the calculation
+ * was successful, or false if an error occurred. If the latter, place
+ * an error message into '*err'.
*/
-mblk_t *
-mac_fix_cksum(mblk_t *mp_chain)
+static boolean_t
+mac_sw_cksum_ipv4(mblk_t *mp, uint32_t ip_hdr_offset, ipha_t *ipha,
+ const char **err)
{
- mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
+ const uint8_t proto = ipha->ipha_protocol;
+ size_t len;
+ const uint32_t ip_hdr_sz = IPH_HDR_LENGTH(ipha);
+ /* ULP offset from start of L2. */
+ const uint32_t ulp_offset = ip_hdr_offset + ip_hdr_sz;
+ ipaddr_t src, dst;
+ uint32_t cksum;
+ uint16_t *up;
+
+ /*
+ * We need a pointer to the ULP checksum. We're assuming the
+ * ULP checksum pointer resides in the first mblk. Our native
+ * TCP stack should always put the headers in the first mblk,
+ * but currently we have no way to guarantee that other
+ * clients don't spread headers (or even header fields) across
+ * mblks.
+ */
+ switch (proto) {
+ case IPPROTO_TCP:
+ ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
+ if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
+ *err = "mblk doesn't contain TCP header";
+ goto bail;
+ }
+
+ up = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_sz);
+ cksum = IP_TCP_CSUM_COMP;
+ break;
+
+ case IPPROTO_UDP:
+ ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
+ if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
+ *err = "mblk doesn't contain UDP header";
+ goto bail;
+ }
+
+ up = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_sz);
+ cksum = IP_UDP_CSUM_COMP;
+ break;
+
+ case IPPROTO_SCTP: {
+ sctp_hdr_t *sctph;
+
+ ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
+ if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
+ *err = "mblk doesn't contain SCTP header";
+ goto bail;
+ }
+
+ sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
+ sctph->sh_chksum = 0;
+ sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
+ return (B_TRUE);
+ }
+
+ default:
+ *err = "unexpected protocol";
+ goto bail;
+
+ }
+
+ /* Pseudo-header checksum. */
+ src = ipha->ipha_src;
+ dst = ipha->ipha_dst;
+ len = ntohs(ipha->ipha_length) - ip_hdr_sz;
+
+ cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
+ cksum += htons(len);
+
+ /*
+ * We have already accounted for the pseudo checksum above.
+ * Make sure the ULP checksum field is zero before computing
+ * the rest.
+ */
+ *up = 0;
+ cksum = IP_CSUM(mp, ulp_offset, cksum);
+ *up = (uint16_t)(cksum ? cksum : ~cksum);
+
+ return (B_TRUE);
+
+bail:
+ return (B_FALSE);
+}
+
+/*
+ * Calculate the ULP checksum for IPv6. Return true if the calculation
+ * was successful, or false if an error occurred. If the latter, place
+ * an error message into '*err'.
+ */
+static boolean_t
+mac_sw_cksum_ipv6(mblk_t *mp, uint32_t ip_hdr_offset, const char **err)
+{
+ ip6_t *ip6h = (ip6_t *)(mp->b_rptr + ip_hdr_offset);
+ const uint8_t proto = ip6h->ip6_nxt;
+ const uint16_t *iphs = (uint16_t *)ip6h;
+ /* ULP offset from start of L2. */
+ uint32_t ulp_offset;
+ size_t len;
+ uint32_t cksum;
+ uint16_t *up;
+ uint16_t ip_hdr_sz;
+
+ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_sz, NULL)) {
+ *err = "malformed IPv6 header";
+ goto bail;
+ }
+
+ ulp_offset = ip_hdr_offset + ip_hdr_sz;
+
+ /*
+ * We need a pointer to the ULP checksum. We're assuming the
+ * ULP checksum pointer resides in the first mblk. Our native
+ * TCP stack should always put the headers in the first mblk,
+ * but currently we have no way to guarantee that other
+ * clients don't spread headers (or even header fields) across
+ * mblks.
+ */
+ switch (proto) {
+ case IPPROTO_TCP:
+ ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (tcph_t)));
+ if (MBLKL(mp) < (ulp_offset + sizeof (tcph_t))) {
+ *err = "mblk doesn't contain TCP header";
+ goto bail;
+ }
+
+ up = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_sz);
+ cksum = IP_TCP_CSUM_COMP;
+ break;
+
+ case IPPROTO_UDP:
+ ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (udpha_t)));
+ if (MBLKL(mp) < (ulp_offset + sizeof (udpha_t))) {
+ *err = "mblk doesn't contain UDP header";
+ goto bail;
+ }
+
+ up = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_sz);
+ cksum = IP_UDP_CSUM_COMP;
+ break;
+
+ case IPPROTO_SCTP: {
+ sctp_hdr_t *sctph;
+
+ ASSERT3U(MBLKL(mp), >=, (ulp_offset + sizeof (sctp_hdr_t)));
+ if (MBLKL(mp) < (ulp_offset + sizeof (sctp_hdr_t))) {
+ *err = "mblk doesn't contain SCTP header";
+ goto bail;
+ }
+
+ sctph = (sctp_hdr_t *)(mp->b_rptr + ulp_offset);
+ /*
+ * Zero out the checksum field to ensure proper
+ * checksum calculation.
+ */
+ sctph->sh_chksum = 0;
+ sctph->sh_chksum = sctp_cksum(mp, ulp_offset);
+ return (B_TRUE);
+ }
+
+ default:
+ *err = "unexpected protocol";
+ goto bail;
+ }
+
+ /*
+ * The payload length includes the payload and the IPv6
+ * extension headers; the idea is to subtract the extension
+ * header length to get the real payload length.
+ */
+ len = ntohs(ip6h->ip6_plen) - (ip_hdr_sz - IPV6_HDR_LEN);
+ cksum += len;
+
+ /*
+ * We accumulate the pseudo header checksum in cksum; then we
+ * call IP_CSUM to compute the checksum over the payload.
+ */
+ cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + iphs[8] + iphs[9] +
+ iphs[10] + iphs[11] + iphs[12] + iphs[13] + iphs[14] + iphs[15] +
+ iphs[16] + iphs[17] + iphs[18] + iphs[19];
+ cksum = IP_CSUM(mp, ulp_offset, cksum);
+
+ /* For UDP/IPv6 a zero UDP checksum is not allowed. Change to 0xffff */
+ if (proto == IPPROTO_UDP && cksum == 0)
+ cksum = ~cksum;
+
+ *up = (uint16_t)cksum;
+
+ return (B_TRUE);
+
+bail:
+ return (B_FALSE);
+}
+
+/*
+ * Perform software checksum on a single message, if needed. The
+ * emulation performed is determined by an intersection of the mblk's
+ * flags and the emul flags requested. The emul flags are documented
+ * in mac.h.
+ */
+static mblk_t *
+mac_sw_cksum(mblk_t *mp, mac_emul_t emul)
+{
+ mblk_t *skipped_hdr = NULL;
uint32_t flags, start, stuff, end, value;
+ uint32_t ip_hdr_offset;
+ uint16_t etype;
+ size_t ip_hdr_sz;
+ struct ether_header *ehp;
+ const char *err = "";
- for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
- uint16_t len;
- uint32_t offset;
- struct ether_header *ehp;
- uint16_t sap;
+ /*
+ * This function should only be called from mac_hw_emul()
+ * which handles mblk chains and the shared ref case.
+ */
+ ASSERT3P(mp->b_next, ==, NULL);
- mac_hcksum_get(mp, &start, &stuff, &end, &value, &flags);
- if (flags == 0)
- continue;
+ mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL);
+
+ flags = DB_CKSUMFLAGS(mp);
+
+ /* Why call this if checksum emulation isn't needed? */
+ ASSERT3U(flags & (HCK_FLAGS), !=, 0);
+
+ /*
+ * Ethernet, and optionally VLAN header. mac_hw_emul() has
+ * already verified we have enough data to read the L2 header.
+ */
+ ehp = (struct ether_header *)mp->b_rptr;
+ if (ntohs(ehp->ether_type) == VLAN_TPID) {
+ struct ether_vlan_header *evhp;
+
+ evhp = (struct ether_vlan_header *)mp->b_rptr;
+ etype = ntohs(evhp->ether_type);
+ ip_hdr_offset = sizeof (struct ether_vlan_header);
+ } else {
+ etype = ntohs(ehp->ether_type);
+ ip_hdr_offset = sizeof (struct ether_header);
+ }
+
+ /*
+ * If this packet isn't IP, then leave it alone. We don't want
+ * to affect non-IP traffic like ARP. Assume the IP header
+ * doesn't include any options, for now. We will use the
+ * correct size later after we know there are enough bytes to
+ * at least fill out the basic header.
+ */
+ switch (etype) {
+ case ETHERTYPE_IP:
+ ip_hdr_sz = sizeof (ipha_t);
+ break;
+ case ETHERTYPE_IPV6:
+ ip_hdr_sz = sizeof (ip6_t);
+ break;
+ default:
+ return (mp);
+ }
+
+ ASSERT3U(MBLKL(mp), >=, ip_hdr_offset);
+
+ /*
+ * If the first mblk of this packet contains only the ethernet
+ * header, skip past it for now. Packets with their data
+ * contained in only a single mblk can then use the fastpaths
+ * tuned to that possibility.
+ */
+ if (MBLKL(mp) == ip_hdr_offset) {
+ ip_hdr_offset -= MBLKL(mp);
+ /* This is guaranteed by mac_hw_emul(). */
+ ASSERT3P(mp->b_cont, !=, NULL);
+ skipped_hdr = mp;
+ mp = mp->b_cont;
+ }
+
+ /*
+ * Both full and partial checksum rely on finding the IP
+ * header in the current mblk. Our native TCP stack honors
+ * this assumption but it's prudent to guard our future
+ * clients that might not honor this contract.
+ */
+ ASSERT3U(MBLKL(mp), >=, ip_hdr_offset + ip_hdr_sz);
+ if (MBLKL(mp) < (ip_hdr_offset + ip_hdr_sz)) {
+ err = "mblk doesn't contain IP header";
+ goto bail;
+ }
+
+ /*
+ * We are about to modify the header mblk; make sure we are
+ * modifying our own copy. The code that follows assumes that
+ * the IP/ULP headers exist in this mblk (and drops the
+ * message if they don't).
+ */
+ if (DB_REF(mp) > 1) {
+ mblk_t *tmp = copyb(mp);
+
+ if (tmp == NULL) {
+ err = "copyb failed";
+ goto bail;
+ }
+
+ if (skipped_hdr != NULL) {
+ ASSERT3P(skipped_hdr->b_cont, ==, mp);
+ skipped_hdr->b_cont = tmp;
+ }
+
+ tmp->b_cont = mp->b_cont;
+ freeb(mp);
+ mp = tmp;
+ }
+
+ if (etype == ETHERTYPE_IP) {
+ ipha_t *ipha = (ipha_t *)(mp->b_rptr + ip_hdr_offset);
+
+ if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
+ if (!mac_sw_cksum_ipv4(mp, ip_hdr_offset, ipha, &err))
+ goto bail;
+ }
+
+ /* We always update the ULP checksum flags. */
+ if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
+ flags &= ~HCK_FULLCKSUM;
+ flags |= HCK_FULLCKSUM_OK;
+ value = 0;
+ }
/*
- * Since the processing of checksum offload for loopback
- * traffic requires modification of the packet contents,
- * ensure sure that we are always modifying our own copy.
+ * While unlikely, it's possible to write code that
+ * might end up calling mac_sw_cksum() twice on the
+ * same mblk (performing both LSO and checksum
+ * emulation in a single mblk chain loop -- the LSO
+ * emulation inserts a new chain into the existing
+ * chain and then the loop iterates back over the new
+ * segments and emulates the checksum a second time).
+ * Normally this wouldn't be a problem, because the
+ * HCK_*_OK flags are supposed to indicate that we
+ * don't need to perform the work. But
+ * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the
+ * same value; so we cannot use these flags to
+ * determine if the IP header checksum has already
+ * been calculated or not. For this reason, we zero
+ * out the checksum first. In the future, we
+ * should fix the HCK_* flags.
*/
- if (DB_REF(mp) > 1) {
- mp1 = copymsg(mp);
- if (mp1 == NULL)
- continue;
- mp1->b_next = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- if (prev != NULL)
- prev->b_next = mp1;
- else
- new_chain = mp1;
- mp = mp1;
+ if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
+ ipha->ipha_hdr_checksum = 0;
+ ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
+ flags &= ~HCK_IPV4_HDRCKSUM;
+ flags |= HCK_IPV4_HDRCKSUM_OK;
+ }
+ } else if (etype == ETHERTYPE_IPV6) {
+ /* There is no IP header checksum for IPv6. */
+ if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
+ if (!mac_sw_cksum_ipv6(mp, ip_hdr_offset, &err))
+ goto bail;
+ flags &= ~HCK_FULLCKSUM;
+ flags |= HCK_FULLCKSUM_OK;
+ value = 0;
}
+ }
+
+ /*
+ * Partial checksum is the same for both IPv4 and IPv6.
+ */
+ if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) {
+ uint16_t *up, partial, cksum;
+ uchar_t *ipp; /* ptr to beginning of IP header */
+
+ ipp = mp->b_rptr + ip_hdr_offset;
+ up = (uint16_t *)((uchar_t *)ipp + stuff);
+ partial = *up;
+ *up = 0;
+
+ ASSERT3S(end, >, start);
+ cksum = ~IP_CSUM_PARTIAL(mp, ip_hdr_offset + start, partial);
+ *up = cksum != 0 ? cksum : ~cksum;
+ }
+
+ /* We always update the ULP checksum flags. */
+ if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) {
+ flags &= ~HCK_PARTIALCKSUM;
+ flags |= HCK_FULLCKSUM_OK;
+ value = 0;
+ }
+
+ mac_hcksum_set(mp, start, stuff, end, value, flags);
+
+ /* Don't forget to reattach the header. */
+ if (skipped_hdr != NULL) {
+ ASSERT3P(skipped_hdr->b_cont, ==, mp);
/*
- * Ethernet, and optionally VLAN header.
+ * Duplicate the HCKSUM data into the header mblk.
+ * This mimics mac_add_vlan_tag which ensures that
+ * both the first mblk _and_ the first data bearing
+ * mblk possess the HCKSUM information. Consumers like
+ * IP will end up discarding the ether_header mblk, so
+ * for now, it is important that the data be available
+ * in both places.
*/
- /* LINTED: improper alignment cast */
- ehp = (struct ether_header *)mp->b_rptr;
- if (ntohs(ehp->ether_type) == VLAN_TPID) {
- struct ether_vlan_header *evhp;
+ mac_hcksum_clone(mp, skipped_hdr);
+ mp = skipped_hdr;
+ }
- ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header));
- /* LINTED: improper alignment cast */
- evhp = (struct ether_vlan_header *)mp->b_rptr;
- sap = ntohs(evhp->ether_type);
- offset = sizeof (struct ether_vlan_header);
+ return (mp);
+
+bail:
+ if (skipped_hdr != NULL) {
+ ASSERT3P(skipped_hdr->b_cont, ==, mp);
+ mp = skipped_hdr;
+ }
+
+ mac_drop_pkt(mp, err);
+ return (NULL);
+}
+
+/*
+ * Build a single data segment from an LSO packet. The mblk chain
+ * returned, seg_head, represents the data segment and is always
+ * exactly seg_len bytes long. The lso_mp and offset input/output
+ * parameters track our position in the LSO packet. This function
+ * exists solely as a helper to mac_sw_lso().
+ *
+ * Case A
+ *
+ * The current lso_mp is larger than the requested seg_len. The
+ * beginning of seg_head may start at the beginning of lso_mp or
+ * offset into it. In either case, a single mblk is returned, and
+ * *offset is updated to reflect our new position in the current
+ * lso_mp.
+ *
+ * +----------------------------+
+ * | in *lso_mp / out *lso_mp |
+ * +----------------------------+
+ * ^ ^
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^ ^
+ * | |
+ * in *offset = 0 out *offset = seg_len
+ *
+ * |------ seg_len ----|
+ *
+ *
+ * +------------------------------+
+ * | in *lso_mp / out *lso_mp |
+ * +------------------------------+
+ * ^ ^
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^ ^
+ * | |
+ * in *offset = N out *offset = N + seg_len
+ *
+ * |------ seg_len ----|
+ *
+ *
+ *
+ * Case B
+ *
+ * The requested seg_len consumes exactly the rest of the lso_mp.
+ * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr.
+ * The seg_head may start at the beginning of the lso_mp or at some
+ * offset into it. In either case we return a single mblk, reset
+ * *offset to zero, and walk to the next lso_mp.
+ *
+ * +------------------------+ +------------------------+
+ * | in *lso_mp |---------->| out *lso_mp |
+ * +------------------------+ +------------------------+
+ * ^ ^ ^
+ * | | |
+ * | | out *offset = 0
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^
+ * |
+ * in *offset = 0
+ *
+ * |------ seg_len ----|
+ *
+ *
+ *
+ * +----------------------------+ +------------------------+
+ * | in *lso_mp |---------->| out *lso_mp |
+ * +----------------------------+ +------------------------+
+ * ^ ^ ^
+ * | | |
+ * | | out *offset = 0
+ * | |
+ * +------------------------+
+ * | seg_head |
+ * +------------------------+
+ * ^
+ * |
+ * in *offset = N
+ *
+ * |------ seg_len ----|
+ *
+ *
+ * Case C
+ *
+ * The requested seg_len is greater than the current lso_mp. In
+ * this case we must consume LSO mblks until we have enough data to
+ * satisfy either case (A) or (B) above. We will return multiple
+ * mblks linked via b_cont, offset will be set based on the cases
+ * above, and lso_mp will walk forward at least one mblk, but maybe
+ * more.
+ *
+ * N.B. This diagram is not exhaustive. The seg_head may start on
+ * the beginning of an lso_mp. The seg_tail may end exactly on the
+ * boundary of an lso_mp. And there may be two (in this case the
+ * middle block wouldn't exist), three, or more mblks in the
+ * seg_head chain. This is meant as one example of what might
+ * happen. The main thing to remember is that the seg_tail mblk
+ * must be one of case (A) or (B) above.
+ *
+ * +------------------+ +----------------+ +------------------+
+ * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp |
+ * +------------------+ +----------------+ +------------------+
+ * ^ ^ ^ ^ ^ ^
+ * | | | | | |
+ * | | | | | |
+ * | | | | | |
+ * | | | | | |
+ * +------------+ +----------------+ +------------+
+ * | seg_head |--->| |--->| seg_tail |
+ * +------------+ +----------------+ +------------+
+ * ^ ^
+ * | |
+ * in *offset = N out *offset = MBLKL(seg_tail)
+ *
+ * |------------------- seg_len -------------------|
+ *
+ */
+static mblk_t *
+build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len)
+{
+ mblk_t *seg_head, *seg_tail, *seg_mp;
+
+ ASSERT3P(*lso_mp, !=, NULL);
+ ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr);
+
+ seg_mp = dupb(*lso_mp);
+ if (seg_mp == NULL)
+ return (NULL);
+
+ seg_head = seg_mp;
+ seg_tail = seg_mp;
+
+ /* Continue where we left off from in the lso_mp. */
+ seg_mp->b_rptr += *offset;
+
+last_mblk:
+ /* Case (A) */
+ if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) {
+ *offset += seg_len;
+ seg_mp->b_wptr = seg_mp->b_rptr + seg_len;
+ return (seg_head);
+ }
+
+ /* Case (B) */
+ if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) {
+ *offset = 0;
+ *lso_mp = (*lso_mp)->b_cont;
+ return (seg_head);
+ }
+
+ /* Case (C) */
+ ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr);
+
+ /*
+ * The current LSO mblk doesn't have enough data to satisfy
+ * seg_len -- continue peeling off LSO mblks to build the new
+ * segment message. If allocation fails we free the previously
+ * allocated segment mblks and return NULL.
+ */
+ while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) {
+ ASSERT3U(MBLKL(seg_mp), <=, seg_len);
+ seg_len -= MBLKL(seg_mp);
+ *offset = 0;
+ *lso_mp = (*lso_mp)->b_cont;
+ seg_mp = dupb(*lso_mp);
+
+ if (seg_mp == NULL) {
+ freemsgchain(seg_head);
+ return (NULL);
+ }
+
+ seg_tail->b_cont = seg_mp;
+ seg_tail = seg_mp;
+ }
+
+ /*
+ * We've walked enough LSO mblks that we can now satisfy the
+ * remaining seg_len. At this point we need to jump back to
+ * determine if we have arrived at case (A) or (B).
+ */
+
+ /* Just to be paranoid that we didn't underflow. */
+ ASSERT3U(seg_len, <, IP_MAXPACKET);
+ ASSERT3U(seg_len, >, 0);
+ goto last_mblk;
+}
+
+/*
+ * Perform software segmentation of a single LSO message. Take an LSO
+ * message as input and return head/tail pointers as output. This
+ * function should not be invoked directly but instead through
+ * mac_hw_emul().
+ *
+ * The resulting chain is comprised of multiple (nsegs) MSS sized
+ * segments. Each segment will consist of two or more mblks joined by
+ * b_cont: a header and one or more data mblks. The header mblk is
+ * allocated anew for each message. The first segment's header is used
+ * as a template for the rest with adjustments made for things such as
+ * ID, sequence, length, TCP flags, etc. The data mblks reference into
+ * the existing LSO mblk (passed in as omp) by way of dupb(). Their
+ * b_rptr/b_wptr values are adjusted to reference only the fraction of
+ * the LSO message they are responsible for. At the successful
+ * completion of this function the original mblk (omp) is freed,
+ * leaving the newly created segment chain as the only remaining
+ * reference to the data.
+ */
+static void
+mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail,
+ uint_t *count)
+{
+ uint32_t ocsum_flags, ocsum_start, ocsum_stuff;
+ uint32_t mss;
+ uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen;
+ uint32_t oleft;
+ uint_t nsegs, seg;
+ int len;
+
+ struct ether_vlan_header *oevh;
+ const ipha_t *oiph;
+ const tcph_t *otcph;
+ ipha_t *niph;
+ tcph_t *ntcph;
+ uint16_t ip_id;
+ uint32_t tcp_seq, tcp_sum, otcp_sum;
+
+ uint32_t offset;
+ mblk_t *odatamp;
+ mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp;
+ mblk_t *tmptail;
+
+ ASSERT3P(head, !=, NULL);
+ ASSERT3P(tail, !=, NULL);
+ ASSERT3P(count, !=, NULL);
+ ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0);
+
+ /* Assume we are dealing with a single LSO message. */
+ ASSERT3P(omp->b_next, ==, NULL);
+
+ /*
+ * XXX: This is a hack to deal with mac_add_vlan_tag().
+ *
+ * When VLANs are in play, mac_add_vlan_tag() creates a new
+ * mblk with just the ether_vlan_header and tacks it onto the
+ * front of 'omp'. This breaks the assumptions made below;
+ * namely that the TCP/IP headers are in the first mblk. In
+ * this case, since we already have to pay the cost of LSO
+ * emulation, we simply pull up everything. While this might
+ * seem irksome, keep in mind this will only apply in a couple
+ * of scenarios: a) an LSO-capable VLAN client sending to a
+ * non-LSO-capable client over the "MAC/bridge loopback"
+ * datapath or b) an LSO-capable VLAN client is sending to a
+ * client that, for whatever reason, doesn't have DLS-bypass
+ * enabled. Finally, we have to check for both a tagged and
+ * untagged sized mblk depending on if the mblk came via
+ * mac_promisc_dispatch() or mac_rx_deliver().
+ *
+ * In the future, two things should be done:
+ *
+ * 1. This function should make use of some yet to be
+ * implemented "mblk helpers". These helper functions would
+ * perform all the b_cont walking for us and guarantee safe
+ * access to the mblk data.
+ *
+ * 2. We should add some slop to the mblks so that
+ * mac_add_vlan_tag() can just edit the first mblk instead
+ * of allocating on the hot path.
+ */
+ if (MBLKL(omp) == sizeof (struct ether_vlan_header) ||
+ MBLKL(omp) == sizeof (struct ether_header)) {
+ mblk_t *tmp = msgpullup(omp, -1);
+
+ if (tmp == NULL) {
+ mac_drop_pkt(omp, "failed to pull up");
+ goto fail;
+ }
+
+ mac_hcksum_clone(omp, tmp);
+ freemsg(omp);
+ omp = tmp;
+ }
+
+ mss = DB_LSOMSS(omp);
+ ASSERT3U(msgsize(omp), <=, IP_MAXPACKET +
+ sizeof (struct ether_vlan_header));
+ opktlen = msgsize(omp);
+
+ /*
+ * First, get references to the IP and TCP headers and
+ * determine the total TCP length (header + data).
+ *
+ * Thanks to mac_hw_emul() we know that the first mblk must
+ * contain (at minimum) the full L2 header. However, this
+ * function assumes more than that. It assumes the L2/L3/L4
+ * headers are all contained in the first mblk of a message
+ * (i.e., no b_cont walking for headers). While this is a
+ * current reality (our native TCP stack and viona both
+ * enforce this) things may become more nuanced in the future
+ * (e.g. when introducing encap support or adding new
+ * clients). For now we guard against this case by dropping
+ * the packet.
+ */
+ oevh = (struct ether_vlan_header *)omp->b_rptr;
+ if (oevh->ether_tpid == htons(ETHERTYPE_VLAN))
+ oehlen = sizeof (struct ether_vlan_header);
+ else
+ oehlen = sizeof (struct ether_header);
+
+ ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t)));
+ if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) {
+ mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers");
+ goto fail;
+ }
+
+ oiph = (ipha_t *)(omp->b_rptr + oehlen);
+ oiphlen = IPH_HDR_LENGTH(oiph);
+ otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen);
+ otcphlen = TCP_HDR_LENGTH(otcph);
+
+ /*
+ * Currently we only support LSO for TCP/IPv4.
+ */
+ if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) {
+ mac_drop_pkt(omp, "LSO unsupported IP version: %uhh",
+ IPH_HDR_VERSION(oiph));
+ goto fail;
+ }
+
+ if (oiph->ipha_protocol != IPPROTO_TCP) {
+ mac_drop_pkt(omp, "LSO unsupported protocol: %uhh",
+ oiph->ipha_protocol);
+ goto fail;
+ }
+
+ if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) {
+ mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set");
+ goto fail;
+ }
+
+ ohdrslen = oehlen + oiphlen + otcphlen;
+ if ((len = MBLKL(omp)) < ohdrslen) {
+ mac_drop_pkt(omp, "LSO packet too short: %d < %u", len,
+ ohdrslen);
+ goto fail;
+ }
+
+ /*
+ * Either we have data in the first mblk or it's just the
+ * header. In either case, we need to set rptr to the start of
+ * the TCP data.
+ */
+ if (len > ohdrslen) {
+ odatamp = omp;
+ offset = ohdrslen;
+ } else {
+ ASSERT3U(len, ==, ohdrslen);
+ odatamp = omp->b_cont;
+ offset = 0;
+ }
+
+ /* Make sure we still have enough data. */
+ ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen);
+
+ /*
+ * If a MAC negotiated LSO then it must negotiate both
+ * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or
+ * HCKSUM_INET_PARTIAL; because both the IP and TCP headers
+ * change during LSO segmentation (only the 3 fields of the
+ * pseudo header checksum don't change: src, dst, proto). Thus
+ * we would expect these flags (HCK_IPV4_HDRCKSUM |
+ * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this
+ * function to emulate those checksums in software. However,
+ * that assumes a world where we only expose LSO if the
+ * underlying hardware exposes LSO. Moving forward the plan is
+ * to assume LSO in the upper layers and have MAC perform
+ * software LSO when the underlying provider doesn't support
+ * it. In such a world, if the provider doesn't support LSO
+ * but does support hardware checksum offload, then we could
+ * simply perform the segmentation and allow the hardware to
+ * calculate the checksums. To the hardware it's just another
+ * chain of non-LSO packets.
+ */
+ ASSERT3S(DB_TYPE(omp), ==, M_DATA);
+ ocsum_flags = DB_CKSUMFLAGS(omp);
+ ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0);
+ ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0);
+
+ /*
+ * If hardware only provides partial checksum then software
+ * must supply the pseudo-header checksum. In the case of LSO
+ * we leave the TCP length at zero to be filled in by
+ * hardware. This function must handle two scenarios.
+ *
+ * 1. Being called by a MAC client on the Rx path to segment
+ * an LSO packet and calculate the checksum.
+ *
+ * 2. Being called by a MAC provider to segment an LSO packet.
+ * In this case the LSO segmentation is performed in
+ * software (by this routine) but the MAC provider should
+ * still calculate the TCP/IP checksums in hardware.
+ *
+ * To elaborate on the second case: we cannot have the
+ * scenario where IP sends LSO packets but the underlying HW
+ * doesn't support checksum offload -- because in that case
+ * TCP/IP would calculate the checksum in software (for the
+ * LSO packet) but then MAC would segment the packet and have
+ * to redo all the checksum work. So IP should never do LSO
+ * if HW doesn't support both IP and TCP checksum.
+ */
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ ocsum_start = (uint32_t)DB_CKSUMSTART(omp);
+ ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp);
+ }
+
+ odatalen = opktlen - ohdrslen;
+
+ /*
+ * Subtract one to account for the case where the data length
+ * is evenly divisible by the MSS. Add one to account for the
+ * fact that the division will always result in one less
+ * segment than needed.
+ */
+ nsegs = ((odatalen - 1) / mss) + 1;
+ if (nsegs < 2) {
+ mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs);
+ goto fail;
+ }
+
+ DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph,
+ __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t,
+ nsegs);
+
+ seg_chain = NULL;
+ tmptail = seg_chain;
+ oleft = odatalen;
+
+ for (uint_t i = 0; i < nsegs; i++) {
+ boolean_t last_seg = ((i + 1) == nsegs);
+ uint32_t seg_len;
+
+ /*
+ * If we fail to allocate, then drop the partially
+ * allocated chain as well as the LSO packet. Let the
+ * sender deal with the fallout.
+ */
+ if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) {
+ freemsgchain(seg_chain);
+ mac_drop_pkt(omp, "failed to alloc segment header");
+ goto fail;
+ }
+ ASSERT3P(nhdrmp->b_cont, ==, NULL);
+
+ if (seg_chain == NULL) {
+ seg_chain = nhdrmp;
} else {
- sap = ntohs(ehp->ether_type);
- offset = sizeof (struct ether_header);
+ ASSERT3P(tmptail, !=, NULL);
+ tmptail->b_next = nhdrmp;
}
- if (MBLKL(mp) <= offset) {
- offset -= MBLKL(mp);
- if (mp->b_cont == NULL) {
- /* corrupted packet, skip it */
- if (prev != NULL)
- prev->b_next = mp->b_next;
- else
- new_chain = mp->b_next;
- mp1 = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- mp = mp1;
- continue;
- }
- mp = mp->b_cont;
+ tmptail = nhdrmp;
+
+ /*
+ * Calculate this segment's length. It's either the MSS
+ * or whatever remains for the last segment.
+ */
+ seg_len = last_seg ? oleft : mss;
+ ASSERT3U(seg_len, <=, mss);
+ ndatamp = build_data_seg(&odatamp, &offset, seg_len);
+
+ if (ndatamp == NULL) {
+ freemsgchain(seg_chain);
+ mac_drop_pkt(omp, "LSO failed to segment data");
+ goto fail;
}
- if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
- ipha_t *ipha = NULL;
+ /* Attach data mblk to header mblk. */
+ nhdrmp->b_cont = ndatamp;
+ DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO;
+ ASSERT3U(seg_len, <=, oleft);
+ oleft -= seg_len;
+ }
+
+ /* We should have consumed entire LSO msg. */
+ ASSERT3S(oleft, ==, 0);
+ ASSERT3P(odatamp, ==, NULL);
- /*
- * In order to compute the full and header
- * checksums, we need to find and parse
- * the IP and/or ULP headers.
- */
+ /*
+ * All seg data mblks are referenced by the header mblks, null
+ * out this pointer to catch any bad derefs.
+ */
+ ndatamp = NULL;
+
+ /*
+ * Set headers and checksum for first segment.
+ */
+ nhdrmp = seg_chain;
+ bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss);
+ niph->ipha_length = htons(oiphlen + otcphlen + mss);
+ niph->ipha_hdr_checksum = 0;
+ ip_id = ntohs(niph->ipha_ident);
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ tcp_seq = BE32_TO_U32(ntcph->th_seq);
+ tcp_seq += mss;
+
+ /*
+ * The first segment shouldn't:
+ *
+ * o indicate end of data transmission (FIN),
+ * o indicate immediate handling of the data (PUSH).
+ */
+ ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
+
+ /*
+ * If the underlying HW provides partial checksum, then make
+ * sure to correct the pseudo header checksum before calling
+ * mac_sw_cksum(). The native TCP stack doesn't include the
+ * length field in the pseudo header when LSO is in play -- so
+ * we need to calculate it here.
+ */
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ tcp_sum = BE16_TO_U16(ntcph->th_sum);
+ otcp_sum = tcp_sum;
+ tcp_sum += mss + otcphlen;
+ tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ }
+
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ next_nhdrmp = nhdrmp->b_next;
+ nhdrmp->b_next = NULL;
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ nhdrmp->b_next = next_nhdrmp;
+ next_nhdrmp = NULL;
+
+ /*
+ * We may have freed the nhdrmp argument during
+ * checksum emulation, make sure that seg_chain
+ * references a valid mblk.
+ */
+ seg_chain = nhdrmp;
+ }
+
+ ASSERT3P(nhdrmp, !=, NULL);
- sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
+ seg = 1;
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss,
+ uint_t, seg);
+ seg++;
+ /* There better be at least 2 segs. */
+ ASSERT3P(nhdrmp->b_next, !=, NULL);
+ prev_nhdrmp = nhdrmp;
+ nhdrmp = nhdrmp->b_next;
+
+ /*
+ * Now adjust the headers of the middle segments. For each
+ * header we need to adjust the following.
+ *
+ * o IP ID
+ * o IP length
+ * o TCP sequence
+ * o TCP flags
+ * o cksum flags
+ * o cksum values (if MAC_HWCKSUM_EMUL is set)
+ */
+ for (; seg < nsegs; seg++) {
+ /*
+ * We use seg_chain as a reference to the first seg
+ * header mblk -- this first header is a template for
+ * the rest of the segments. This copy will include
+ * the now updated checksum values from the first
+ * header. We must reset these checksum values to
+ * their original to make sure we produce the correct
+ * value.
+ */
+ bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ niph->ipha_ident = htons(++ip_id);
+ ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss);
+ niph->ipha_length = htons(oiphlen + otcphlen + mss);
+ niph->ipha_hdr_checksum = 0;
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ U32_TO_BE32(tcp_seq, ntcph->th_seq);
+ tcp_seq += mss;
+ /*
+ * Just like the first segment, the middle segments
+ * shouldn't have these flags set.
+ */
+ ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH);
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
+
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
/*
- * IP header.
+ * First and middle segs have same
+ * pseudo-header checksum.
*/
- if (sap != ETHERTYPE_IP)
- continue;
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ }
- ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
- /* LINTED: improper alignment cast */
- ipha = (ipha_t *)(mp->b_rptr + offset);
-
- if (flags & HCK_FULLCKSUM) {
- ipaddr_t src, dst;
- uint32_t cksum;
- uint16_t *up;
- uint8_t proto;
-
- /*
- * Pointer to checksum field in ULP header.
- */
- proto = ipha->ipha_protocol;
- ASSERT(ipha->ipha_version_and_hdr_length ==
- IP_SIMPLE_HDR_VERSION);
-
- switch (proto) {
- case IPPROTO_TCP:
- /* LINTED: improper alignment cast */
- up = IPH_TCPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- break;
-
- case IPPROTO_UDP:
- /* LINTED: improper alignment cast */
- up = IPH_UDPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- break;
-
- default:
- cmn_err(CE_WARN, "mac_fix_cksum: "
- "unexpected protocol: %d", proto);
- continue;
- }
-
- /*
- * Pseudo-header checksum.
- */
- src = ipha->ipha_src;
- dst = ipha->ipha_dst;
- len = ntohs(ipha->ipha_length) -
- IP_SIMPLE_HDR_LENGTH;
-
- cksum = (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
- cksum += htons(len);
-
- /*
- * The checksum value stored in the packet needs
- * to be correct. Compute it here.
- */
- *up = 0;
- cksum += (((proto) == IPPROTO_UDP) ?
- IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
- cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
- offset, cksum);
- *(up) = (uint16_t)(cksum ? cksum : ~cksum);
-
- /*
- * Flag the packet so that it appears
- * that the checksum has already been
- * verified by the hardware.
- */
- flags &= ~HCK_FULLCKSUM;
- flags |= HCK_FULLCKSUM_OK;
- value = 0;
- }
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ next_nhdrmp = nhdrmp->b_next;
+ nhdrmp->b_next = NULL;
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ nhdrmp->b_next = next_nhdrmp;
+ next_nhdrmp = NULL;
+ /* We may have freed the original nhdrmp. */
+ prev_nhdrmp->b_next = nhdrmp;
+ }
- if (flags & HCK_IPV4_HDRCKSUM) {
- ASSERT(ipha != NULL);
- ipha->ipha_hdr_checksum =
- (uint16_t)ip_csum_hdr(ipha);
- flags &= ~HCK_IPV4_HDRCKSUM;
- flags |= HCK_IPV4_HDRCKSUM_OK;
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen),
+ uint_t, mss, uint_t, seg);
- }
+ ASSERT3P(nhdrmp->b_next, !=, NULL);
+ prev_nhdrmp = nhdrmp;
+ nhdrmp = nhdrmp->b_next;
+ }
+
+ /* Make sure we are on the last segment. */
+ ASSERT3U(seg, ==, nsegs);
+ ASSERT3P(nhdrmp->b_next, ==, NULL);
+
+ /*
+ * Now we set the last segment header. The difference being
+ * that FIN/PSH flags are allowed (RST packets were dropped earlier).
+ */
+ bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen);
+ nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen;
+ niph = (ipha_t *)(nhdrmp->b_rptr + oehlen);
+ niph->ipha_ident = htons(++ip_id);
+ len = msgsize(nhdrmp->b_cont);
+ ASSERT3S(len, >, 0);
+ niph->ipha_length = htons(oiphlen + otcphlen + len);
+ niph->ipha_hdr_checksum = 0;
+ ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen);
+ U32_TO_BE32(tcp_seq, ntcph->th_seq);
+
+ DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO);
+ if (ocsum_flags & HCK_PARTIALCKSUM) {
+ DB_CKSUMSTART(nhdrmp) = ocsum_start;
+ DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length);
+ DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff;
+ tcp_sum = otcp_sum;
+ tcp_sum += len + otcphlen;
+ tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF);
+ U16_TO_BE16(tcp_sum, ntcph->th_sum);
+ }
+
+ if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) &&
+ (emul & MAC_HWCKSUM_EMULS)) {
+ /* This should be the last mblk. */
+ ASSERT3P(nhdrmp->b_next, ==, NULL);
+ nhdrmp = mac_sw_cksum(nhdrmp, emul);
+ prev_nhdrmp->b_next = nhdrmp;
+ }
+
+ DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *,
+ (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *,
+ (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len,
+ uint_t, seg);
+
+ /*
+ * Free the reference to the original LSO message as it is
+ * being replaced by seg_chain.
+ */
+ freemsg(omp);
+ *head = seg_chain;
+ *tail = nhdrmp;
+ *count = nsegs;
+ return;
+
+fail:
+ *head = NULL;
+ *tail = NULL;
+ *count = 0;
+}
+
+#define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM)
+
+/*
+ * Emulate various hardware offload features in software. Take a chain
+ * of packets as input and emulate the hardware features specified in
+ * 'emul'. The resulting chain's head pointer replaces the 'mp_chain'
+ * pointer given as input, and its tail pointer is written to
+ * '*otail'. The number of packets in the new chain is written to
+ * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus
+ * may be NULL. The 'mp_chain' argument may point to a NULL chain; in
+ * which case 'mp_chain' will simply stay a NULL chain.
+ *
+ * While unlikely, it is technically possible that this function could
+ * receive a non-NULL chain as input and return a NULL chain as output
+ * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be
+ * zero). This could happen if all the packets in the chain are
+ * dropped or if we fail to allocate new mblks. In this case, there is
+ * nothing for the caller to free. In any event, the caller shouldn't
+ * assume that '*mp_chain' is non-NULL on return.
+ *
+ * This function was written with three main use cases in mind.
+ *
+ * 1. To emulate hardware offloads when traveling mac-loopback (two
+ * clients on the same mac). This is wired up in mac_tx_send().
+ *
+ * 2. To provide hardware offloads to the client when the underlying
+ * provider cannot. This is currently wired up in mac_tx() but we
+ * still only negotiate offloads when the underlying provider
+ * supports them.
+ *
+ * 3. To emulate real hardware in simnet.
+ */
+void
+mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul)
+{
+ mblk_t *head = NULL, *tail = NULL;
+ uint_t count = 0;
+
+ ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0);
+ ASSERT3P(mp_chain, !=, NULL);
+
+ for (mblk_t *mp = *mp_chain; mp != NULL; ) {
+ mblk_t *tmp, *next, *tmphead, *tmptail;
+ struct ether_header *ehp;
+ uint32_t flags;
+ uint_t len = MBLKL(mp), l2len;
+
+ /* Perform LSO/cksum one message at a time. */
+ next = mp->b_next;
+ mp->b_next = NULL;
+
+ /*
+ * For our sanity the first mblk should contain at
+ * least the full L2 header.
+ */
+ if (len < sizeof (struct ether_header)) {
+ mac_drop_pkt(mp, "packet too short (A): %u", len);
+ mp = next;
+ continue;
}
- if (flags & HCK_PARTIALCKSUM) {
- uint16_t *up, partial, cksum;
- uchar_t *ipp; /* ptr to beginning of IP header */
-
- if (mp->b_cont != NULL) {
- mblk_t *mp1;
-
- mp1 = msgpullup(mp, offset + end);
- if (mp1 == NULL)
- continue;
- mp1->b_next = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- if (prev != NULL)
- prev->b_next = mp1;
- else
- new_chain = mp1;
- mp = mp1;
- }
+ ehp = (struct ether_header *)mp->b_rptr;
+ if (ntohs(ehp->ether_type) == VLAN_TPID)
+ l2len = sizeof (struct ether_vlan_header);
+ else
+ l2len = sizeof (struct ether_header);
- ipp = mp->b_rptr + offset;
- /* LINTED: cast may result in improper alignment */
- up = (uint16_t *)((uchar_t *)ipp + stuff);
- partial = *up;
- *up = 0;
+ /*
+ * If the first mblk is solely the L2 header, then
+ * there better be more data.
+ */
+ if (len < l2len || (len == l2len && mp->b_cont == NULL)) {
+ mac_drop_pkt(mp, "packet too short (C): %u", len);
+ mp = next;
+ continue;
+ }
+
+ DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul);
+
+ /*
+ * We use DB_CKSUMFLAGS (instead of mac_hcksum_get())
+ * because we don't want to mask-out the LSO flag.
+ */
+ flags = DB_CKSUMFLAGS(mp);
- cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start,
- end - start, partial);
- cksum = ~cksum;
- *up = cksum ? cksum : ~cksum;
+ if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) {
+ uint_t tmpcount = 0;
/*
- * Since we already computed the whole checksum,
- * indicate to the stack that it has already
- * been verified by the hardware.
+ * LSO fix-up handles checksum emulation
+ * inline (if requested). It also frees mp.
*/
- flags &= ~HCK_PARTIALCKSUM;
- flags |= HCK_FULLCKSUM_OK;
- value = 0;
+ mac_sw_lso(mp, emul, &tmphead, &tmptail,
+ &tmpcount);
+ if (tmphead == NULL) {
+ /* mac_sw_lso() freed the mp. */
+ mp = next;
+ continue;
+ }
+ count += tmpcount;
+ } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) {
+ tmp = mac_sw_cksum(mp, emul);
+ if (tmp == NULL) {
+ /* mac_sw_cksum() freed the mp. */
+ mp = next;
+ continue;
+ }
+ tmphead = tmp;
+ tmptail = tmp;
+ count++;
+ } else {
+ /* There is nothing to emulate. */
+ tmp = mp;
+ tmphead = tmp;
+ tmptail = tmp;
+ count++;
+ }
+
+ /*
+ * The tmp mblk chain is either the start of the new
+ * chain or added to the tail of the new chain.
+ */
+ if (head == NULL) {
+ head = tmphead;
+ tail = tmptail;
+ } else {
+ /* Attach the new mblk to the end of the new chain. */
+ tail->b_next = tmphead;
+ tail = tmptail;
}
- mac_hcksum_set(mp, start, stuff, end, value, flags);
+ mp = next;
}
- return (new_chain);
+ *mp_chain = head;
+
+ if (otail != NULL)
+ *otail = tail;
+
+ if (ocount != NULL)
+ *ocount = count;
}
/*
@@ -449,17 +1568,10 @@ mac_strip_vlan_tag_chain(mblk_t *mp_chain)
*/
/* ARGSUSED */
void
-mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
+mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp_chain,
boolean_t loopback)
{
- mblk_t *mp1 = mp;
-
- while (mp1 != NULL) {
- mp1->b_prev = NULL;
- mp1->b_queue = NULL;
- mp1 = mp1->b_next;
- }
- freemsgchain(mp);
+ freemsgchain(mp_chain);
}
/*
diff --git a/usr/src/uts/common/io/simnet/simnet.c b/usr/src/uts/common/io/simnet/simnet.c
index 727fbbad8e..b215f6e94b 100644
--- a/usr/src/uts/common/io/simnet/simnet.c
+++ b/usr/src/uts/common/io/simnet/simnet.c
@@ -21,6 +21,8 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -51,6 +53,7 @@
#include <sys/atomic.h>
#include <sys/mac_wifi.h>
#include <sys/mac_impl.h>
+#include <sys/pattr.h>
#include <inet/wifi_ioctl.h>
#include <sys/thread.h>
#include <sys/synch.h>
@@ -107,14 +110,15 @@ static int simnet_m_stat(void *, uint_t, uint64_t *);
static void simnet_m_ioctl(void *, queue_t *, mblk_t *);
static mblk_t *simnet_m_tx(void *, mblk_t *);
static int simnet_m_setprop(void *, const char *, mac_prop_id_t,
- uint_t, const void *);
+ const uint_t, const void *);
static int simnet_m_getprop(void *, const char *, mac_prop_id_t,
uint_t, void *);
static void simnet_m_propinfo(void *, const char *, mac_prop_id_t,
mac_prop_info_handle_t);
+static boolean_t simnet_m_getcapab(void *, mac_capab_t, void *);
static mac_callbacks_t simnet_m_callbacks = {
- (MC_IOCTL | MC_SETPROP | MC_GETPROP | MC_PROPINFO),
+ (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO),
simnet_m_stat,
simnet_m_start,
simnet_m_stop,
@@ -124,7 +128,7 @@ static mac_callbacks_t simnet_m_callbacks = {
simnet_m_tx,
NULL,
simnet_m_ioctl,
- NULL,
+ simnet_m_getcapab,
NULL,
NULL,
simnet_m_setprop,
@@ -671,6 +675,12 @@ simnet_thread_unref(simnet_dev_t *sdev)
mutex_exit(&sdev->sd_instlock);
}
+/*
+ * TODO: Add properties to set Rx checksum flag behavior.
+ *
+ * o HCK_PARTIALCKSUM.
+ * o HCK_FULLCKSUM_OK.
+ */
static void
simnet_rx(void *arg)
{
@@ -683,7 +693,7 @@ simnet_rx(void *arg)
/* Check for valid packet header */
if (mac_header_info(sdev->sd_mh, mp, &hdr_info) != 0) {
- freemsg(mp);
+ mac_drop_pkt(mp, "invalid L2 header");
sdev->sd_stats.recv_errors++;
goto rx_done;
}
@@ -712,6 +722,16 @@ simnet_rx(void *arg)
}
}
+ /*
+ * We don't actually calculate and verify the IP header
+ * checksum because the nature of simnet makes it redundant to
+ * do so. The point is to test the presence of the flags. The
+ * Tx side will have already populated the checksum field.
+ */
+ if ((sdev->sd_rx_cksum & HCKSUM_IPHDRCKSUM) != 0) {
+ mac_hcksum_set(mp, 0, 0, 0, 0, HCK_IPV4_HDRCKSUM_OK);
+ }
+
sdev->sd_stats.recv_count++;
sdev->sd_stats.rbytes += msgdsize(mp);
mac_rx(sdev->sd_mh, NULL, mp);
@@ -719,19 +739,22 @@ rx_done:
simnet_thread_unref(sdev);
}
+#define SIMNET_ULP_CKSUM (HCKSUM_INET_FULL_V4 | HCKSUM_INET_PARTIAL)
+
static mblk_t *
simnet_m_tx(void *arg, mblk_t *mp_chain)
{
simnet_dev_t *sdev = arg;
simnet_dev_t *sdev_rx;
mblk_t *mpnext = mp_chain;
- mblk_t *mp;
+ mblk_t *mp, *nmp;
+ mac_emul_t emul = 0;
rw_enter(&simnet_dev_lock, RW_READER);
if ((sdev_rx = sdev->sd_peer_dev) == NULL) {
/* Discard packets when no peer exists */
rw_exit(&simnet_dev_lock);
- freemsgchain(mp_chain);
+ mac_drop_chain(mp_chain, "no peer");
return (NULL);
}
@@ -748,20 +771,20 @@ simnet_m_tx(void *arg, mblk_t *mp_chain)
*/
if (!simnet_thread_ref(sdev_rx)) {
rw_exit(&simnet_dev_lock);
- freemsgchain(mp_chain);
+ mac_drop_chain(mp_chain, "simnet peer dev not ready");
return (NULL);
}
rw_exit(&simnet_dev_lock);
if (!simnet_thread_ref(sdev)) {
simnet_thread_unref(sdev_rx);
- freemsgchain(mp_chain);
+ mac_drop_chain(mp_chain, "simnet dev not ready");
return (NULL);
}
while ((mp = mpnext) != NULL) {
- int len;
- int size;
+ size_t len;
+ size_t size;
mblk_t *mp_new;
mblk_t *mp_tmp;
@@ -775,7 +798,7 @@ simnet_m_tx(void *arg, mblk_t *mp_chain)
mp_new = allocb(size, BPRI_HI);
if (mp_new == NULL) {
sdev->sd_stats.xmit_errors++;
- freemsg(mp);
+ mac_drop_pkt(mp, "allocb failed");
continue;
}
bzero(mp_new->b_wptr, size);
@@ -789,25 +812,44 @@ simnet_m_tx(void *arg, mblk_t *mp_chain)
}
/* Pullup packet into a single mblk */
- if (!pullupmsg(mp, -1)) {
- sdev->sd_stats.xmit_errors++;
- freemsg(mp);
- continue;
- }
-
- /* Fix mblk checksum as the pkt dest is local */
- if ((mp = mac_fix_cksum(mp)) == NULL) {
+ if ((nmp = msgpullup(mp, -1)) == NULL) {
sdev->sd_stats.xmit_errors++;
+ mac_drop_pkt(mp, "msgpullup failed");
continue;
+ } else {
+ mac_hcksum_clone(mp, nmp);
+ freemsg(mp);
+ mp = nmp;
}
/* Hold reference for taskq receive processing per-pkt */
if (!simnet_thread_ref(sdev_rx)) {
- freemsg(mp);
- freemsgchain(mpnext);
+ mac_drop_pkt(mp, "failed to get thread ref");
+ mac_drop_chain(mpnext, "failed to get thread ref");
break;
}
+ if ((sdev->sd_tx_cksum & HCKSUM_IPHDRCKSUM) != 0)
+ emul |= MAC_IPCKSUM_EMUL;
+ if ((sdev->sd_tx_cksum & SIMNET_ULP_CKSUM) != 0)
+ emul |= MAC_HWCKSUM_EMUL;
+ if (sdev->sd_lso)
+ emul |= MAC_LSO_EMUL;
+
+ if (emul != 0)
+ mac_hw_emul(&mp, NULL, NULL, emul);
+
+ if (mp == NULL) {
+ sdev->sd_stats.xmit_errors++;
+ continue;
+ }
+
+ /*
+ * Remember, we are emulating a real NIC here; the
+ * checksum flags can't make the trip across the link.
+ */
+ DB_CKSUMFLAGS(mp) = 0;
+
/* Use taskq for pkt receive to avoid kernel stack explosion */
mp->b_next = (mblk_t *)sdev_rx;
if (ddi_taskq_dispatch(simnet_rxq, simnet_rx, mp,
@@ -886,6 +928,43 @@ simnet_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
miocack(q, mp, msgdsize(mp1), rc);
}
+static boolean_t
+simnet_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
+{
+ simnet_dev_t *sdev = arg;
+ const uint_t tcp_cksums = HCKSUM_INET_FULL_V4 | HCKSUM_INET_PARTIAL;
+
+ switch (cap) {
+ case MAC_CAPAB_HCKSUM: {
+ uint32_t *tx_cksum_flags = cap_data;
+ *tx_cksum_flags = sdev->sd_tx_cksum;
+ break;
+ }
+ case MAC_CAPAB_LSO: {
+ mac_capab_lso_t *cap_lso = cap_data;
+
+ if (sdev->sd_lso &&
+ (sdev->sd_tx_cksum & HCKSUM_IPHDRCKSUM) != 0 &&
+ (sdev->sd_tx_cksum & tcp_cksums) != 0) {
+ /*
+ * The LSO configuration is hardwired for now,
+ * but there's no reason we couldn't also make
+ * this configurable in the future.
+ */
+ cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
+ cap_lso->lso_basic_tcp_ipv4.lso_max = SD_LSO_MAXLEN;
+ break;
+ } else {
+ return (B_FALSE);
+ }
+ }
+ default:
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
static int
simnet_m_stat(void *arg, uint_t stat, uint64_t *val)
{
@@ -1142,20 +1221,20 @@ set_wl_esslist_priv_prop(simnet_wifidev_t *wdev, uint_t pr_valsize,
}
static int
-simnet_set_priv_prop(simnet_dev_t *sdev, const char *pr_name,
- uint_t pr_valsize, const void *pr_val)
+simnet_set_priv_prop_wifi(simnet_dev_t *sdev, const char *name,
+ const uint_t len, const void *val)
{
simnet_wifidev_t *wdev = sdev->sd_wifidev;
long result;
- if (strcmp(pr_name, "_wl_esslist") == 0) {
- if (pr_val == NULL)
+ if (strcmp(name, "_wl_esslist") == 0) {
+ if (val == NULL)
return (EINVAL);
- return (set_wl_esslist_priv_prop(wdev, pr_valsize, pr_val));
- } else if (strcmp(pr_name, "_wl_connected") == 0) {
- if (pr_val == NULL)
+ return (set_wl_esslist_priv_prop(wdev, len, val));
+ } else if (strcmp(name, "_wl_connected") == 0) {
+ if (val == NULL)
return (EINVAL);
- (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
+ (void) ddi_strtol(val, (char **)NULL, 0, &result);
wdev->swd_linkstatus = ((result == 1) ?
WL_CONNECTED:WL_NOTCONNECTED);
return (0);
@@ -1164,37 +1243,89 @@ simnet_set_priv_prop(simnet_dev_t *sdev, const char *pr_name,
return (EINVAL);
}
+/* ARGSUSED */
static int
-simnet_m_setprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num,
- uint_t wldp_length, const void *wldp_buf)
+simnet_set_priv_prop_ether(simnet_dev_t *sdev, const char *name,
+ const uint_t len, const void *val)
{
- simnet_dev_t *sdev = arg;
- simnet_wifidev_t *wdev = sdev->sd_wifidev;
- int err = 0;
- uint32_t mtu;
+ if (strcmp(name, SD_PROP_RX_IP_CKSUM) == 0) {
+ if (val == NULL)
+ return (EINVAL);
- switch (wldp_pr_num) {
- case MAC_PROP_MTU:
- (void) memcpy(&mtu, wldp_buf, sizeof (mtu));
- if (mtu > ETHERMIN && mtu < SIMNET_MAX_MTU)
- return (mac_maxsdu_update(sdev->sd_mh, mtu));
- else
+ if (strcmp(val, "off") == 0) {
+ sdev->sd_rx_cksum &= ~HCKSUM_IPHDRCKSUM;
+ } else if (strcmp(val, "on") == 0) {
+ sdev->sd_rx_cksum |= HCKSUM_IPHDRCKSUM;
+ } else {
return (EINVAL);
- default:
- break;
+ }
+
+ return (0);
+ } else if (strcmp(name, SD_PROP_TX_ULP_CKSUM) == 0) {
+ if (val == NULL)
+ return (EINVAL);
+
+ /*
+ * Full and partial checksum are mutually exclusive, so
+ * each setting must clear the other bit ("none": both).
+ */
+ if (strcmp(val, "none") == 0) {
+ sdev->sd_tx_cksum &= ~SIMNET_ULP_CKSUM;
+ } else if (strcmp(val, "fullv4") == 0) {
+ sdev->sd_tx_cksum &= ~HCKSUM_INET_PARTIAL;
+ sdev->sd_tx_cksum |= HCKSUM_INET_FULL_V4;
+ } else if (strcmp(val, "partial") == 0) {
+ sdev->sd_tx_cksum &= ~HCKSUM_INET_FULL_V4;
+ sdev->sd_tx_cksum |= HCKSUM_INET_PARTIAL;
+ } else {
+ return (EINVAL);
+ }
+
+ return (0);
+ } else if (strcmp(name, SD_PROP_TX_IP_CKSUM) == 0) {
+ if (val == NULL)
+ return (EINVAL);
+
+ if (strcmp(val, "off") == 0) {
+ sdev->sd_tx_cksum &= ~HCKSUM_IPHDRCKSUM;
+ } else if (strcmp(val, "on") == 0) {
+ sdev->sd_tx_cksum |= HCKSUM_IPHDRCKSUM;
+ } else {
+ return (EINVAL);
+ }
+
+ return (0);
+ } else if (strcmp(name, SD_PROP_LSO) == 0) {
+ if (val == NULL)
+ return (EINVAL);
+
+ if (strcmp(val, "off") == 0) {
+ sdev->sd_lso = B_FALSE;
+ } else if (strcmp(val, "on") == 0) {
+ sdev->sd_lso = B_TRUE;
+ } else {
+ return (EINVAL);
+ }
+
+ return (0);
}
- if (sdev->sd_type == DL_ETHER)
- return (ENOTSUP);
+ return (ENOTSUP);
+}
+
+static int
+simnet_setprop_wifi(simnet_dev_t *sdev, const char *name,
+ const mac_prop_id_t num, const uint_t len, const void *val)
+{
+ int err = 0;
+ simnet_wifidev_t *wdev = sdev->sd_wifidev;
- /* mac_prop_id */
- switch (wldp_pr_num) {
+ switch (num) {
case MAC_PROP_WL_ESSID: {
int i;
wl_ess_conf_t *wls;
- (void) memcpy(&wdev->swd_essid, wldp_buf,
- sizeof (wl_essid_t));
+ (void) memcpy(&wdev->swd_essid, val, sizeof (wl_essid_t));
wdev->swd_linkstatus = WL_CONNECTED;
/* Lookup the signal strength of the connected ESSID */
@@ -1209,8 +1340,7 @@ simnet_m_setprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num,
break;
}
case MAC_PROP_WL_BSSID: {
- (void) memcpy(&wdev->swd_bssid, wldp_buf,
- sizeof (wl_bssid_t));
+ (void) memcpy(&wdev->swd_bssid, val, sizeof (wl_bssid_t));
break;
}
case MAC_PROP_WL_PHY_CONFIG:
@@ -1221,10 +1351,10 @@ simnet_m_setprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num,
case MAC_PROP_WL_DESIRED_RATES:
break;
case MAC_PROP_PRIVATE:
- err = simnet_set_priv_prop(sdev, pr_name,
- wldp_length, wldp_buf);
+ err = simnet_set_priv_prop_wifi(sdev, name, len, val);
break;
default:
+ err = EINVAL;
break;
}
@@ -1232,66 +1362,159 @@ simnet_m_setprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num,
}
static int
-simnet_get_priv_prop(simnet_dev_t *sdev, const char *pr_name,
- uint_t pr_valsize, void *pr_val)
+simnet_setprop_ether(simnet_dev_t *sdev, const char *name,
+ const mac_prop_id_t num, const uint_t len, const void *val)
{
- simnet_wifidev_t *wdev = sdev->sd_wifidev;
int err = 0;
- int value;
- if (strcmp(pr_name, "_wl_esslist") == 0) {
+ switch (num) {
+ case MAC_PROP_PRIVATE:
+ err = simnet_set_priv_prop_ether(sdev, name, len, val);
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+
+ return (err);
+}
+
+static int
+simnet_m_setprop(void *arg, const char *name, mac_prop_id_t num,
+ const uint_t len, const void *val)
+{
+ simnet_dev_t *sdev = arg;
+ int err = 0;
+ uint32_t mtu;
+
+ switch (num) {
+ case MAC_PROP_MTU:
+ (void) memcpy(&mtu, val, sizeof (mtu));
+ if (mtu > ETHERMIN && mtu < SIMNET_MAX_MTU)
+ return (mac_maxsdu_update(sdev->sd_mh, mtu));
+ else
+ return (EINVAL);
+ default:
+ break;
+ }
+
+ switch (sdev->sd_type) {
+ case DL_ETHER:
+ err = simnet_setprop_ether(sdev, name, num, len, val);
+ break;
+ case DL_WIFI:
+ err = simnet_setprop_wifi(sdev, name, num, len, val);
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+
+ /*
+ * We may have modified the configuration of hardware
+ * offloads. Make sure to renegotiate capabilities with the
+ * upstream clients.
+ */
+ mac_capab_update(sdev->sd_mh);
+ return (err);
+}
+
+static int
+simnet_get_priv_prop_wifi(const simnet_dev_t *sdev, const char *name,
+ const uint_t len, void *val)
+{
+ simnet_wifidev_t *wdev = sdev->sd_wifidev;
+ int ret, value;
+
+ if (strcmp(name, "_wl_esslist") == 0) {
/* Returns num of _wl_ess_conf_t that have been set */
value = wdev->swd_esslist_num;
- } else if (strcmp(pr_name, "_wl_connected") == 0) {
+ } else if (strcmp(name, "_wl_connected") == 0) {
value = ((wdev->swd_linkstatus == WL_CONNECTED) ? 1:0);
} else {
- err = ENOTSUP;
+ return (ENOTSUP);
}
- if (err == 0)
- (void) snprintf(pr_val, pr_valsize, "%d", value);
- return (err);
+ ret = snprintf(val, len, "%d", value);
+
+ if (ret < 0 || ret >= len)
+ return (EOVERFLOW);
+
+ return (0);
}
static int
-simnet_m_getprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num,
- uint_t wldp_length, void *wldp_buf)
+simnet_get_priv_prop_ether(const simnet_dev_t *sdev, const char *name,
+ const uint_t len, void *val)
{
- simnet_dev_t *sdev = arg;
- simnet_wifidev_t *wdev = sdev->sd_wifidev;
- int err = 0;
- int i;
+ int ret;
+ char *value;
- if (sdev->sd_type == DL_ETHER)
+ if (strcmp(name, SD_PROP_RX_IP_CKSUM) == 0) {
+ if ((sdev->sd_rx_cksum & HCKSUM_IPHDRCKSUM) != 0) {
+ value = "on";
+ } else {
+ value = "off";
+ }
+ } else if (strcmp(name, SD_PROP_TX_ULP_CKSUM) == 0) {
+ if ((sdev->sd_tx_cksum & HCKSUM_INET_FULL_V4) != 0) {
+ value = "fullv4";
+ } else if ((sdev->sd_tx_cksum & HCKSUM_INET_PARTIAL) != 0) {
+ value = "partial";
+ } else {
+ value = "none";
+ }
+ } else if (strcmp(name, SD_PROP_TX_IP_CKSUM) == 0) {
+ if ((sdev->sd_tx_cksum & HCKSUM_IPHDRCKSUM) != 0) {
+ value = "on";
+ } else {
+ value = "off";
+ }
+ } else if (strcmp(name, SD_PROP_LSO) == 0) {
+ value = sdev->sd_lso ? "on" : "off";
+ } else {
return (ENOTSUP);
+ }
- /* mac_prop_id */
- switch (wldp_pr_num) {
+ ret = snprintf(val, len, "%s", value);
+
+ if (ret < 0 || ret >= len) {
+ return (EOVERFLOW);
+ }
+
+ return (0);
+}
+
+static int
+simnet_getprop_wifi(const simnet_dev_t *sdev, const char *name,
+ const mac_prop_id_t num, const uint_t len, void *val)
+{
+ const simnet_wifidev_t *wdev = sdev->sd_wifidev;
+ int err = 0;
+
+ switch (num) {
case MAC_PROP_WL_ESSID:
- (void) memcpy(wldp_buf, &wdev->swd_essid,
- sizeof (wl_essid_t));
+ (void) memcpy(val, &wdev->swd_essid, sizeof (wl_essid_t));
break;
case MAC_PROP_WL_BSSID:
- (void) memcpy(wldp_buf, &wdev->swd_bssid,
- sizeof (wl_bssid_t));
+ (void) memcpy(val, &wdev->swd_bssid, sizeof (wl_bssid_t));
break;
case MAC_PROP_WL_PHY_CONFIG:
case MAC_PROP_WL_AUTH_MODE:
case MAC_PROP_WL_ENCRYPTION:
break;
case MAC_PROP_WL_LINKSTATUS:
- (void) memcpy(wldp_buf, &wdev->swd_linkstatus,
+ (void) memcpy(val, &wdev->swd_linkstatus,
sizeof (wdev->swd_linkstatus));
break;
case MAC_PROP_WL_ESS_LIST: {
wl_ess_conf_t *w_ess_conf;
- ((wl_ess_list_t *)wldp_buf)->wl_ess_list_num =
- wdev->swd_esslist_num;
+ ((wl_ess_list_t *)val)->wl_ess_list_num = wdev->swd_esslist_num;
/* LINTED E_BAD_PTR_CAST_ALIGN */
- w_ess_conf = (wl_ess_conf_t *)((char *)wldp_buf +
+ w_ess_conf = (wl_ess_conf_t *)((char *)val +
offsetof(wl_ess_list_t, wl_ess_list_ess));
- for (i = 0; i < wdev->swd_esslist_num; i++) {
+ for (uint_t i = 0; i < wdev->swd_esslist_num; i++) {
(void) memcpy(w_ess_conf, wdev->swd_esslist[i],
sizeof (wl_ess_conf_t));
w_ess_conf++;
@@ -1299,18 +1522,35 @@ simnet_m_getprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num,
break;
}
case MAC_PROP_WL_RSSI:
- *(wl_rssi_t *)wldp_buf = wdev->swd_rssi;
+ *(wl_rssi_t *)val = wdev->swd_rssi;
break;
case MAC_PROP_WL_RADIO:
- *(wl_radio_t *)wldp_buf = B_TRUE;
+ *(wl_radio_t *)val = B_TRUE;
break;
case MAC_PROP_WL_POWER_MODE:
break;
case MAC_PROP_WL_DESIRED_RATES:
break;
case MAC_PROP_PRIVATE:
- err = simnet_get_priv_prop(sdev, pr_name, wldp_length,
- wldp_buf);
+ err = simnet_get_priv_prop_wifi(sdev, name, len, val);
+ break;
+ default:
+ err = ENOTSUP;
+ break;
+ }
+
+ return (err);
+}
+
+static int
+simnet_getprop_ether(const simnet_dev_t *sdev, const char *name,
+ const mac_prop_id_t num, const uint_t len, void *val)
+{
+ int err = 0;
+
+ switch (num) {
+ case MAC_PROP_PRIVATE:
+ err = simnet_get_priv_prop_ether(sdev, name, len, val);
break;
default:
err = ENOTSUP;
@@ -1320,14 +1560,36 @@ simnet_m_getprop(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num,
return (err);
}
+static int
+simnet_m_getprop(void *arg, const char *name, const mac_prop_id_t num,
+ const uint_t len, void *val)
+{
+ const simnet_dev_t *sdev = arg;
+ int err = 0;
+
+ switch (sdev->sd_type) {
+ case DL_ETHER:
+ err = simnet_getprop_ether(sdev, name, num, len, val);
+ break;
+ case DL_WIFI:
+ err = simnet_getprop_wifi(sdev, name, num, len, val);
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+
+ return (err);
+}
+
static void
-simnet_priv_propinfo(const char *pr_name, mac_prop_info_handle_t prh)
+simnet_priv_propinfo_wifi(const char *name, mac_prop_info_handle_t prh)
{
char valstr[MAXNAMELEN];
bzero(valstr, sizeof (valstr));
- if (strcmp(pr_name, "_wl_esslist") == 0) {
+ if (strcmp(name, "_wl_esslist") == 0) {
(void) snprintf(valstr, sizeof (valstr), "%d", 0);
}
@@ -1336,15 +1598,10 @@ simnet_priv_propinfo(const char *pr_name, mac_prop_info_handle_t prh)
}
static void
-simnet_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num,
+simnet_propinfo_wifi(const char *name, const mac_prop_id_t num,
mac_prop_info_handle_t prh)
{
- simnet_dev_t *sdev = arg;
-
- if (sdev->sd_type == DL_ETHER)
- return;
-
- switch (wldp_pr_num) {
+ switch (num) {
case MAC_PROP_WL_BSSTYPE:
case MAC_PROP_WL_ESS_LIST:
case MAC_PROP_WL_SUPPORTED_RATES:
@@ -1352,7 +1609,55 @@ simnet_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t wldp_pr_num,
mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
break;
case MAC_PROP_PRIVATE:
- simnet_priv_propinfo(pr_name, prh);
+ simnet_priv_propinfo_wifi(name, prh);
+ break;
+ }
+}
+
+static void
+simnet_priv_propinfo_ether(const char *name, mac_prop_info_handle_t prh)
+{
+ if (strcmp(name, SD_PROP_RX_IP_CKSUM) == 0 ||
+ strcmp(name, SD_PROP_TX_ULP_CKSUM) == 0 ||
+ strcmp(name, SD_PROP_TX_IP_CKSUM) == 0 ||
+ strcmp(name, SD_PROP_LSO) == 0) {
+ mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
+ }
+
+ if (strcmp(name, SD_PROP_TX_ULP_CKSUM) == 0) {
+ mac_prop_info_set_default_str(prh, "none");
+ }
+
+ if (strcmp(name, SD_PROP_RX_IP_CKSUM) == 0 ||
+ strcmp(name, SD_PROP_TX_IP_CKSUM) == 0 ||
+ strcmp(name, SD_PROP_LSO) == 0) {
+ mac_prop_info_set_default_str(prh, "off");
+ }
+}
+
+static void
+simnet_propinfo_ether(const char *name, const mac_prop_id_t num,
+ mac_prop_info_handle_t prh)
+{
+ switch (num) {
+ case MAC_PROP_PRIVATE:
+ simnet_priv_propinfo_ether(name, prh);
+ break;
+ }
+}
+
+static void
+simnet_m_propinfo(void *arg, const char *name, const mac_prop_id_t num,
+ const mac_prop_info_handle_t prh)
+{
+ simnet_dev_t *sdev = arg;
+
+ switch (sdev->sd_type) {
+ case DL_ETHER:
+ simnet_propinfo_ether(name, num, prh);
+ break;
+ case DL_WIFI:
+ simnet_propinfo_wifi(name, num, prh);
break;
}
}
diff --git a/usr/src/uts/common/io/simnet/simnet_impl.h b/usr/src/uts/common/io/simnet/simnet_impl.h
index 74dcba5113..5d6f16f113 100644
--- a/usr/src/uts/common/io/simnet/simnet_impl.h
+++ b/usr/src/uts/common/io/simnet/simnet_impl.h
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _SYS_SIMNET_IMPL_H
@@ -84,13 +85,25 @@ typedef struct simnet_dev {
uint_t sd_mac_len;
uchar_t sd_mac_addr[MAXMACADDRLEN];
simnet_stats_t sd_stats;
+
+ /* Capabilities */
+ uint_t sd_rx_cksum;
+ uint_t sd_tx_cksum;
+ boolean_t sd_lso;
} simnet_dev_t;
+/* Simnet dladm private properties. */
+#define SD_PROP_RX_IP_CKSUM "_rx_ipv4_cksum"
+#define SD_PROP_TX_ULP_CKSUM "_tx_ulp_cksum"
+#define SD_PROP_TX_IP_CKSUM "_tx_ipv4_cksum"
+#define SD_PROP_LSO "_lso"
+
/* Simnet device flags */
#define SDF_SHUTDOWN 0x00000001 /* Device shutdown, no new ops */
#define SDF_STARTED 0x00000002 /* Device started, allow ops */
#define SIMNET_MAX_MTU 9000 /* Max MTU supported by simnet driver */
+#define SD_LSO_MAXLEN 65535 /* Max LSO supported by simnet driver */
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c
index ec76c6e2b9..288f77ae47 100644
--- a/usr/src/uts/common/io/stream.c
+++ b/usr/src/uts/common/io/stream.c
@@ -839,7 +839,7 @@ frnop_func(void *arg)
*/
static mblk_t *
gesballoc(unsigned char *base, size_t size, uint32_t db_rtfu, frtn_t *frp,
- void (*lastfree)(mblk_t *, dblk_t *), int kmflags)
+ void (*lastfree)(mblk_t *, dblk_t *), int kmflags)
{
dblk_t *dbp;
mblk_t *mp;
@@ -1451,6 +1451,16 @@ copyb(mblk_t *bp)
ndp = nbp->b_datap;
/*
+ * Copy the various checksum information that came in
+ * originally.
+ */
+ ndp->db_cksumstart = dp->db_cksumstart;
+ ndp->db_cksumend = dp->db_cksumend;
+ ndp->db_cksumstuff = dp->db_cksumstuff;
+ bcopy(dp->db_struioun.data, ndp->db_struioun.data,
+ sizeof (dp->db_struioun.data));
+
+ /*
* Well, here is a potential issue. If we are trying to
* trace a flow, and we copy the message, we might lose
* information about where this message might have been.
diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c
index bbbd9b46bd..d75db5f258 100644
--- a/usr/src/uts/common/io/vnic/vnic_dev.c
+++ b/usr/src/uts/common/io/vnic/vnic_dev.c
@@ -457,6 +457,20 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid,
} else {
vnic->vn_hcksum_txflags = 0;
}
+
+ /*
+ * Check for LSO capabilities. LSO implementations
+ * depend on hardware checksumming, so the same
+ * requirement is enforced here.
+ */
+ if (vnic->vn_hcksum_txflags != 0) {
+ if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_LSO,
+ &vnic->vn_cap_lso)) {
+ vnic->vn_cap_lso.lso_flags = 0;
+ }
+ } else {
+ vnic->vn_cap_lso.lso_flags = 0;
+ }
}
/* register with the MAC module */
@@ -827,6 +841,15 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
HCKSUM_INET_PARTIAL);
break;
}
+ case MAC_CAPAB_LSO: {
+ mac_capab_lso_t *cap_lso = cap_data;
+
+ if (vnic->vn_cap_lso.lso_flags == 0) {
+ return (B_FALSE);
+ }
+ *cap_lso = vnic->vn_cap_lso;
+ break;
+ }
case MAC_CAPAB_VNIC: {
mac_capab_vnic_t *vnic_capab = cap_data;
diff --git a/usr/src/uts/common/os/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c
index 1fa1c9425b..0a237e86ec 100644
--- a/usr/src/uts/common/os/ip_cksum.c
+++ b/usr/src/uts/common/os/ip_cksum.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -34,6 +35,7 @@
#include <sys/vtrace.h>
#include <inet/sctp_crc32.h>
#include <inet/ip.h>
+#include <inet/ip6.h>
#include <sys/multidata.h>
#include <sys/multidata_impl.h>
@@ -556,3 +558,109 @@ ip_csum_hdr(ipha_t *ipha)
sum = 0;
return ((uint16_t)sum);
}
+
+/*
+ * This function takes an mblk and IPv6 header as input and returns
+ * three pieces of information.
+ *
+ * 'hdr_length_ptr': The IPv6 header length including extension headers.
+ *
+ * 'nexthdrpp': Set to point at the final "next header" field, i.e. the
+ * one naming the transport protocol. May be passed as NULL if
+ * only the length is desired.
+ *
+ * return: B_TRUE if the headers were well-formed, B_FALSE otherwise.
+ *
+ * This function assumes the IPv6 header along with all extensions are
+ * contained solely in this mblk: i.e., there is no b_cont walking.
+ */
+boolean_t
+ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr,
+ uint8_t **nexthdrpp)
+{
+ uint16_t length;
+ uint_t ehdrlen;
+ uint8_t *nexthdrp;
+ uint8_t *whereptr;
+ uint8_t *endptr;
+ ip6_dest_t *desthdr;
+ ip6_rthdr_t *rthdr;
+ ip6_frag_t *fraghdr;
+
+ ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
+ length = IPV6_HDR_LEN;
+ whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
+ endptr = mp->b_wptr;
+
+ nexthdrp = &ip6h->ip6_nxt;
+ while (whereptr < endptr) {
+ /* Is there enough left for len + nexthdr? */
+ if (whereptr + MIN_EHDR_LEN > endptr)
+ break;
+
+ switch (*nexthdrp) {
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_DSTOPTS:
+ /* Assumes the headers are identical for hbh and dst */
+ desthdr = (ip6_dest_t *)whereptr;
+ ehdrlen = 8 * (desthdr->ip6d_len + 1);
+ if ((uchar_t *)desthdr + ehdrlen > endptr)
+ return (B_FALSE);
+ nexthdrp = &desthdr->ip6d_nxt;
+ break;
+ case IPPROTO_ROUTING:
+ rthdr = (ip6_rthdr_t *)whereptr;
+ ehdrlen = 8 * (rthdr->ip6r_len + 1);
+ if ((uchar_t *)rthdr + ehdrlen > endptr)
+ return (B_FALSE);
+ nexthdrp = &rthdr->ip6r_nxt;
+ break;
+ case IPPROTO_FRAGMENT:
+ fraghdr = (ip6_frag_t *)whereptr;
+ ehdrlen = sizeof (ip6_frag_t);
+ if ((uchar_t *)&fraghdr[1] > endptr)
+ return (B_FALSE);
+ nexthdrp = &fraghdr->ip6f_nxt;
+ break;
+ case IPPROTO_NONE:
+ /* No next header means we're finished */
+ default:
+ *hdr_length_ptr = length;
+
+ if (nexthdrpp != NULL)
+ *nexthdrpp = nexthdrp;
+
+ return (B_TRUE);
+ }
+ length += ehdrlen;
+ whereptr += ehdrlen;
+ *hdr_length_ptr = length;
+
+ if (nexthdrpp != NULL)
+ *nexthdrpp = nexthdrp;
+ }
+ switch (*nexthdrp) {
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_FRAGMENT:
+ /*
+ * If any known extension headers are still to be processed,
+ * the packet's malformed (or at least all the IP header(s) are
+ * not in the same mblk - and that should never happen).
+ */
+ return (B_FALSE);
+
+ default:
+ /*
+ * If we get here, we know that all of the IP headers were in
+ * the same mblk, even if the ULP header is in the next mblk.
+ */
+ *hdr_length_ptr = length;
+
+ if (nexthdrpp != NULL)
+ *nexthdrpp = nexthdrp;
+
+ return (B_TRUE);
+ }
+}
diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h
index 0907d6deff..2ce448fc3d 100644
--- a/usr/src/uts/common/sys/mac.h
+++ b/usr/src/uts/common/sys/mac.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2017, Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright (c) 2015 Garrett D'Amore <garrett@damore.org>
*/
@@ -614,6 +614,38 @@ typedef struct mactype_register_s {
} mactype_register_t;
/*
+ * Flags to describe the hardware emulation desired from a client when
+ * calling mac_hw_emul().
+ *
+ * MAC_HWCKSUM_EMUL
+ *
+ * If an mblk is marked with HCK_* flags, then calculate those
+ * checksums and update the checksum flags.
+ *
+ * MAC_IPCKSUM_EMUL
+ *
+ * Like MAC_HWCKSUM_EMUL, except only calculate the IPv4 header
+ * checksum. We still update both the IPv4 and ULP checksum
+ * flags.
+ *
+ * MAC_LSO_EMUL
+ *
+ * If an mblk is marked with HW_LSO, then segment the LSO mblk
+ * into a new chain of mblks which reference the original data
+ * block. This flag DOES NOT imply MAC_HWCKSUM_EMUL. If the
+ * caller needs both then it must set both.
+ */
+typedef enum mac_emul {
+ MAC_HWCKSUM_EMUL = (1 << 0),
+ MAC_IPCKSUM_EMUL = (1 << 1),
+ MAC_LSO_EMUL = (1 << 2)
+} mac_emul_t;
+
+#define MAC_HWCKSUM_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL)
+#define MAC_ALL_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL | \
+ MAC_LSO_EMUL)
+
+/*
* Driver interface functions.
*/
extern int mac_open_by_linkid(datalink_id_t,
diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h
index 88ab5f4756..1d1915a816 100644
--- a/usr/src/uts/common/sys/mac_client.h
+++ b/usr/src/uts/common/sys/mac_client.h
@@ -200,6 +200,8 @@ extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *);
extern void mac_client_set_rings(mac_client_handle_t, int, int);
+extern void mac_hw_emul(mblk_t **, mblk_t **, uint_t *, mac_emul_t);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h
index d5c66684d0..0e3a6306e0 100644
--- a/usr/src/uts/common/sys/mac_client_impl.h
+++ b/usr/src/uts/common/sys/mac_client_impl.h
@@ -410,8 +410,8 @@ extern int mac_tx_percpu_cnt;
extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *);
extern void mac_client_init(void);
extern void mac_client_fini(void);
-extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *,
- mac_client_impl_t *);
+extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, mac_client_impl_t *,
+ boolean_t);
extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *);
diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h
index 4625417828..da645ad382 100644
--- a/usr/src/uts/common/sys/mac_impl.h
+++ b/usr/src/uts/common/sys/mac_impl.h
@@ -35,6 +35,7 @@
#include <net/if.h>
#include <sys/mac_flow_impl.h>
#include <netinet/ip6.h>
+#include <sys/pattr.h>
#ifdef __cplusplus
extern "C" {
@@ -289,54 +290,6 @@ struct mac_group_s {
#define GROUP_INTR_ENABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_enable
#define GROUP_INTR_DISABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_disable
-#define MAC_RING_TX(mhp, rh, mp, rest) { \
- mac_ring_handle_t mrh = rh; \
- mac_impl_t *mimpl = (mac_impl_t *)mhp; \
- /* \
- * Send packets through a selected tx ring, or through the \
- * default handler if there is no selected ring. \
- */ \
- if (mrh == NULL) \
- mrh = mimpl->mi_default_tx_ring; \
- if (mrh == NULL) { \
- rest = mimpl->mi_tx(mimpl->mi_driver, mp); \
- } else { \
- rest = mac_hwring_tx(mrh, mp); \
- } \
-}
-
-/*
- * This is the final stop before reaching the underlying driver
- * or aggregation, so this is where the bridging hook is implemented.
- * Packets that are bridged will return through mac_bridge_tx(), with
- * rh nulled out if the bridge chooses to send output on a different
- * link due to forwarding.
- */
-#define MAC_TX(mip, rh, mp, src_mcip) { \
- mac_ring_handle_t rhandle = (rh); \
- /* \
- * If there is a bound Hybrid I/O share, send packets through \
- * the default tx ring. (When there's a bound Hybrid I/O share, \
- * the tx rings of this client are mapped in the guest domain \
- * and not accessible from here.) \
- */ \
- _NOTE(CONSTANTCONDITION) \
- if ((src_mcip)->mci_state_flags & MCIS_SHARE_BOUND) \
- rhandle = (mip)->mi_default_tx_ring; \
- if (mip->mi_promisc_list != NULL) \
- mac_promisc_dispatch(mip, mp, src_mcip); \
- /* \
- * Grab the proper transmit pointer and handle. Special \
- * optimization: we can test mi_bridge_link itself atomically, \
- * and if that indicates no bridge send packets through tx ring.\
- */ \
- if (mip->mi_bridge_link == NULL) { \
- MAC_RING_TX(mip, rhandle, mp, mp); \
- } else { \
- mp = mac_bridge_tx(mip, rhandle, mp); \
- } \
-}
-
/* mci_tx_flag */
#define MCI_TX_QUIESCE 0x1
@@ -485,6 +438,9 @@ struct mac_impl_s {
mac_led_mode_t mi_led_modes;
mac_capab_led_t mi_led;
+ /* Cache of the Tx DB_CKSUMFLAGS that this MAC supports. */
+ uint16_t mi_tx_cksum_flags; /* SL */
+
/*
* MAC address and VLAN lists. SL protected.
*/
@@ -721,16 +677,30 @@ typedef struct mac_client_impl_s mac_client_impl_t;
extern void mac_init(void);
extern int mac_fini(void);
+/*
+ * MAC packet/chain drop functions to aggregate all dropped-packet
+ * debugging to a single surface.
+ */
+/*PRINTFLIKE2*/
+extern void mac_drop_pkt(mblk_t *, const char *, ...)
+ __KPRINTFLIKE(2);
+
+/*PRINTFLIKE2*/
+extern void mac_drop_chain(mblk_t *, const char *, ...)
+ __KPRINTFLIKE(2);
+
extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *);
extern boolean_t mac_ip_hdr_length_v6(ip6_t *, uint8_t *, uint16_t *,
uint8_t *, ip6_frag_t **);
extern mblk_t *mac_copymsgchain_cksum(mblk_t *);
-extern mblk_t *mac_fix_cksum(mblk_t *);
extern void mac_packet_print(mac_handle_t, mblk_t *);
extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *,
mac_header_info_t *);
extern void mac_tx_notify(mac_impl_t *);
+extern mblk_t *mac_ring_tx(mac_handle_t, mac_ring_handle_t, mblk_t *);
+extern mblk_t *mac_provider_tx(mac_impl_t *, mac_ring_handle_t, mblk_t *,
+ mac_client_impl_t *);
extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *);
@@ -832,7 +802,7 @@ extern void mac_flow_set_name(flow_entry_t *, const char *);
extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t);
extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t);
extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *);
-extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t);
+extern void mac_rx_def(void *, mac_resource_handle_t, mblk_t *, boolean_t);
extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *);
extern void i_mac_share_alloc(mac_client_impl_t *);
diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h
index 1269aeca10..a1fb21ad21 100644
--- a/usr/src/uts/common/sys/pattr.h
+++ b/usr/src/uts/common/sys/pattr.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef _SYS_PATTR_H
@@ -97,6 +98,8 @@ typedef struct pattr_hcksum_s {
#define HCK_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \
HCK_FULLCKSUM | HCK_FULLCKSUM_OK)
+#define HCK_TX_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \
+ HCK_FULLCKSUM)
/*
* Extended hardware offloading flags that also use hcksum_flags
*/
diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h
index 1a91158da6..4c8d49c621 100644
--- a/usr/src/uts/common/sys/vnic_impl.h
+++ b/usr/src/uts/common/sys/vnic_impl.h
@@ -21,7 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_VNIC_IMPL_H
@@ -64,6 +64,7 @@ typedef struct vnic_s {
mac_notify_handle_t vn_mnh;
uint32_t vn_hcksum_txflags;
+ mac_capab_lso_t vn_cap_lso;
uint32_t vn_mtu;
link_state_t vn_ls;
} vnic_t;
diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c
index 4bf424c44e..23e1d971cb 100644
--- a/usr/src/uts/common/xen/io/xnb.c
+++ b/usr/src/uts/common/xen/io/xnb.c
@@ -22,6 +22,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
#ifdef DEBUG
@@ -251,8 +252,8 @@ xnb_software_csum(xnb_t *xnbp, mblk_t *mp)
* because it doesn't cover all of the interesting cases :-(
*/
mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM);
-
- return (mac_fix_cksum(mp));
+ mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL);
+ return (mp);
}
mblk_t *