author    | Eric Cheng <none@none> | 2008-12-04 18:16:10 -0800
committer | Eric Cheng <none@none> | 2008-12-04 18:16:10 -0800
commit    | da14cebe459d3275048785f25bd869cb09b5307f (patch)
tree      | a394d2c61ec4d7591782a4a5db4e3a157c3ca89a /usr/src
parent    | 03361682bf38acf5bcc36ee83a0d6277731eee68 (diff)
download  | illumos-joyent-da14cebe459d3275048785f25bd869cb09b5307f.tar.gz
PSARC/2006/357 Crossbow - Network Virtualization and Resource Management
6498311 Crossbow - Network Virtualization and Resource Management
6402493 DLPI provider loopback behavior should be improved
6453165 move mac capabs definitions outside mac.h
6338667 Need ability to use NAT for non-global zones
6692884 several threads hung due to deadlock scenario between aggr and mac
6768302 dls: soft_ring_bind/unbind race can panic in thread_affinity_set with cpu_id == -1
6635849 race between lacp_xmit_sm() and aggr_m_stop() ends in panic
6742712 potential message double free in the aggr driver
6754299 a potential race between aggr_m_tx() and aggr_port_delete()
6485324 mi_data_lock recursively held when enabling promiscuous mode on an aggregation
6442559 Forwarding perf bottleneck due to mac_rx() calls
6505462 assertion failure after removing a port from a snooped aggregation
6716664 need to add src/dst IP address to soft ring fanout
--HG--
rename : usr/src/uts/common/io/dls/dls_soft_ring.c => usr/src/uts/common/io/mac/mac_soft_ring.c
rename : usr/src/uts/common/inet/ip/ip_cksum.c => usr/src/uts/common/os/ip_cksum.c
rename : usr/src/uts/common/inet/sctp_crc32.c => usr/src/uts/common/os/sctp_crc32.c
rename : usr/src/uts/common/sys/dls_soft_ring.h => usr/src/uts/common/sys/mac_soft_ring.h
Diffstat (limited to 'usr/src')
326 files changed, 55600 insertions, 23414 deletions
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile
index 927f5ca801..5d7c5f0f8c 100644
--- a/usr/src/cmd/Makefile
+++ b/usr/src/cmd/Makefile
@@ -161,6 +161,7 @@ COMMON_SUBDIRS= \
 	file \
 	filebench \
 	find \
+	flowadm \
 	fm \
 	fmli \
 	fmt \
@@ -582,6 +583,7 @@ MSGSUBDIRS= \
 	file \
 	filesync \
 	find \
+	flowadm \
 	fm \
 	fold \
 	fs.d \
diff --git a/usr/src/cmd/Makefile.cmd b/usr/src/cmd/Makefile.cmd
index 44364753b2..8abf748eab 100644
--- a/usr/src/cmd/Makefile.cmd
+++ b/usr/src/cmd/Makefile.cmd
@@ -66,6 +66,7 @@ ROOTETCTSOL= $(ROOTETCSECURITY)/tsol
 ROOTETCSECLIB= $(ROOTETCSECURITY)/lib
 ROOTETCZONES= $(ROOTETC)/zones
+ROOTETCINET= $(ROOT)/etc/inet
 ROOTCCSBIN= $(ROOT)/usr/ccs/bin
 ROOTCCSBIN64= $(ROOTCCSBIN)/$(MACH64)
 ROOTCCSBINLINKDIR= $(ROOT)/../../bin
@@ -316,6 +317,9 @@ $(ROOTUSRSBIN64)/%: %
 $(ROOTETC)/%: %
 	$(INS.file)
 
+$(ROOTETCINET)/%: %
+	$(INS.file)
+
 $(ROOTETCDEFAULT)/%: %.dfl
 	$(INS.rename)
diff --git a/usr/src/cmd/acctadm/Makefile b/usr/src/cmd/acctadm/Makefile
index 554135fa78..09343cbca7 100644
--- a/usr/src/cmd/acctadm/Makefile
+++ b/usr/src/cmd/acctadm/Makefile
@@ -19,8 +19,6 @@
 # CDDL HEADER END
 #
 #
-#ident "%Z%%M% %I% %E% SMI"
-#
 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
@@ -37,7 +35,7 @@ include ../Makefile.cmd
 ROOTMANIFESTDIR = $(ROOTSVCSYSTEM)
 
 CFLAGS += $(CCVERBOSE)
-LDLIBS += -lexacct -lscf -lsecdb
+LDLIBS += -lexacct -lscf -lsecdb -ldladm
 POFILE = acctadm.po
 XGETFLAGS = -a -x acctadm.xcl
 FILEMODE = 0555
diff --git a/usr/src/cmd/acctadm/acctadm.xcl b/usr/src/cmd/acctadm/acctadm.xcl
index 4926a94690..e8d2b4572d 100644
--- a/usr/src/cmd/acctadm/acctadm.xcl
+++ b/usr/src/cmd/acctadm/acctadm.xcl
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -19,6 +18,11 @@
 #
 # CDDL HEADER END
 #
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+
 msgid "ruxf:e:d:"
 msgid "/etc/acctadm.conf"
 msgid ""
@@ -26,6 +30,7 @@ msgid "process"
 msgid "proc"
 msgid "task"
 msgid "flow"
+msgid "net"
 msgid "no"
 msgid "none"
 msgid "yes"
@@ -41,6 +46,10 @@ msgid "ACCTADM_FLOW_ENABLE"
 msgid "ACCTADM_FLOW_FILE"
 msgid "ACCTADM_FLOW_TRACKED"
 msgid "ACCTADM_FLOW_UNTRACKED"
+msgid "ACCTADM_NET_ENABLE"
+msgid "ACCTADM_NET_FILE"
+msgid "ACCTADM_NET_TRACKED"
+msgid "ACCTADM_NET_UNTRACKED"
 msgid "r+"
 msgid "r"
 msgid " %[^=]=%s \n%n"
diff --git a/usr/src/cmd/acctadm/aconf.c b/usr/src/cmd/acctadm/aconf.c
index 70c5f7618d..8453a4fa8f 100644
--- a/usr/src/cmd/acctadm/aconf.c
+++ b/usr/src/cmd/acctadm/aconf.c
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident "%Z%%M% %I% %E% SMI"
-
 #include <sys/types.h>
 #include <sys/acctctl.h>
 #include <unistd.h>
@@ -32,6 +30,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <limits.h>
+#include <libdllink.h>
 #include <libscf.h>
 #include <pwd.h>
 #include <auth_attr.h>
@@ -47,6 +46,7 @@
 #define FMRI_FLOW_ACCT "svc:/system/extended-accounting:flow"
 #define FMRI_PROC_ACCT "svc:/system/extended-accounting:process"
 #define FMRI_TASK_ACCT "svc:/system/extended-accounting:task"
+#define FMRI_NET_ACCT "svc:/system/extended-accounting:net"
 
 #define NELEM(x) (sizeof (x)) / (sizeof (x[0]))
 
@@ -134,13 +134,14 @@ aconf_setup(const char *fmri)
 	}
 
 	/*
-	 * Flow accounting is not available in non-global zones and
+	 * Net/Flow accounting is not available in non-global zones and
 	 * the service instance should therefore never be 'enabled' in
 	 * non-global zones. This is enforced by acctadm(1M), but there is
 	 * nothing that prevents someone from calling svcadm enable directly,
 	 * so we handle that case here by disabling the instance.
 	 */
-	if (type == AC_FLOW && getzoneid() != GLOBAL_ZONEID) {
+	if ((type == AC_FLOW || type == AC_NET) &&
+	    getzoneid() != GLOBAL_ZONEID) {
 		(void) smf_disable_instance(fmri, 0);
 		warn(gettext("%s accounting cannot be configured in "
 		    "non-global zones\n"), ac_type_name(type));
@@ -210,6 +211,19 @@ aconf_setup(const char *fmri)
 		ret = SMF_EXIT_ERR_FATAL;
 	}
 	(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
+
+	if (state == AC_ON && type == AC_NET) {
+		/*
+		 * Start logging.
+		 */
+		(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG,
+		    NULL);
+		(void) dladm_start_usagelog(strncmp(tracked, "basic",
+		    strlen("basic")) == 0 ? DLADM_LOGTYPE_LINK :
+		    DLADM_LOGTYPE_FLOW, 20);
+		(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG,
+		    NULL);
+	}
 out:
 	aconf_scf_fini();
 	return (ret);
@@ -219,7 +233,7 @@ void
 aconf_print(FILE *fp, int types)
 {
 	acctconf_t ac;
-	int print_order[] = { AC_TASK, AC_PROC, AC_FLOW };
+	int print_order[] = { AC_TASK, AC_PROC, AC_FLOW, AC_NET };
 	int i;
 
 	for (i = 0; i < NELEM(print_order); i++) {
@@ -279,6 +293,21 @@ aconf_print_type(acctconf_t *acp, FILE *fp, int type)
 		    gettext(" Untracked flow resources: %s\n"),
 		    acp->untracked);
 		break;
+	case AC_NET:
+		(void) fprintf(fp,
+		    gettext(" Net accounting: %s\n"),
+		    acp->state == AC_ON ?
+		    gettext("active") : gettext("inactive"));
+		(void) fprintf(fp,
+		    gettext(" Net accounting file: %s\n"),
+		    acp->file);
+		(void) fprintf(fp,
+		    gettext(" Tracked net resources: %s\n"),
+		    acp->tracked);
+		(void) fprintf(fp,
+		    gettext(" Untracked net resources: %s\n"),
+		    acp->untracked);
+		break;
 	}
 }
 
@@ -369,6 +398,8 @@ aconf_type2fmri(int type)
 		return (FMRI_TASK_ACCT);
 	case AC_FLOW:
 		return (FMRI_FLOW_ACCT);
+	case AC_NET:
+		return (FMRI_NET_ACCT);
 	default:
 		die(gettext("invalid type %d\n"), type);
 	}
@@ -385,6 +416,8 @@ aconf_fmri2type(const char *fmri)
 		return (AC_TASK);
 	else if (strcmp(fmri, FMRI_FLOW_ACCT) == 0)
 		return (AC_FLOW);
+	else if (strcmp(fmri, FMRI_NET_ACCT) == 0)
+		return (AC_NET);
 	else
 		return (-1);
 }
diff --git a/usr/src/cmd/acctadm/extended-accounting.xml b/usr/src/cmd/acctadm/extended-accounting.xml
index 2c68130080..07cb9af9c1 100644
--- a/usr/src/cmd/acctadm/extended-accounting.xml
+++ b/usr/src/cmd/acctadm/extended-accounting.xml
@@ -23,8 +23,6 @@
 	CDDL HEADER END
 
-	ident "%Z%%M% %I% %E% SMI"
-
 	NOTE:  This service manifest is not editable; its contents will
 	be overwritten by package or patch operations, including
 	operating system upgrade.
 	Make customizations in a different
@@ -175,6 +173,43 @@
 	</documentation>
 	</template>
 	</instance>
+
+	<instance name='net' enabled='false'>
+
+		<property_group name='general' type='framework'>
+			<propval name='action_authorization' type='astring'
+				value='solaris.smf.manage.extended-accounting.net' />
+			<propval name='value_authorization' type='astring'
+				value='solaris.smf.manage.extended-accounting.net' />
+		</property_group>
+
+		<property_group name='config' type='application'>
+			<propval name='value_authorization' type='astring'
+				value='solaris.smf.value.extended-accounting.net' />
+			<propval name='enabled' type='boolean'
+				value='false' />
+			<propval name='file' type='astring'
+				value='none' />
+			<propval name='tracked' type='astring'
+				value='none' />
+			<propval name='untracked' type='astring'
+				value='extended' />
+		</property_group>
+
+		<template>
+			<common_name>
+				<loctext xml:lang='C'>
+				configure net extended accounting
+				</loctext>
+			</common_name>
+
+			<documentation>
+				<manpage
+					title='acctadm' section='1M'
+					manpath='/usr/share/man' />
+			</documentation>
+		</template>
+	</instance>
 
 	<stability value='Unstable' />
 
 </service>
diff --git a/usr/src/cmd/acctadm/main.c b/usr/src/cmd/acctadm/main.c
index f83c1ec73c..484caf8988 100644
--- a/usr/src/cmd/acctadm/main.c
+++ b/usr/src/cmd/acctadm/main.c
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident "%Z%%M% %I% %E% SMI"
-
 #include <sys/acctctl.h>
 #include <assert.h>
 #include <stdio.h>
@@ -33,6 +31,7 @@
 #include <string.h>
 #include <errno.h>
 #include <libintl.h>
+#include <libdllink.h>
 #include <locale.h>
 #include <priv.h>
 #include <libscf.h>
@@ -44,12 +43,12 @@
 
 static const char USAGE[] = "\
 Usage:\n\
-	acctadm [ {process | task | flow} ]\n\
+	acctadm [ {process | task | flow | net} ]\n\
 	acctadm -s\n\
-	acctadm -r [ {process | task | flow} ]\n\
-	acctadm -x|-E|-D {process | task | flow}\n\
-	acctadm -f filename {process | task | flow}\n\
-	acctadm -e resources -d resources {process | task | flow}\n";
+	acctadm -r [ {process | task | flow | net} ]\n\
+	acctadm -x|-E|-D {process | task | flow | net}\n\
+	acctadm -f filename {process | task | flow | net}\n\
+	acctadm -e resources -d resources {process | task | flow | net}\n";
 
 static const char OPTS[] = "rsxf:e:d:ED";
 
@@ -77,6 +76,7 @@ setup_privs()
 
 	(void) priv_addset(privset, PRIV_SYS_ACCT);
 	(void) priv_addset(privset, PRIV_FILE_DAC_WRITE);
+	(void) priv_addset(privset, PRIV_SYS_DL_CONFIG);
 	(void) priv_delset(privset, PRIV_FILE_LINK_ANY);
 	(void) priv_delset(privset, PRIV_PROC_EXEC);
 	(void) priv_delset(privset, PRIV_PROC_FORK);
@@ -98,10 +98,11 @@ setup_privs()
 		die(gettext("cannot setup privileges"));
 
 	/*
-	 * Turn off the sys_acct and file_dac_write privileges until needed.
+	 * Turn off the sys_acct, file_dac_write and dl_config privileges
+	 * until needed.
 	 */
 	(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_FILE_DAC_WRITE,
-	    PRIV_SYS_ACCT, NULL);
+	    PRIV_SYS_ACCT, PRIV_SYS_DL_CONFIG, NULL);
 }
 
 int
@@ -183,7 +184,7 @@ main(int argc, char *argv[])
 
 	if (!(disabled || enabled || Dflg || Eflg || file || sflg || xflg))
 		(void) priv_set(PRIV_OFF, PRIV_PERMITTED,
-		    PRIV_SYS_ACCT, NULL);
+		    PRIV_SYS_ACCT, PRIV_SYS_DL_CONFIG, NULL);
 
 	if (optind < argc) {
 		if (typestr != NULL) {
@@ -203,20 +204,34 @@ main(int argc, char *argv[])
 			type |= AC_TASK;
 		else if (strcmp(typestr, "flow") == 0)
 			type |= AC_FLOW;
+		else if (strcmp(typestr, "net") == 0)
+			type |= AC_NET;
 		else {
 			warn(gettext("unknown accounting type -- %s\n"),
 			    typestr);
 			usage();
 		}
 	} else
-		type = AC_PROC | AC_TASK | AC_FLOW;
+		type = AC_PROC | AC_TASK | AC_FLOW | AC_NET;
 
 	/*
+	 * Drop the DL config privilege if we are not working with
+	 * net.
+	 */
+	if ((type & AC_NET) == 0) {
+		(void) priv_set(PRIV_OFF, PRIV_PERMITTED,
+		    PRIV_SYS_DL_CONFIG, NULL);
+	}
+
+	/*
 	 * check for invalid options
 	 */
 	if (optcnt > 1)
 		usage();
 
+	/*
+	 * XXX For AC_NET, enabled/disabled should only be "basic" or
+	 * "extended" - need to check it here.
+	 */
 	if ((enabled || disabled) &&
 	    (rflg || Dflg || sflg || xflg || Eflg))
 		usage();
@@ -253,9 +268,10 @@ main(int argc, char *argv[])
 			return (E_ERROR);
 	}
 
-	assert(type == AC_PROC || type == AC_TASK || type == AC_FLOW);
+	assert(type == AC_PROC || type == AC_TASK || type == AC_FLOW ||
+	    type == AC_NET);
 
-	if (type == AC_FLOW && getzoneid() != GLOBAL_ZONEID)
+	if ((type == AC_FLOW || type == AC_NET) && getzoneid() != GLOBAL_ZONEID)
 		die(gettext("%s accounting cannot be configured in "
 		    "non-global zones\n"), ac_type_name(type));
 
@@ -277,6 +293,18 @@ main(int argc, char *argv[])
 		/*
 		 * Turn off the specified accounting and close its file
 		 */
+
+		/*
+		 * Stop net logging before turning it off so that the last
+		 * set of logs can be written.
+		 */
+		if (type & AC_NET) {
+			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			(void) dladm_stop_usagelog(DLADM_LOGTYPE_FLOW);
+			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+		}
 		state = AC_OFF;
 
 		(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
@@ -311,8 +339,22 @@ main(int argc, char *argv[])
 			free(buf);
 			die(gettext("cannot obtain list of resources\n"));
 		}
-		if (disabled)
+		if (disabled) {
+			/*
+			 * Stop net logging before turning it off so that the
+			 * last set of logs can be written.
+			 */
+			if (type & AC_NET) {
+				(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+				    PRIV_SYS_DL_CONFIG, NULL);
+				(void) dladm_stop_usagelog(strncmp(disabled,
+				    "basic", strlen("basic")) == 0 ?
+				    DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW);
+				(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+				    PRIV_SYS_DL_CONFIG, NULL);
+			}
 			str2buf(buf, disabled, AC_OFF, type);
+		}
 		if (enabled)
 			str2buf(buf, enabled, AC_ON, type);
 
@@ -332,6 +374,24 @@ main(int argc, char *argv[])
 		if (aconf_set_string(AC_PROP_UNTRACKED, untracked) == -1)
 			die(gettext("cannot update %s property\n"),
 			    AC_PROP_UNTRACKED);
+		/*
+		 * We will enable net logging after turning it on so that
+		 * it can immediately start writing log.
+		 */
+		if (type & AC_NET && enabled != NULL) {
+			/*
+			 * Default logging interval for AC_NET is 20.
+			 * XXX need to find the right place to
+			 * configure it.
+			 */
+			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			(void) dladm_start_usagelog(strncmp(enabled, "basic",
+			    strlen("basic")) == 0 ? DLADM_LOGTYPE_LINK :
+			    DLADM_LOGTYPE_FLOW, 20);
+			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+		}
 		free(tracked);
 		free(untracked);
 		free(buf);
@@ -365,6 +425,18 @@ main(int argc, char *argv[])
 		/*
 		 * Disable accounting
 		 */
+
+		/*
+		 * Stop net logging before turning it off so that the last
+		 * set of logs can be written.
+		 */
+		if (type & AC_NET) {
+			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			(void) dladm_stop_usagelog(DLADM_LOGTYPE_FLOW);
+			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+		}
 		state = AC_OFF;
 
 		(void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL);
@@ -395,6 +467,17 @@ main(int argc, char *argv[])
 			die(gettext("cannot update %s property\n"),
 			    AC_PROP_STATE);
 		modified++;
+		if (type & AC_NET) {
+			/*
+			 * Default logging interval for AC_NET is 20,
+			 * XXX need to find the right place to configure it.
+			 */
+			(void) priv_set(PRIV_ON, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+			(void) dladm_start_usagelog(DLADM_LOGTYPE_FLOW, 20);
+			(void) priv_set(PRIV_OFF, PRIV_EFFECTIVE,
+			    PRIV_SYS_DL_CONFIG, NULL);
+		}
 	}
 	(void) priv_set(PRIV_OFF, PRIV_PERMITTED, PRIV_SYS_ACCT, NULL);
diff --git a/usr/src/cmd/acctadm/res.c b/usr/src/cmd/acctadm/res.c
index 844e3641c1..7f9484f12b 100644
--- a/usr/src/cmd/acctadm/res.c
+++ b/usr/src/cmd/acctadm/res.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
 */
 
-#pragma ident "%Z%%M% %I% %E% SMI"
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <libintl.h>
@@ -89,6 +87,33 @@ static ac_resname_t ac_names[] = {
 	{ AC_FLOW, AC_FLOW_ANAME, "action" },
 
 	/*
+	 * Net accounting resources
+	 */
+
+	{ AC_NET, AC_NET_NAME, "name" },
+	{ AC_NET, AC_NET_EHOST, "ehost" },
+	{ AC_NET, AC_NET_EDEST, "edest" },
+	{ AC_NET, AC_NET_VLAN_TPID, "vlan_pid" },
+	{ AC_NET, AC_NET_VLAN_TCI, "vlan_tci" },
+	{ AC_NET, AC_NET_SAP, "sap" },
+	{ AC_NET, AC_NET_PRIORITY, "priority" },
+	{ AC_NET, AC_NET_BWLIMIT, "bwlimit" },
+	{ AC_NET, AC_NET_DEVNAME, "devname" },
+	{ AC_NET, AC_NET_SADDR, "src_ip" },
+	{ AC_NET, AC_NET_DADDR, "dst_ip" },
+	{ AC_NET, AC_NET_SPORT, "src_port" },
+	{ AC_NET, AC_NET_DPORT, "dst_port" },
+	{ AC_NET, AC_NET_PROTOCOL, "protocol" },
+	{ AC_NET, AC_NET_DSFIELD, "dsfield" },
+	{ AC_NET, AC_NET_CURTIME, "curtime" },
+	{ AC_NET, AC_NET_IBYTES, "ibytes" },
+	{ AC_NET, AC_NET_OBYTES, "obytes" },
+	{ AC_NET, AC_NET_IPKTS, "ipkts" },
+	{ AC_NET, AC_NET_OPKTS, "opkts" },
+	{ AC_NET, AC_NET_IERRPKTS, "ierrpkts" },
+	{ AC_NET, AC_NET_OERRPKTS, "oerrpkts" },
+
+	/*
 	 * These are included for compatibility with old acctadm that
 	 * didn't have resource groups for individual accounting types.
 	 * It was possible to have resource "pid" enabled for task
@@ -134,6 +159,19 @@ static ac_group_t ac_groups[] = {
 	{ AC_FLOW_SADDR, AC_FLOW_DADDR, AC_FLOW_SPORT, AC_FLOW_DPORT,
 	AC_FLOW_PROTOCOL, AC_FLOW_NBYTES, AC_FLOW_NPKTS, AC_FLOW_ANAME,
 	AC_NONE } },
+	{ AC_NET, "extended",
+	{ AC_NET_NAME, AC_NET_EHOST, AC_NET_EDEST, AC_NET_VLAN_TPID,
+	AC_NET_VLAN_TCI, AC_NET_SAP, AC_NET_PRIORITY,
+	AC_NET_BWLIMIT, AC_NET_DEVNAME, AC_NET_SADDR, AC_NET_DADDR,
+	AC_NET_SPORT, AC_NET_DPORT, AC_NET_PROTOCOL, AC_NET_DSFIELD,
+	AC_NET_CURTIME, AC_NET_IBYTES, AC_NET_OBYTES, AC_NET_IPKTS,
+	AC_NET_OPKTS, AC_NET_IERRPKTS, AC_NET_OERRPKTS, AC_NONE } },
+	{ AC_NET, "basic",
+	{ AC_NET_NAME, AC_NET_DEVNAME, AC_NET_EHOST, AC_NET_EDEST,
+	AC_NET_VLAN_TPID, AC_NET_VLAN_TCI, AC_NET_SAP,
+	AC_NET_PRIORITY, AC_NET_BWLIMIT, AC_NET_CURTIME, AC_NET_IBYTES,
+	AC_NET_OBYTES, AC_NET_IPKTS, AC_NET_OPKTS, AC_NET_IERRPKTS,
+	AC_NET_OERRPKTS, AC_NONE } },
 	{ AC_NONE, NULL, { AC_NONE } }
 };
 
@@ -202,9 +240,10 @@ printgroups(int type)
 {
 	int header = 0;
 
-	if ((type & AC_PROC) && (type & AC_TASK) && (type & AC_FLOW))
+	if ((type & AC_PROC) && (type & AC_TASK) && (type & AC_FLOW) &&
+	    (type & AC_NET)) {
 		header = 1;
-
+	}
 	if (type & AC_PROC) {
 		if (header == 1)
 			(void) printf("process:\n");
@@ -220,6 +259,11 @@ printgroups(int type)
 		(void) printf("flow:\n");
 		printgroup(AC_FLOW);
 	}
+	if (type & AC_NET) {
+		if (header == 1)
+			(void) printf("net:\n");
+		printgroup(AC_NET);
+	}
 }
 
 /*
diff --git a/usr/src/cmd/acctadm/utils.c b/usr/src/cmd/acctadm/utils.c
index 26482d5ccd..bbee653eeb 100644
--- a/usr/src/cmd/acctadm/utils.c
+++ b/usr/src/cmd/acctadm/utils.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
 */
 
-#pragma ident "%Z%%M% %I% %E% SMI"
-
 #include <assert.h>
 #include <sys/types.h>
 #include <sys/acctctl.h>
@@ -107,6 +105,8 @@ ac_type_name(int type)
 		return (gettext("flow"));
 	case AC_TASK:
 		return (gettext("task"));
+	case AC_NET:
+		return (gettext("net"));
 	default:
 		die(gettext("invalid type %d\n"), type);
 	}
@@ -217,8 +217,9 @@ verify_exacct_file(const char *file, int type)
 	} else {
 		/*
 		 * A non-header object exists. Insist that it be
-		 * either a process, task, or flow accounting record,
-		 * the same type as is desired.
+		 * either a process, task, flow or net accounting
+		 * record, the same type as is desired.
+		 * xxx-venu:check 101 merge for EXD_GROUP_NET_*
 		 */
 		uint_t c = eo.eo_catalog & EXD_DATA_MASK;
 
@@ -226,7 +227,12 @@ verify_exacct_file(const char *file, int type)
 		    (eo.eo_catalog & EXC_CATALOG_MASK) != EXC_NONE ||
 		    (!(c == EXD_GROUP_PROC && type == AC_PROC ||
 		    c == EXD_GROUP_TASK && type == AC_TASK ||
-		    c == EXD_GROUP_FLOW && type == AC_FLOW))) {
+		    c == EXD_GROUP_FLOW && type == AC_FLOW ||
+		    (c == EXD_GROUP_NET_LINK_DESC ||
+		    c == EXD_GROUP_NET_FLOW_DESC ||
+		    c == EXD_GROUP_NET_LINK_STATS ||
+		    c == EXD_GROUP_NET_FLOW_STATS) &&
+		    type == AC_NET))) {
 			(void) ea_close(&ef);
 			return (B_FALSE);
 		}
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
index 4924d2fe4e..69e91758ea 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/Makefile
@@ -22,7 +22,6 @@
 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
-# ident "%Z%%M% %I% %E% SMI"
 #
 
 PROG = ifconfig
@@ -39,7 +38,7 @@ COMMONSRCS= $(CMDINETCOMMONDIR)/$(COMMONOBJS:%.o=%.c)
 SRCS= $(LOCALSRCS) $(COMMONSRCS)
 
 CPPFLAGS += -I$(CMDINETCOMMONDIR) -I$(SRC)/common/net/dhcp
-LDLIBS += -ldhcpagent -linetcfg -ldlpi
+LDLIBS += -ldhcpagent -linetcfg -ldlpi -ldladm
 LINTFLAGS += -m
 
 ROOTUSRSBINLINKS = $(PROG:%=$(ROOTUSRSBIN)/%)
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
index b33fc6c1b6..79e2991164 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
@@ -13,6 +13,7 @@
 #include "ifconfig.h"
 #include <compat.h>
 #include <libdlpi.h>
+#include <libdllink.h>
 #include <inet/ip.h>
 #include <inet/ipsec_impl.h>
@@ -4499,7 +4500,11 @@ static boolean_t
 ni_entry(const char *linkname, void *arg)
 {
 	dlpi_handle_t dh;
+	datalink_class_t class;
 
+	(void) dladm_name2info(linkname, NULL, NULL, &class, NULL);
+	if (class == DATALINK_CLASS_ETHERSTUB)
+		return (_B_FALSE);
 	if (dlpi_open(linkname, &dh, 0) != DLPI_SUCCESS)
 		return (_B_FALSE);
diff --git a/usr/src/cmd/dladm/Makefile b/usr/src/cmd/dladm/Makefile
index 94e6842ff3..6757c63d89 100644
--- a/usr/src/cmd/dladm/Makefile
+++ b/usr/src/cmd/dladm/Makefile
@@ -22,7 +22,6 @@
 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
-#ident "%Z%%M% %I% %E% SMI"
 #
 
 PROG= dladm
@@ -35,6 +34,7 @@ ROOTCFGFILES= $(CFGFILES:%=$(ROOTCFGDIR)/%)
 include ../Makefile.cmd
 
 XGETFLAGS += -a -x $(PROG).xcl
+LDLIBS += -L$(ROOT)/lib -lsocket
 LDLIBS += -ldladm -ldlpi -lkstat -lsecdb -lbsm -linetutil -ldevinfo
 
 $(ROOTCFGFILES) := OWNER= dladm
diff --git a/usr/src/cmd/dladm/dladm.c b/usr/src/cmd/dladm/dladm.c
index 466adfe6c0..9422a31da3 100644
--- a/usr/src/cmd/dladm/dladm.c
+++ b/usr/src/cmd/dladm/dladm.c
@@ -46,7 +46,9 @@
 #include <libintl.h>
 #include <libdevinfo.h>
 #include <libdlpi.h>
+#include <libdladm.h>
 #include <libdllink.h>
+#include <libdlstat.h>
 #include <libdlaggr.h>
 #include <libdlwlan.h>
 #include <libdlvlan.h>
@@ -54,11 +56,18 @@
 #include <libinetutil.h>
 #include <bsm/adt.h>
 #include <bsm/adt_event.h>
+#include <libdlvnic.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/processor.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <net/if_types.h>
 #include <stddef.h>
 
-#define AGGR_DRV "aggr"
 #define STR_UNDEF_VAL "--"
 #define MAXPORT 256
+#define MAXVNIC 256
 #define BUFLEN(lim, ptr) (((lim) > (ptr)) ? ((lim) - (ptr)) : 0)
 #define MAXLINELEN 1024
 #define SMF_UPGRADE_FILE "/var/svc/profile/upgrade"
@@ -131,9 +140,7 @@
  * with a callback function that will be called for each field to be printed.
  * The callback function will be passed a pointer to the print_field_t
  * for the field, and the pf_index may then be used to identify the
- * system call required to find the value to be printed. An example of
- * this implementation may be found in the do_show_dev() and print_dev()
- * invocation.
+ * system call required to find the value to be printed.
  */
 
 typedef struct print_field_s {
@@ -192,15 +199,6 @@ static char *dladm_print_field(print_field_t *, void *);
 
 #define MAX_FIELD_LEN 32
 
-typedef struct pktsum_s {
-	uint64_t ipackets;
-	uint64_t opackets;
-	uint64_t rbytes;
-	uint64_t obytes;
-	uint32_t ierrors;
-	uint32_t oerrors;
-} pktsum_t;
-
 typedef struct show_state {
 	boolean_t ls_firstonly;
 	boolean_t ls_donefirst;
@@ -210,6 +208,8 @@ typedef struct show_state {
 	print_state_t ls_print;
 	boolean_t ls_parseable;
 	boolean_t ls_printheader;
+	boolean_t ls_mac;
+	boolean_t ls_hwgrp;
 } show_state_t;
 
 typedef struct show_grp_state {
@@ -226,9 +226,37 @@ typedef struct show_grp_state {
 	print_state_t gs_print;
 } show_grp_state_t;
 
+typedef struct show_vnic_state {
+	datalink_id_t vs_vnic_id;
+	datalink_id_t vs_link_id;
+	char vs_vnic[MAXLINKNAMELEN];
+	char vs_link[MAXLINKNAMELEN];
+	boolean_t vs_parseable;
+	boolean_t vs_printheader;
+	boolean_t vs_found;
+	boolean_t vs_firstonly;
+	boolean_t vs_donefirst;
+	boolean_t vs_stats;
+	boolean_t vs_printstats;
+	pktsum_t vs_totalstats;
+	pktsum_t vs_prevstats[MAXVNIC];
+	boolean_t vs_etherstub;
+	dladm_status_t vs_status;
+	uint32_t vs_flags;
+	print_state_t vs_print;
+} show_vnic_state_t;
+
+typedef struct show_usage_state_s {
+	boolean_t us_plot;
+	boolean_t us_parseable;
+	boolean_t us_printheader;
+	boolean_t us_first;
+	print_state_t us_print;
+} show_usage_state_t;
+
 typedef void cmdfunc_t(int, char **, const char *);
 
-static cmdfunc_t do_show_link, do_show_dev, do_show_wifi, do_show_phys;
+static cmdfunc_t do_show_link, do_show_wifi, do_show_phys;
 static cmdfunc_t do_create_aggr, do_delete_aggr, do_add_aggr, do_remove_aggr;
 static cmdfunc_t do_modify_aggr, do_show_aggr, do_up_aggr;
 static cmdfunc_t do_scan_wifi, do_connect_wifi, do_disconnect_wifi;
@@ -239,21 +267,25 @@ static cmdfunc_t do_create_vlan, do_delete_vlan, do_up_vlan, do_show_vlan;
 static cmdfunc_t do_rename_link, do_delete_phys, do_init_phys;
 static cmdfunc_t do_show_linkmap;
 static cmdfunc_t do_show_ether;
+static cmdfunc_t do_create_vnic, do_delete_vnic, do_show_vnic;
+static cmdfunc_t do_up_vnic;
+static cmdfunc_t do_create_etherstub, do_delete_etherstub, do_show_etherstub;
+static cmdfunc_t do_show_usage;
+
+static void do_up_vnic_common(int, char **, const char *, boolean_t);
 
 static void altroot_cmd(char *, int, char **);
 static int show_linkprop_onelink(datalink_id_t, void *);
 
 static void link_stats(datalink_id_t, uint_t, char *, show_state_t *);
 static void aggr_stats(datalink_id_t, show_grp_state_t *, uint_t);
-static void dev_stats(const char *dev, uint32_t, char *, show_state_t *);
+static void vnic_stats(show_vnic_state_t *, uint32_t);
 
 static int get_one_kstat(const char *, const char *, uint8_t,
     void *, boolean_t);
 static void get_mac_stats(const char *, pktsum_t *);
 static void get_link_stats(const char *, pktsum_t *);
 static uint64_t get_ifspeed(const char *, boolean_t);
-static void stats_total(pktsum_t *, pktsum_t *, pktsum_t *);
-static void stats_diff(pktsum_t *, pktsum_t *, pktsum_t *);
 static const char *get_linkstate(const char *, boolean_t, char *);
 static const char *get_linkduplex(const char *, boolean_t, char *);
 
@@ -286,8 +318,6 @@ static cmd_t cmds[] = {
 	    "\tshow-link\t[-pP] [-o <field>,..] [-s [-i <interval>]] [<link>]"},
 	{ "rename-link", do_rename_link,
 	    "\trename-link\t[-R <root-dir>] <oldlink> <newlink>\n" },
-	{ "show-dev", do_show_dev,
-	    "\tshow-dev\t[-p] [-o <field>,..] [-s [-i <interval>]] [<dev>]\n" },
 	{ "create-aggr", do_create_aggr,
 	    "\tcreate-aggr\t[-t] [-R <root-dir>] [-P <policy>] [-L <mode>]\n"
 	    "\t\t\t[-T <time>] [-u <address>] [-l <link>] ... <link>" },
@@ -343,9 +373,30 @@ static cmd_t cmds[] = {
 	{ "delete-phys", do_delete_phys,
 	    "\tdelete-phys\t<link>" },
 	{ "show-phys", do_show_phys,
-	    "\tshow-phys\t[-pP] [-o <field>,..] [<link>]" },
+	    "\tshow-phys\t[-pP] [-o <field>,..] [-H] [<link>]" },
 	{ "init-phys", do_init_phys, NULL },
-	{ "show-linkmap", do_show_linkmap, NULL }
+	{ "show-linkmap", do_show_linkmap, NULL },
+	{ "create-vnic", do_create_vnic,
+	    "\tcreate-vnic [-t] [-R <root-dir>] -l <link> [-m <value> |"
+	    " auto |\n"
+	    "\t {factory [-n <slot-identifier>]} |\n"
+	    "\t {random [-r <prefix>]}] [-v vlan-tag [-f]]\n"
+	    "\t -p <prop>=<value>[,...] [-H]"
+	    " <vnic-link>\n" },
+	{ "delete-vnic", do_delete_vnic,
+	    "\tdelete-vnic [-t] [-R <root-dir>] <vnic-link>\n" },
+	{ "show-vnic", do_show_vnic,
+	    "\tshow-vnic [-pP] [-l <link>] [-s [-i <interval>]]" },
+	{ "up-vnic", do_up_vnic, NULL },
+	{ "create-etherstub", do_create_etherstub,
+	    "\tcreate-etherstub [-t] [-R <root-dir>] <link>\n" },
+	{ "delete-etherstub", do_delete_etherstub,
+	    "\tdelete-etherstub [-t] [-R <root-dir>] <link>\n" },
+	{ "show-etherstub", do_show_etherstub,
+	    "\tshow-etherstub [-t] [-R <root-dir>] [<link>]\n" },
+	{ "show-usage", do_show_usage,
+	    "\tshow-usage [-d|-p -F <format>] [-f <filename>]\n"
+	    "\t [-s <time>] [-e <time>] <link>\n" }
 };
 
 static const struct option lopts[] = {
@@ -360,11 +411,15 @@ static const struct option lopts[] = {
 	{"root-dir", required_argument, 0, 'R'},
 	{"link", required_argument, 0, 'l'},
 	{"forcible", no_argument, 0, 'f'},
+	{"bw-limit", required_argument, 0, 'b'},
+	{"mac-address", required_argument, 0, 'm'},
+	{"slot", required_argument, 0, 'n'},
 	{ 0, 0, 0, 0 }
 };
 
 static const struct option show_lopts[] = {
 	{"statistics", no_argument, 0, 's'},
+	{"continuous", no_argument, 0, 'S'},
 	{"interval", required_argument, 0, 'i'},
 	{"parseable", no_argument, 0, 'p'},
 	{"extended", no_argument, 0, 'x'},
@@ -409,6 +464,24 @@ static const struct option showeth_lopts[] = {
 	{ 0, 0, 0, 0 }
 };
 
+static const struct option vnic_lopts[] = {
+	{"temporary", no_argument, 0, 't' },
+	{"root-dir", required_argument, 0, 'R' },
+	{"dev", required_argument, 0, 'd' },
+	{"mac-address", required_argument, 0, 'm' },
+	{"cpus", required_argument, 0, 'c' },
+	{"bw-limit", required_argument, 0, 'b' },
+	{"slot", required_argument, 0, 'n' },
+	{"mac-prefix", required_argument, 0, 'r' },
+	{ 0, 0, 0, 0 }
+};
+
+static const struct option etherstub_lopts[] = {
+	{"temporary", no_argument, 0, 't' },
+	{"root-dir", required_argument, 0, 'R' },
+	{ 0, 0, 0, 0 }
+};
+
 /*
  * structures for 'dladm show-ether'
 */
@@ -451,26 +524,7 @@ typedef struct print_ether_state {
 } print_ether_state_t;
 
 /*
- * structures for 'dladm show-dev'.
- */
-typedef enum {
-	DEV_LINK,
-	DEV_STATE,
-	DEV_SPEED,
-	DEV_DUPLEX
-} dev_field_index_t;
-
-static print_field_t dev_fields[] = {
-/* name, header, field width, index, cmdtype */
-{ "link", "LINK", 15, DEV_LINK, CMD_TYPE_ANY},
-{ "state", "STATE", 6, DEV_STATE, CMD_TYPE_ANY},
-{ "speed", "SPEED", 8, DEV_SPEED, CMD_TYPE_ANY},
-{ "duplex", "DUPLEX", 8, DEV_DUPLEX, CMD_TYPE_ANY}}
-;
-#define DEV_MAX_FIELDS (sizeof (dev_fields) / sizeof (print_field_t))
-
-/*
- * structures for 'dladm show-dev -s' (print statistics)
+ * structures for 'dladm show-link -s' (print statistics)
  */
 typedef enum {
 	DEVS_LINK,
@@ -493,12 +547,6 @@ static print_field_t devs_fields[] = {
 { "oerrors", "OERRORS", 8, DEVS_OERRORS, CMD_TYPE_ANY}}
 ;
 #define DEVS_MAX_FIELDS (sizeof (devs_fields) / sizeof (print_field_t))
-typedef struct dev_args_s {
-	char *devs_link;
-	pktsum_t *devs_psum;
-} dev_args_t;
-static char *print_dev_stats(print_field_t *, void *);
-static char *print_dev(print_field_t *, void *);
 
 /*
  * buffer used by print functions for show-{link,phys,vlan} commands.
@@ -635,10 +683,10 @@ static print_field_t aggr_s_fields[] = {
     CMD_TYPE_ANY}}
 ;
 #define AGGR_S_MAX_FIELDS \
-	(sizeof (aggr_l_fields) / sizeof (print_field_t))
+	(sizeof (aggr_s_fields) / sizeof (print_field_t))
 
 /*
- * structures for 'dladm show-dev -L'.
+ * structures for 'dladm show-aggr -L'.
  */
 typedef enum {
 	AGGR_L_LINK,
@@ -697,6 +745,50 @@ static print_field_t phys_fields[] = {
 #define PHYS_MAX_FIELDS (sizeof (phys_fields) / sizeof (print_field_t))
 
 /*
+ * structures for 'dladm show-phys -m'
+ */
+
+typedef enum {
+	PHYS_M_LINK,
+	PHYS_M_SLOT,
+	PHYS_M_ADDRESS,
+	PHYS_M_INUSE,
+	PHYS_M_CLIENT
+} phys_m_field_index_t;
+
+static print_field_t phys_m_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12, PHYS_M_LINK, CMD_TYPE_ANY},
+{ "slot", "SLOT", 8, PHYS_M_SLOT, CMD_TYPE_ANY},
+{ "address", "ADDRESS", 18, PHYS_M_ADDRESS, CMD_TYPE_ANY},
+{ "inuse", "INUSE", 4, PHYS_M_INUSE, CMD_TYPE_ANY},
+{ "client", "CLIENT", 12, PHYS_M_CLIENT, CMD_TYPE_ANY}}
+;
+#define PHYS_M_MAX_FIELDS (sizeof (phys_m_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-phys -H'
+ */
+
+typedef enum {
+	PHYS_H_LINK,
+	PHYS_H_GROUP,
+	PHYS_H_GRPTYPE,
+	PHYS_H_RINGS,
+	PHYS_H_CLIENTS
+} phys_h_field_index_t;
+
+static print_field_t phys_h_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12, PHYS_H_LINK, CMD_TYPE_ANY},
+{ "group", "GROUP", 8, PHYS_H_GROUP, CMD_TYPE_ANY},
+{ "grouptype", "TYPE", 6, PHYS_H_GRPTYPE, CMD_TYPE_ANY},
+{ "rings", "NUM-RINGS", 16, PHYS_H_RINGS, CMD_TYPE_ANY},
+{ "clients", "CLIENTS", 20, PHYS_H_CLIENTS, CMD_TYPE_ANY}}
+;
+#define PHYS_H_MAX_FIELDS (sizeof (phys_h_fields) / sizeof (print_field_t))
+
+/*
  * structures for 'dladm show-vlan'
  */
 static print_field_t vlan_fields[] = {
@@ -712,6 +804,7 @@ static print_field_t vlan_fields[] = {
 ;
 #define VLAN_MAX_FIELDS (sizeof (vlan_fields) / sizeof (print_field_t))
 
+
 /*
  * structures for 'dladm show-wifi'
 */
@@ -764,34 +857,28 @@ static print_field_t linkprop_fields[] = {
 #define LINKPROP_MAX_FIELDS \
 	(sizeof (linkprop_fields) / sizeof (print_field_t))
 
-#define MAX_PROPS 32
 #define MAX_PROP_LINE 512
 
-typedef struct prop_info {
-	char *pi_name;
-	char *pi_val[DLADM_MAX_PROP_VALCNT];
-	uint_t pi_count;
-} prop_info_t;
-
-typedef struct prop_list {
-	prop_info_t pl_info[MAX_PROPS];
-	uint_t pl_count;
-	char *pl_buf;
-} prop_list_t;
-
 typedef struct show_linkprop_state {
-	char ls_link[MAXLINKNAMELEN];
-	char *ls_line;
-	char **ls_propvals;
-	prop_list_t *ls_proplist;
-	boolean_t ls_parseable;
-	boolean_t ls_persist;
-	boolean_t ls_header;
-	dladm_status_t ls_status;
-	dladm_status_t ls_retstatus;
-	print_state_t ls_print;
+	char		ls_link[MAXLINKNAMELEN];
+	char		*ls_line;
+	char		**ls_propvals;
+	dladm_arg_list_t *ls_proplist;
+	boolean_t	ls_parseable;
+	boolean_t	ls_persist;
+	boolean_t	ls_header;
+	dladm_status_t	ls_status;
+	dladm_status_t	ls_retstatus;
+	print_state_t	ls_print;
 } show_linkprop_state_t;
 
+typedef struct set_linkprop_state {
+	const char	*ls_name;
+	boolean_t	ls_reset;
+	boolean_t	ls_temp;
+	dladm_status_t	ls_status;
+} set_linkprop_state_t;
+
 typedef struct linkprop_args_s {
 	show_linkprop_state_t *ls_state;
 	char *ls_propname;
@@ -817,9 +904,108 @@ static print_field_t secobj_fields[] = {
 ;
 #define DEV_SOBJ_FIELDS (sizeof (secobj_fields) / sizeof (print_field_t))
 
+/*
+ * structures for 'dladm show-vnic'
+ */
+typedef struct vnic_fields_buf_s
+{
+	char vnic_link[DLPI_LINKNAME_MAX];
+	char vnic_over[DLPI_LINKNAME_MAX];
+	char vnic_speed[6];
+	char vnic_macaddr[19];
+	char vnic_macaddrtype[19];
+	char vnic_vid[6];
+} vnic_fields_buf_t;
+
+static print_field_t vnic_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12,
+    offsetof(vnic_fields_buf_t, vnic_link), CMD_TYPE_ANY},
+{ "over", "OVER", 12,
+    offsetof(vnic_fields_buf_t, vnic_over), CMD_TYPE_ANY},
+{ "speed", "SPEED", 6,
+    offsetof(vnic_fields_buf_t, vnic_speed), CMD_TYPE_ANY},
+{ "macaddr", "MACADDRESS", 20,
+    offsetof(vnic_fields_buf_t, vnic_macaddr), CMD_TYPE_ANY},
+{ "macaddrtype", "MACADDRTYPE", 19,
+    offsetof(vnic_fields_buf_t, vnic_macaddrtype), CMD_TYPE_ANY},
+{ "vid", "VID", 6,
+    offsetof(vnic_fields_buf_t, vnic_vid), CMD_TYPE_ANY}}
+;
+#define VNIC_MAX_FIELDS (sizeof (vnic_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-usage'
+ */
+
+typedef struct usage_fields_buf_s {
+	char usage_link[12];
+	char usage_duration[10];
+	char usage_ipackets[9];
+	char usage_rbytes[10];
+	char usage_opackets[9];
+	char usage_obytes[10];
+	char usage_bandwidth[14];
+} usage_fields_buf_t;
+
+static print_field_t usage_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12,
+    offsetof(usage_fields_buf_t, usage_link), CMD_TYPE_ANY},
+{ "duration", "DURATION", 10,
+    offsetof(usage_fields_buf_t, usage_duration), CMD_TYPE_ANY},
+{ "ipackets", "IPACKETS", 9,
+    offsetof(usage_fields_buf_t, usage_ipackets), CMD_TYPE_ANY},
+{ "rbytes", "RBYTES", 10,
+    offsetof(usage_fields_buf_t, usage_rbytes), CMD_TYPE_ANY},
+{ "opackets", "OPACKETS", 9,
+    offsetof(usage_fields_buf_t, usage_opackets), CMD_TYPE_ANY},
+{ "obytes", "OBYTES", 10,
+    offsetof(usage_fields_buf_t, usage_obytes), CMD_TYPE_ANY},
+{ "bandwidth", "BANDWIDTH", 14,
+    offsetof(usage_fields_buf_t, usage_bandwidth), CMD_TYPE_ANY}}
+;
+
+#define USAGE_MAX_FIELDS (sizeof (usage_fields) / sizeof (print_field_t))
+
+/*
+ * structures for 'dladm show-usage link'
+ */
+
+typedef struct usage_l_fields_buf_s {
+	char usage_l_link[12];
+	char usage_l_stime[13];
+	char usage_l_etime[13];
+	char usage_l_rbytes[8];
+	char usage_l_obytes[8];
+	char usage_l_bandwidth[14];
+} usage_l_fields_buf_t;
+
+static print_field_t usage_l_fields[] = {
+/* name, header, field width, offset, cmdtype */
+{ "link", "LINK", 12,
+    offsetof(usage_l_fields_buf_t, usage_l_link), CMD_TYPE_ANY},
+{ "start", "START", 13,
+    offsetof(usage_l_fields_buf_t, usage_l_stime), CMD_TYPE_ANY},
+{ "end", "END", 13,
+    offsetof(usage_l_fields_buf_t, usage_l_etime), CMD_TYPE_ANY},
+{ "rbytes", "RBYTES", 8,
+    offsetof(usage_l_fields_buf_t, usage_l_rbytes), CMD_TYPE_ANY},
+{ "obytes", "OBYTES", 8,
+    offsetof(usage_l_fields_buf_t, usage_l_obytes), CMD_TYPE_ANY},
+{ "bandwidth", "BANDWIDTH", 14,
+    offsetof(usage_l_fields_buf_t, usage_l_bandwidth), CMD_TYPE_ANY}}
+;
+
+#define USAGE_L_MAX_FIELDS \
+	(sizeof (usage_l_fields) /sizeof (print_field_t))
+
 static char *progname;
 static sig_atomic_t signalled;
 
+#define DLADM_ETHERSTUB_NAME "etherstub"
+#define DLADM_IS_ETHERSTUB(id) (id == DATALINK_INVALID_LINKID)
+
 static void
 usage(void)
 {
@@ -867,6 +1053,254 @@ main(int argc, char *argv[])
 	return (0);
 }
 
+/*ARGSUSED*/
+static int
+show_usage_date(dladm_usage_t *usage, void *arg)
+{
+
+	time_t stime;
+	char timebuf[20];
+
+	stime = usage->du_stime;
+	(void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y",
+	    localtime(&stime));
+	(void) printf("%s\n", timebuf);
+
+	return (DLADM_STATUS_OK);
+}
+
+static int
+show_usage_time(dladm_usage_t *usage, void *arg)
+{
+	show_usage_state_t *state = (show_usage_state_t *)arg;
+	char buf[DLADM_STRSIZE];
+	usage_l_fields_buf_t ubuf;
+	time_t time;
+	double bw;
+
+	if (state->us_plot) {
+		if (!state->us_printheader) {
+			if (state->us_first) {
+				(void) printf("# Time");
+				state->us_first = B_FALSE;
+			}
+			(void) printf(" %s", usage->du_name);
+			if (usage->du_last) {
+				(void) printf("\n");
+				state->us_first = B_TRUE;
+				state->us_printheader = B_TRUE;
+			}
+		} else {
+			if (state->us_first) {
+				time = usage->du_etime;
+				(void) strftime(buf, sizeof (buf), "%T",
+				    localtime(&time));
+				state->us_first = B_FALSE;
+				(void) printf("%s", buf);
+			}
+			bw = (double)usage->du_bandwidth/1000;
+			(void) printf(" %.2f", bw);
+			if (usage->du_last) {
+				(void) printf("\n");
+				state->us_first = B_TRUE;
+			}
+		}
+		return (DLADM_STATUS_OK);
+	}
+
+	bzero(&ubuf, sizeof (ubuf));
+
+	(void) snprintf(ubuf.usage_l_link, sizeof (ubuf.usage_l_link), "%s",
+	    usage->du_name);
+	time = usage->du_stime;
+	(void) strftime(buf, sizeof (buf), "%T", localtime(&time));
+	(void) snprintf(ubuf.usage_l_stime, sizeof (ubuf.usage_l_stime), "%s",
+	    buf);
+	time = usage->du_etime;
+	(void) strftime(buf, sizeof (buf), "%T", localtime(&time));
+	(void) snprintf(ubuf.usage_l_etime, sizeof (ubuf.usage_l_etime), "%s",
+	    buf);
+	(void) snprintf(ubuf.usage_l_rbytes, sizeof (ubuf.usage_l_rbytes),
+	    "%llu", usage->du_rbytes);
+	(void) snprintf(ubuf.usage_l_obytes, sizeof (ubuf.usage_l_obytes),
+	    "%llu", usage->du_obytes);
+	(void) snprintf(ubuf.usage_l_bandwidth, sizeof (ubuf.usage_l_bandwidth),
+	    "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf));
+
+	if (!state->us_parseable && !state->us_printheader) {
+		print_header(&state->us_print);
+		state->us_printheader = B_TRUE;
+	}
+
+	dladm_print_output(&state->us_print, state->us_parseable,
+	    dladm_print_field, (void *)&ubuf);
+
+	return (DLADM_STATUS_OK);
+}
+
+static int
+show_usage_res(dladm_usage_t *usage, void *arg)
+{
+	show_usage_state_t *state = (show_usage_state_t *)arg;
+	char buf[DLADM_STRSIZE];
+	usage_fields_buf_t ubuf;
+
+	bzero(&ubuf, sizeof (ubuf));
+
+	(void) snprintf(ubuf.usage_link, sizeof (ubuf.usage_link), "%s",
+	    usage->du_name);
+	(void) snprintf(ubuf.usage_duration, sizeof (ubuf.usage_duration),
+	    "%llu", usage->du_duration);
+	(void) snprintf(ubuf.usage_ipackets, sizeof (ubuf.usage_ipackets),
+	    "%llu", usage->du_ipackets);
+	(void) snprintf(ubuf.usage_rbytes, sizeof (ubuf.usage_rbytes),
+	    "%llu", usage->du_rbytes);
+	(void) snprintf(ubuf.usage_opackets, sizeof (ubuf.usage_opackets),
+	    "%llu", usage->du_opackets);
+	(void) snprintf(ubuf.usage_obytes, sizeof (ubuf.usage_obytes),
+	    "%llu", usage->du_obytes);
+	(void) snprintf(ubuf.usage_bandwidth, sizeof (ubuf.usage_bandwidth),
+	    "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf));
+
+	if (!state->us_parseable && !state->us_printheader) {
+		print_header(&state->us_print);
+		state->us_printheader = B_TRUE;
+	}
+
+	dladm_print_output(&state->us_print, state->us_parseable,
+	    dladm_print_field, (void *)&ubuf);
+
+	return (DLADM_STATUS_OK);
+}
+
+static boolean_t
+valid_formatspec(char *formatspec_str)
+{
+	if (strcmp(formatspec_str, "gnuplot") == 0)
+		return (B_TRUE);
+	return (B_FALSE);
+
+}
+
+/*ARGSUSED*/
+static void
+do_show_usage(int argc, char *argv[], const char *use)
+{
+	char *file = NULL;
+	int opt;
+	dladm_status_t status;
+	boolean_t d_arg = B_FALSE;
+	boolean_t p_arg = B_FALSE;
+	char *stime = NULL;
+	char *etime = NULL;
+	char *resource = NULL;
+	show_usage_state_t state;
+	boolean_t o_arg = B_FALSE;
+	boolean_t F_arg = B_FALSE;
+	char *fields_str = NULL;
+	char *formatspec_str = NULL;
+	print_field_t **fields;
+	uint_t nfields;
+	char *all_fields =
+	    "link,duration,ipackets,rbytes,opackets,obytes,bandwidth";
+	char *all_l_fields =
+	    "link,start,end,rbytes,obytes,bandwidth";
+
+	bzero(&state, sizeof (show_usage_state_t));
+	state.us_parseable = B_FALSE;
+	state.us_printheader = B_FALSE;
+	state.us_plot = B_FALSE;
+	state.us_first = B_TRUE;
+
+	while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) {
+		switch (opt) {
+		case 'd':
+			d_arg = B_TRUE;
+			break;
+		case 'p':
+			state.us_plot = p_arg = B_TRUE;
+			break;
+		case 'f':
+			file = optarg;
+			break;
+		case 's':
+			stime = optarg;
+			break;
+		case 'e':
+			etime = optarg;
+			break;
+		case 'o':
+			o_arg = B_TRUE;
+			fields_str = optarg;
+			break;
+		case 'F':
+			F_arg = B_TRUE;
+			formatspec_str = optarg;
+			break;
+		default:
+			die_opterr(optopt, opt, use);
+			break;
+		}
+	}
+
+	if (file == NULL)
+		die("show-usage requires a file");
+
+	if (optind == (argc-1)) {
+		resource = argv[optind];
+	}
+
+	if (resource == NULL && stime == NULL && etime == NULL) {
+		if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+			fields_str = all_fields;
+		fields = parse_output_fields(fields_str, usage_fields,
+		    USAGE_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+	} else {
+		if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+			fields_str = all_l_fields;
+		fields = parse_output_fields(fields_str, usage_l_fields,
+		    USAGE_L_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+	}
+
+	if (fields == NULL) {
+		die("invalid fields(s) specified");
+		return;
+	}
+	state.us_print.ps_fields = fields;
+	state.us_print.ps_nfields = nfields;
+
+	if (p_arg && d_arg)
+		die("plot and date options are incompatible");
+
+	if (p_arg && !F_arg)
+		die("specify format speicifier: -F <format>");
+
+	if (F_arg && valid_formatspec(formatspec_str) == B_FALSE)
+		die("Format specifier %s not supported", formatspec_str);
+
+	if (d_arg) {
+		/* Print log dates */
+		status = dladm_usage_dates(show_usage_date,
+		    DLADM_LOGTYPE_LINK, file, resource, &state);
+	} else if (resource == NULL && stime == NULL && etime == NULL &&
+	    !p_arg) {
+		/* Print summary */
+		status = dladm_usage_summary(show_usage_res,
+		    DLADM_LOGTYPE_LINK, file, &state);
+	} else if (resource != NULL) {
+		/* Print log entries for named resource */
+		status = dladm_walk_usage_res(show_usage_time,
+		    DLADM_LOGTYPE_LINK, file, resource, stime, etime, &state);
+	} else {
+		/* Print time and information for each link */
+		status = dladm_walk_usage_time(show_usage_time,
+		    DLADM_LOGTYPE_LINK, file, stime, etime, &state);
+	}
+
+	if (status != DLADM_STATUS_OK)
+		die_dlerr(status, "show-usage");
+}
+
 static void
 do_create_aggr(int argc, char *argv[], const char *use)
 {
@@ -889,9 +1323,13 @@ do_create_aggr(int argc, char *argv[], const char *use)
 	char *devs[MAXPORT];
 	char *links[MAXPORT];
 	dladm_status_t status;
+	dladm_status_t pstatus;
+	dladm_arg_list_t *proplist = NULL;
+	int i;
+	datalink_id_t linkid;
 
 	ndev = nlink = opterr = 0;
-	while ((option = getopt_long(argc, argv, ":d:l:L:P:R:tfu:T:",
+	while ((option = getopt_long(argc, argv, ":d:l:L:P:R:tfu:T:p:",
 	    lopts, NULL)) != -1) {
 		switch (option) {
 		case 'd':
@@ -955,6 +1393,11 @@ do_create_aggr(int argc, char *argv[], const char *use)
 		case 'R':
 			altroot = optarg;
 			break;
+		case 'p':
+			if (dladm_parse_link_props(optarg, &proplist, B_FALSE)
+			    != DLADM_STATUS_OK)
+				die("invalid aggregation property");
+			break;
 		default:
 			die_opterr(optopt, option, use);
 			break;
@@ -1000,7 +1443,30 @@ do_create_aggr(int argc, char *argv[], const char *use)
 	status = dladm_aggr_create(name, key, ndev + nlink, port, policy,
 	    mac_addr_fixed, (const uchar_t *)mac_addr, lacp_mode,
 	    lacp_timer, flags);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	if (proplist == NULL)
+		return;
+
+	status = dladm_name2info(name, &linkid, NULL, NULL, NULL);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	for (i = 0; i < proplist->al_count; i++) {
+		dladm_arg_info_t *aip = &proplist->al_info[i];
+
+		pstatus = dladm_set_linkprop(linkid, aip->ai_name,
+		    aip->ai_val, aip->ai_count, flags);
+
+		if (pstatus != DLADM_STATUS_OK) {
+			die_dlerr(pstatus,
+			    "aggr creation succeeded but "
+			    "could not set property '%s'", aip->ai_name);
+		}
+	}
 done:
+	dladm_free_props(proplist);
 	if (status != DLADM_STATUS_OK) {
 		if (status == DLADM_STATUS_NONOTIF) {
 			die_dlerr(status, "not all links have link up/down "
@@ -1379,19 +1845,21 @@ done:
 static void
 do_create_vlan(int argc, char *argv[], const char *use)
 {
-	char *link = NULL;
-	char drv[DLPI_LINKNAME_MAX];
-	uint_t ppa;
-	datalink_id_t linkid;
-	int vid = 0;
-	char option;
-	uint32_t flags = (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST);
-	char *altroot = NULL;
-	char vlan[MAXLINKNAMELEN];
-	dladm_status_t status;
+	char		*link = NULL;
+	char		drv[DLPI_LINKNAME_MAX];
+	uint_t		ppa;
+	datalink_id_t	linkid;
+	datalink_id_t	dev_linkid;
+	int		vid = 0;
+	char		option;
+	uint32_t	flags = (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST);
+	char		*altroot = NULL;
+	char		vlan[MAXLINKNAMELEN];
+	dladm_arg_list_t	*proplist = NULL;
+	dladm_status_t	status;
 
 	opterr = 0;
-	while ((option = getopt_long(argc, argv, ":tfl:v:",
+	while ((option = getopt_long(argc, argv, ":tfR:l:v:p:",
 	    lopts, NULL)) != -1) {
 		switch (option) {
 		case 'v':
@@ -1408,15 +1876,21 @@ do_create_vlan(int argc, char *argv[], const char *use)
 			link = optarg;
 			break;
-		case 'f':
-			flags |= DLADM_OPT_FORCE;
-			break;
 		case 't':
 			flags &= ~DLADM_OPT_PERSIST;
 			break;
 		case 'R':
 			altroot = optarg;
 			break;
+		case 'p':
+			if (dladm_parse_link_props(optarg, &proplist, B_FALSE)
+			    != DLADM_STATUS_OK) {
+				die("invalid vlan property");
+			}
+			break;
+		case 'f':
+			flags |= DLADM_OPT_FORCE;
+			break;
 		default:
			die_opterr(optopt, option, use);
 			break;
@@ -1444,19 +1918,14 @@ do_create_vlan(int argc, char *argv[], const char *use)
 	if (altroot != NULL)
 		altroot_cmd(altroot, argc, argv);
 
-	if (dladm_name2info(link, &linkid, NULL, NULL, NULL) !=
+	if (dladm_name2info(link, &dev_linkid, NULL, NULL, NULL) !=
 	    DLADM_STATUS_OK) {
 		die("invalid link name '%s'", link);
 	}
 
-	if ((status = dladm_vlan_create(vlan, linkid, vid, flags)) !=
-	    DLADM_STATUS_OK) {
-		if (status == DLADM_STATUS_NOTSUP) {
-			die_dlerr(status, "VLAN over '%s' may require lowered "
-			    "MTU; must use -f (see dladm(1M))\n", link);
-		} else {
-			die_dlerr(status, "create operation failed");
-		}
+	if ((status = dladm_vlan_create(vlan, dev_linkid, vid, proplist, flags,
+	    &linkid)) != DLADM_STATUS_OK) {
+		die_dlerr(status, "create operation over %s failed", link);
 	}
 }
 
@@ -1505,31 +1974,7 @@ done:
 static void
 do_up_vlan(int argc, char *argv[], const char *use)
 {
-	datalink_id_t linkid = DATALINK_ALL_LINKID;
-	dladm_status_t status;
-
-	/*
-	 * get the name of the VLAN (optional last argument)
-	 */
-	if (argc > 2)
-		usage();
-
-	if (argc == 2) {
-		status = dladm_name2info(argv[1], &linkid, NULL, NULL, NULL);
-		if (status != DLADM_STATUS_OK)
-			goto done;
-	}
-
-	status = dladm_vlan_up(linkid);
-done:
-	if (status != DLADM_STATUS_OK) {
-		if (argc == 2) {
-			die_dlerr(status,
-			    "could not bring up VLAN '%s'", argv[1]);
-		} else {
-			die_dlerr(status, "could not bring VLANs up");
-		}
-	}
+	do_up_vnic_common(argc, argv, use, B_TRUE);
 }
 
 static void
@@ -1724,7 +2169,7 @@ print_link_topology(show_state_t *state, datalink_id_t linkid,
 		}
 		free(ginfo.lg_ports);
 	} else if (class == DATALINK_CLASS_VNIC) {
-		dladm_vnic_attr_sys_t vinfo;
+		dladm_vnic_attr_t vinfo;
 
 		if ((status = dladm_vnic_info(linkid, &vinfo, flags)) !=
 		    DLADM_STATUS_OK || (status = dladm_datalink_id2info(
@@ -1816,7 +2261,6 @@ done:
 	return (status);
 }
 
-
 static int
 show_link(datalink_id_t linkid, void *arg)
 {
@@ -1854,7 +2298,6 @@ show_link_stats(datalink_id_t linkid, void *arg)
 	show_state_t *state = (show_state_t *)arg;
 	pktsum_t stats, diff_stats;
 	dladm_phys_attr_t dpa;
-	dev_args_t largs;
 
 	if (state->ls_firstonly) {
 		if (state->ls_donefirst)
@@ -1881,12 +2324,15 @@ show_link_stats(datalink_id_t linkid, void *arg)
 	} else {
 		get_link_stats(link, &stats);
 	}
-	stats_diff(&diff_stats, &stats, &state->ls_prevstats);
+	dladm_stats_diff(&diff_stats, &stats, &state->ls_prevstats);
 
-	largs.devs_link = link;
-	largs.devs_psum = &diff_stats;
-	dladm_print_output(&state->ls_print, state->ls_parseable,
-	    print_dev_stats, &largs);
+	(void) printf("%-12s", link);
+	(void) printf("%-10llu", diff_stats.ipackets);
+	(void) printf("%-12llu", diff_stats.rbytes);
+	(void) printf("%-8llu", diff_stats.ierrors);
+	(void) printf("%-10llu", diff_stats.opackets);
+	(void) printf("%-12llu", diff_stats.obytes);
+	(void) printf("%-8llu\n", diff_stats.oerrors);
 
 	state->ls_prevstats = stats;
 	return (DLADM_WALK_CONTINUE);
@@ -2192,7 +2638,7 @@ print_aggr_stats_callback(print_field_t *pf, void *arg)
 			goto err;
 		}
 
-		stats_diff(&diff_stats, &port_stat, l->laggr_prevstats);
+		dladm_stats_diff(&diff_stats, &port_stat, l->laggr_prevstats);
 	}
 
 	switch (pf->pf_index) {
@@ -2296,7 +2742,8 @@ print_aggr_stats(show_grp_state_t *state, const char *link,
 		}
 
 		get_mac_stats(dpa.dp_dev, &port_stat);
-		stats_total(&pktsumtot, &port_stat, &state->gs_prevstats[i]);
+		dladm_stats_total(&pktsumtot, &port_stat,
+		    &state->gs_prevstats[i]);
 	}
 
 	if (!state->gs_parseable && !state->gs_printheader) {
@@ -2381,127 +2828,17 @@ done:
 	return (DLADM_WALK_CONTINUE);
 }
 
-static char *
-print_dev(print_field_t *pf, void *arg)
-{
-	const char *dev = arg;
-	static char buf[DLADM_STRSIZE];
-
-	switch (pf->pf_index) {
-	case DEV_LINK:
-		(void) snprintf(buf, sizeof (buf), "%s", dev);
-		break;
-	case DEV_STATE:
-		(void) get_linkstate(dev, B_FALSE, buf);
-		break;
-	case DEV_SPEED:
-		(void) snprintf(buf, sizeof (buf), "%uMb",
-		    (unsigned int)(get_ifspeed(dev, B_FALSE) / 1000000ull));
-		break;
-	case DEV_DUPLEX:
-		(void) get_linkduplex(dev, B_FALSE, buf);
-		break;
-	default:
-		die("invalid index '%d'", pf->pf_index);
-		break;
-	}
-	return (buf);
-}
-
-static int
-show_dev(const char *dev, void *arg)
-{
-	show_state_t *state = arg;
-
-	if (!state->ls_parseable && !state->ls_printheader) {
-		print_header(&state->ls_print);
-		state->ls_printheader = B_TRUE;
-	}
-
-	dladm_print_output(&state->ls_print, state->ls_parseable,
-	    print_dev, (void *)dev);
-
-	return (DLADM_WALK_CONTINUE);
-}
-
-static char *
-print_dev_stats(print_field_t *pf, void *arg)
-{
-	dev_args_t *dargs = arg;
-	pktsum_t *diff_stats = dargs->devs_psum;
-	static char buf[DLADM_STRSIZE];
-
-	switch (pf->pf_index) {
-	case DEVS_LINK:
-		(void) snprintf(buf, sizeof (buf), "%s", dargs->devs_link);
-		break;
-	case DEVS_IPKTS:
-		(void) snprintf(buf, sizeof (buf), "%llu",
-		    diff_stats->ipackets);
-		break;
-	case DEVS_RBYTES:
-		(void) snprintf(buf, sizeof (buf), "%llu",
-		    diff_stats->rbytes);
-		break;
-	case DEVS_IERRORS:
-		(void) snprintf(buf, sizeof (buf), "%u",
-		    diff_stats->ierrors);
-		break;
-	case DEVS_OPKTS:
-		(void) snprintf(buf, sizeof (buf), "%llu",
-		    diff_stats->opackets);
-		break;
-	case DEVS_OBYTES:
-		(void) snprintf(buf, sizeof (buf), "%llu",
-		    diff_stats->obytes);
-		break;
-	case DEVS_OERRORS:
-		(void) snprintf(buf, sizeof (buf), "%u",
-		    diff_stats->oerrors);
-		break;
-	default:
-		die("invalid input");
-		break;
-	}
-	return (buf);
-}
-
-static int
-show_dev_stats(const char *dev, void *arg)
-{
-	show_state_t *state = arg;
-	pktsum_t stats, diff_stats;
-	dev_args_t dargs;
-
-	if (state->ls_firstonly) {
-		if (state->ls_donefirst)
-			return (DLADM_WALK_CONTINUE);
-		state->ls_donefirst = B_TRUE;
-	} else {
-		bzero(&state->ls_prevstats, sizeof (state->ls_prevstats));
-	}
-
-	get_mac_stats(dev, &stats);
-	stats_diff(&diff_stats, &stats, &state->ls_prevstats);
-
-	dargs.devs_link = (char *)dev;
-	dargs.devs_psum = &diff_stats;
-	dladm_print_output(&state->ls_print, state->ls_parseable,
-	    print_dev_stats, &dargs);
-
-	state->ls_prevstats = stats;
-	return (DLADM_WALK_CONTINUE);
-}
-
 static void
 do_show_link(int argc, char *argv[], const char *use)
 {
 	int		option;
 	boolean_t	s_arg = B_FALSE;
+	boolean_t	S_arg = B_FALSE;
 	boolean_t	i_arg = B_FALSE;
 	uint32_t	flags = DLADM_OPT_ACTIVE;
 	boolean_t	p_arg = B_FALSE;
 	datalink_id_t	linkid = DATALINK_ALL_LINKID;
+	char		linkname[MAXLINKNAMELEN];
 	int		interval = 0;
 	show_state_t	state;
 	dladm_status_t status;
@@ -2517,7 +2854,7 @@ do_show_link(int argc, char *argv[], const char *use)
 	bzero(&state, sizeof (state));
 
 	opterr = 0;
-	while ((option = getopt_long(argc, argv, ":pPsi:o:",
+	while ((option = getopt_long(argc, argv, ":pPsSi:o:",
 	    show_lopts, NULL)) != -1) {
 		switch (option) {
 		case 'p':
@@ -2538,6 +2875,12 @@ do_show_link(int argc, char *argv[], const char *use)
 			flags = DLADM_OPT_PERSIST;
 			break;
+		case 'S':
+			if (S_arg)
+				die_optdup(option);
+
+			S_arg = B_TRUE;
+			break;
 		case 'o':
 			o_arg = B_TRUE;
 			fields_str = optarg;
@@ -2556,19 +2899,32 @@ do_show_link(int argc, char *argv[], const char *use)
 		}
 	}
 
-	if (i_arg && !s_arg)
-		die("the option -i can be used only with -s");
+	if (i_arg && !(s_arg || S_arg))
+		die("the option -i can be used only with -s or -S");
+
+	if (s_arg && S_arg)
+		die("the -s option cannot be used with -S");
 
 	if (s_arg && flags != DLADM_OPT_ACTIVE)
 		die("the option -P cannot be used with -s");
 
+	if (S_arg && (p_arg || flags != DLADM_OPT_ACTIVE))
+		die("the option -%c cannot be used with -S", p_arg ? 'p' : 'P');
+
 	/* get link name (optional last argument) */
 	if (optind == (argc-1)) {
 		uint32_t f;
 
-		if ((status = dladm_name2info(argv[optind], &linkid, &f,
+		if (strlcpy(linkname, argv[optind], MAXLINKNAMELEN)
+		    >= MAXLINKNAMELEN) {
+			(void) fprintf(stderr,
+			    gettext("%s: link name too long\n"),
+			    progname);
+			exit(1);
+		}
+		if ((status = dladm_name2info(linkname, &linkid, &f,
 		    NULL, NULL)) != DLADM_STATUS_OK) {
-			die_dlerr(status, "link %s is not valid", argv[optind]);
+			die_dlerr(status, "link %s is not valid", linkname);
 		}
 
 		if (!(f & flags)) {
@@ -2583,6 +2939,11 @@ do_show_link(int argc, char *argv[], const char *use)
 	if (p_arg && !o_arg)
 		die("-p requires -o");
 
+	if (S_arg) {
+		dladm_continuous(linkid, NULL, interval, LINK_REPORT);
+		return;
+	}
+
 	if (p_arg && strcasecmp(fields_str, "all") == 0)
 		die("\"-o all\" is invalid with -p");
 
@@ -2604,7 +2965,6 @@ do_show_link(int argc, char *argv[], const char *use)
 		return;
 	}
 
-
 	fields = parse_output_fields(fields_str, link_fields, DEV_LINK_FIELDS,
 	    CMD_TYPE_ANY, &nfields);
 
@@ -2641,17 +3001,17 @@ do_show_aggr(int argc, char *argv[], const char *use)
 	int			interval = 0;
 	int			key;
 	dladm_status_t		status;
-	boolean_t o_arg = B_FALSE;
-	char *fields_str = NULL;
-	print_field_t **fields;
-	uint_t nfields;
-	char *all_fields =
+	boolean_t		o_arg = B_FALSE;
+	char			*fields_str = NULL;
+	print_field_t		**fields;
+	uint_t			nfields;
+	char			*all_fields =
 	    "link,policy,addrpolicy,lacpactivity,lacptimer,flags";
-	char *all_lacp_fields =
+	char			*all_lacp_fields =
 	    "link,port,aggregatable,sync,coll,dist,defaulted,expired";
-	char *all_stats_fields =
+	char			*all_stats_fields =
 	    "link,port,ipackets,rbytes,opackets,obytes,ipktdist,opktdist";
-	char *all_extended_fields =
+	char			*all_extended_fields =
 	    "link,port,speed,duplex,state,address,portstate";
 	print_field_t	*pf;
 	int		pfmax;
@@ -2806,138 +3166,222 @@ do_show_aggr(int argc, char *argv[], const char *use)
 	}
 }
 
-static void
-do_show_dev(int argc, char *argv[], const char *use)
+static dladm_status_t
+print_phys_default(show_state_t *state, datalink_id_t linkid,
+    const char *link, uint32_t flags, uint32_t media)
 {
-	int option;
-	char *dev = NULL;
-	boolean_t s_arg = B_FALSE;
-	boolean_t i_arg = B_FALSE;
-	boolean_t o_arg = B_FALSE;
-	boolean_t p_arg = B_FALSE;
-	datalink_id_t linkid;
-	int interval = 0;
-	show_state_t state;
-	char *fields_str = NULL;
-	print_field_t **fields;
-	uint_t nfields;
-	char *all_fields = "link,state,speed,duplex";
-	static char *allstat_fields =
-	    "link,ipackets,rbytes,ierrors,opackets,obytes,oerrors";
+	dladm_phys_attr_t dpa;
+	dladm_status_t status;
+	link_fields_buf_t pattr;
 
-	bzero(&state, sizeof (state));
-	fields_str = all_fields;
+	status = dladm_phys_info(linkid, &dpa, state->ls_flags);
+	if (status != DLADM_STATUS_OK)
+		goto done;
 
-	opterr = 0;
-	while ((option = getopt_long(argc, argv, ":psi:o:",
-	    show_lopts, NULL)) != -1) {
-		switch (option) {
-		case 'p':
-			if (p_arg)
-				die_optdup(option);
+	(void) snprintf(pattr.link_phys_device,
+	    sizeof (pattr.link_phys_device), "%s", dpa.dp_dev);
+	(void) dladm_media2str(media, pattr.link_phys_media);
+	if (state->ls_flags == DLADM_OPT_ACTIVE) {
+		boolean_t islink;
 
-			p_arg = B_TRUE;
-			break;
-		case 's':
-			if (s_arg)
-				die_optdup(option);
+		if (!dpa.dp_novanity) {
+			(void) strlcpy(pattr.link_name, link,
+			    sizeof (pattr.link_name));
+			islink = B_TRUE;
+		} else {
+			/*
+			 * This is a physical link that does not have
+			 * vanity naming support.
+ */ + (void) strlcpy(pattr.link_name, dpa.dp_dev, + sizeof (pattr.link_name)); + islink = B_FALSE; + } - s_arg = B_TRUE; - break; - case 'o': - o_arg = B_TRUE; - fields_str = optarg; - break; - case 'i': - if (i_arg) - die_optdup(option); + (void) get_linkstate(pattr.link_name, islink, + pattr.link_phys_state); + (void) snprintf(pattr.link_phys_speed, + sizeof (pattr.link_phys_speed), "%u", + (uint_t)((get_ifspeed(pattr.link_name, + islink)) / 1000000ull)); + (void) get_linkduplex(pattr.link_name, islink, + pattr.link_phys_duplex); + } else { + (void) snprintf(pattr.link_name, sizeof (pattr.link_name), + "%s", link); + (void) snprintf(pattr.link_flags, sizeof (pattr.link_flags), + "%c----", flags & DLADM_OPT_ACTIVE ? '-' : 'r'); + } - i_arg = B_TRUE; - if (!str2int(optarg, &interval) || interval == 0) - die("invalid interval value '%s'", optarg); - break; - default: - die_opterr(optopt, option, use); - break; - } + if (!state->ls_parseable && !state->ls_printheader) { + print_header(&state->ls_print); + state->ls_printheader = B_TRUE; } - if (p_arg && !o_arg) - die("-p requires -o"); + dladm_print_output(&state->ls_print, state->ls_parseable, + dladm_print_field, (void *)&pattr); - if (p_arg && strcasecmp(fields_str, "all") == 0) - die("\"-o all\" is invalid with -p"); +done: + return (status); +} - if (i_arg && !s_arg) - die("the option -i can be used only with -s"); +typedef struct { + show_state_t *ms_state; + char *ms_link; + dladm_macaddr_attr_t *ms_mac_attr; +} print_phys_mac_state_t; - if (o_arg && strcasecmp(fields_str, "all") == 0) { - if (!s_arg) - fields_str = all_fields; +/* callback of dladm_print_output() */ +static char * +print_phys_one_mac_callback(print_field_t *pf, void *arg) +{ + print_phys_mac_state_t *mac_state = arg; + dladm_macaddr_attr_t *attr = mac_state->ms_mac_attr; + static char buf[DLADM_STRSIZE]; + boolean_t is_primary = (attr->ma_slot == 0); + boolean_t is_parseable = mac_state->ms_state->ls_parseable; + + switch (pf->pf_index) { + case PHYS_M_LINK: + (void) snprintf(buf, sizeof (buf), "%s", + (is_primary || is_parseable) ? mac_state->ms_link : " "); + break; + case PHYS_M_SLOT: + if (is_primary) + (void) snprintf(buf, sizeof (buf), gettext("primary")); else - fields_str = allstat_fields; + (void) snprintf(buf, sizeof (buf), "%d", attr->ma_slot); + break; + case PHYS_M_ADDRESS: + (void) dladm_aggr_macaddr2str(attr->ma_addr, buf); + break; + case PHYS_M_INUSE: + (void) snprintf(buf, sizeof (buf), "%s", + attr->ma_flags & DLADM_MACADDR_USED ? gettext("yes") : + gettext("no")); + break; + case PHYS_M_CLIENT: + /* + * CR 6678526: resolve link id to actual link name if + * it is valid. 
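A note on the SLOT column produced just above: slot 0 of a NIC's factory MAC address table is reported as the literal string "primary" rather than a number. The labeling rule in isolation (function and type names invented for the example):

#include <stdio.h>

static void
format_slot(char *buf, size_t len, unsigned int slot)
{
	/* Slot 0 holds the device's primary factory MAC address. */
	if (slot == 0)
		(void) snprintf(buf, len, "primary");
	else
		(void) snprintf(buf, len, "%u", slot);
}

int
main(void)
{
	char		buf[16];
	unsigned int	s;

	for (s = 0; s < 3; s++) {
		format_slot(buf, sizeof (buf), s);
		(void) printf("slot %u -> %s\n", s, buf);
	}
	return (0);
}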
+ */ + (void) snprintf(buf, sizeof (buf), "%s", attr->ma_client_name); + break; } - if (!o_arg && s_arg) - fields_str = allstat_fields; - - if (s_arg && p_arg) - die("the option -s cannot be used with -p"); - - /* get dev name (optional last argument) */ - if (optind == (argc-1)) { - uint32_t flags; + return (buf); +} - dev = argv[optind]; +typedef struct { + show_state_t *hs_state; + char *hs_link; + dladm_hwgrp_attr_t *hs_grp_attr; +} print_phys_hwgrp_state_t; - if (dladm_dev2linkid(dev, &linkid) != DLADM_STATUS_OK) - die("invalid device %s", dev); +static char * +print_phys_one_hwgrp_callback(print_field_t *pf, void *arg) +{ + print_phys_hwgrp_state_t *hg_state = arg; + dladm_hwgrp_attr_t *attr = hg_state->hs_grp_attr; + static char buf[DLADM_STRSIZE]; - if ((dladm_datalink_id2info(linkid, &flags, NULL, NULL, - NULL, 0) != DLADM_STATUS_OK) || - !(flags & DLADM_OPT_ACTIVE)) { - die("device %s has been removed", dev); + switch (pf->pf_index) { + case PHYS_H_LINK: + (void) snprintf(buf, sizeof (buf), "%s", attr->hg_link_name); + break; + case PHYS_H_GROUP: + (void) snprintf(buf, sizeof (buf), "%d", attr->hg_grp_num); + break; + case PHYS_H_GRPTYPE: + (void) snprintf(buf, sizeof (buf), "%s", + attr->hg_grp_type == DLADM_HWGRP_TYPE_RX ? "RX" : "TX"); + break; + case PHYS_H_RINGS: + (void) snprintf(buf, sizeof (buf), "%d", attr->hg_n_rings); + break; + case PHYS_H_CLIENTS: + if (attr->hg_client_names[0] == '\0') { + (void) snprintf(buf, sizeof (buf), "--"); + } else { + (void) snprintf(buf, sizeof (buf), "%s ", + attr->hg_client_names); } - } else if (optind != argc) { - usage(); + break; } - state.ls_parseable = p_arg; - state.ls_donefirst = B_FALSE; + return (buf); +} - if (s_arg) { - dev_stats(dev, interval, fields_str, &state); - return; +/* callback of dladm_walk_macaddr, invoked for each MAC address slot */ +static boolean_t +print_phys_mac_callback(void *arg, dladm_macaddr_attr_t *attr) +{ + print_phys_mac_state_t *mac_state = arg; + show_state_t *state = mac_state->ms_state; + + if (!state->ls_parseable && !state->ls_printheader) { + print_header(&state->ls_print); + state->ls_printheader = B_TRUE; } - fields = parse_output_fields(fields_str, dev_fields, DEV_MAX_FIELDS, - CMD_TYPE_ANY, &nfields); + mac_state->ms_mac_attr = attr; + dladm_print_output(&state->ls_print, state->ls_parseable, + print_phys_one_mac_callback, mac_state); - if (fields == NULL) { - die("invalid field(s) specified"); - return; - } + return (B_TRUE); +} - state.ls_print.ps_fields = fields; - state.ls_print.ps_nfields = nfields; +/* invoked by show-phys -m for each physical data-link */ +static dladm_status_t +print_phys_mac(show_state_t *state, datalink_id_t linkid, char *link) +{ + print_phys_mac_state_t mac_state; - if (dev == NULL) { - (void) dladm_mac_walk(show_dev, &state); - } else { - (void) show_dev(dev, &state); + mac_state.ms_state = state; + mac_state.ms_link = link; + + return (dladm_walk_macaddr(linkid, &mac_state, + print_phys_mac_callback)); +} + +/* callback of dladm_walk_hwgrp, invoked for each MAC hwgrp */ +static boolean_t +print_phys_hwgrp_callback(void *arg, dladm_hwgrp_attr_t *attr) +{ + print_phys_hwgrp_state_t *hwgrp_state = arg; + show_state_t *state = hwgrp_state->hs_state; + + if (!state->ls_parseable && !state->ls_printheader) { + print_header(&state->ls_print); + state->ls_printheader = B_TRUE; } + hwgrp_state->hs_grp_attr = attr; + dladm_print_output(&state->ls_print, state->ls_parseable, + print_phys_one_hwgrp_callback, hwgrp_state); + + return (B_TRUE); } +/* invoked by show-phys 
-H for each physical data-link */ +static dladm_status_t +print_phys_hwgrp(show_state_t *state, datalink_id_t linkid, char *link) +{ + print_phys_hwgrp_state_t hwgrp_state; + + hwgrp_state.hs_state = state; + hwgrp_state.hs_link = link; + return (dladm_walk_hwgrp(linkid, &hwgrp_state, + print_phys_hwgrp_callback)); +} static dladm_status_t -print_phys(show_state_t *state, datalink_id_t linkid, link_fields_buf_t *pattr) +print_phys(show_state_t *state, datalink_id_t linkid) { char link[MAXLINKNAMELEN]; - dladm_phys_attr_t dpa; uint32_t flags; + dladm_status_t status; datalink_class_t class; uint32_t media; - dladm_status_t status; if ((status = dladm_datalink_id2info(linkid, &flags, &class, &media, link, MAXLINKNAMELEN)) != DLADM_STATUS_OK) { @@ -2954,44 +3398,12 @@ print_phys(show_state_t *state, datalink_id_t linkid, link_fields_buf_t *pattr) goto done; } - status = dladm_phys_info(linkid, &dpa, state->ls_flags); - if (status != DLADM_STATUS_OK) - goto done; - - (void) snprintf(pattr->link_phys_device, - sizeof (pattr->link_phys_device), "%s", dpa.dp_dev); - (void) dladm_media2str(media, pattr->link_phys_media); - if (state->ls_flags == DLADM_OPT_ACTIVE) { - boolean_t islink; - - if (!dpa.dp_novanity) { - (void) strlcpy(pattr->link_name, link, - sizeof (pattr->link_name)); - islink = B_TRUE; - } else { - /* - * This is a physical link that does not have - * vanity naming support. - */ - (void) strlcpy(pattr->link_name, dpa.dp_dev, - sizeof (pattr->link_name)); - islink = B_FALSE; - } - - (void) get_linkstate(pattr->link_name, islink, - pattr->link_phys_state); - (void) snprintf(pattr->link_phys_speed, - sizeof (pattr->link_phys_speed), "%u", - (uint_t)((get_ifspeed(pattr->link_name, - islink)) / 1000000ull)); - (void) get_linkduplex(pattr->link_name, islink, - pattr->link_phys_duplex); - } else { - (void) snprintf(pattr->link_name, sizeof (pattr->link_name), - "%s", link); - (void) snprintf(pattr->link_flags, sizeof (pattr->link_flags), - "%c----", flags & DLADM_OPT_ACTIVE ? '-' : 'r'); - } + if (state->ls_mac) + status = print_phys_mac(state, linkid, link); + else if (state->ls_hwgrp) + status = print_phys_hwgrp(state, linkid, link); + else + status = print_phys_default(state, linkid, link, flags, media); done: return (status); @@ -3000,29 +3412,12 @@ done: static int show_phys(datalink_id_t linkid, void *arg) { - show_state_t *state = arg; - dladm_status_t status; - link_fields_buf_t pattr; - - bzero(&pattr, sizeof (link_fields_buf_t)); - status = print_phys(state, linkid, &pattr); - if (status != DLADM_STATUS_OK) - goto done; - - if (!state->ls_parseable && !state->ls_printheader) { - print_header(&state->ls_print); - state->ls_printheader = B_TRUE; - } - - dladm_print_output(&state->ls_print, state->ls_parseable, - dladm_print_field, (void *)&pattr); + show_state_t *state = arg; -done: - state->ls_status = status; + state->ls_status = print_phys(state, linkid); return (DLADM_WALK_CONTINUE); } - /* * Print the active topology information. */ @@ -3052,8 +3447,8 @@ print_vlan(show_state_t *state, datalink_id_t linkid, link_fields_buf_t *l) (void) snprintf(l->link_vlan_vid, sizeof (l->link_vlan_vid), "%d", vinfo.dv_vid); - (void) snprintf(l->link_flags, sizeof (l->link_flags), "%c%c---", - vinfo.dv_force ? 'f' : '-', vinfo.dv_implicit ? 'i' : '-'); + (void) snprintf(l->link_flags, sizeof (l->link_flags), "%c----", + vinfo.dv_force ? 
'f' : '-'); done: return (status); @@ -3091,6 +3486,8 @@ do_show_phys(int argc, char *argv[], const char *use) uint32_t flags = DLADM_OPT_ACTIVE; boolean_t p_arg = B_FALSE; boolean_t o_arg = B_FALSE; + boolean_t m_arg = B_FALSE; + boolean_t H_arg = B_FALSE; datalink_id_t linkid = DATALINK_ALL_LINKID; show_state_t state; dladm_status_t status; @@ -3100,10 +3497,15 @@ do_show_phys(int argc, char *argv[], const char *use) char *all_active_fields = "link,media,state,speed,duplex,device"; char *all_inactive_fields = "link,device,media,flags"; + char *all_mac_fields = "link,slot,address,inuse,client"; + char *all_hwgrp_fields = + "link,group,grouptype,rings,clients"; + print_field_t *pf; + int pfmax; bzero(&state, sizeof (state)); opterr = 0; - while ((option = getopt_long(argc, argv, ":pPo:", + while ((option = getopt_long(argc, argv, ":pPo:mH", show_lopts, NULL)) != -1) { switch (option) { case 'p': @@ -3122,6 +3524,12 @@ do_show_phys(int argc, char *argv[], const char *use) o_arg = B_TRUE; fields_str = optarg; break; + case 'm': + m_arg = B_TRUE; + break; + case 'H': + H_arg = B_TRUE; + break; default: die_opterr(optopt, option, use); break; @@ -3131,6 +3539,9 @@ do_show_phys(int argc, char *argv[], const char *use) if (p_arg && !o_arg) die("-p requires -o"); + if (m_arg && H_arg) + die("-m cannot combine with -H"); + if (p_arg && strcasecmp(fields_str, "all") == 0) die("\"-o all\" is invalid with -p"); @@ -3147,16 +3558,42 @@ do_show_phys(int argc, char *argv[], const char *use) state.ls_parseable = p_arg; state.ls_flags = flags; state.ls_donefirst = B_FALSE; + state.ls_mac = m_arg; + state.ls_hwgrp = H_arg; + + if (m_arg && !(flags & DLADM_OPT_ACTIVE)) { + /* + * We can only display the factory MAC addresses of + * active data-links. + */ + die("-m not compatible with -P"); + } if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) { - if (state.ls_flags & DLADM_OPT_ACTIVE) + if (state.ls_mac) + fields_str = all_mac_fields; + else if (state.ls_hwgrp) + fields_str = all_hwgrp_fields; + else if (state.ls_flags & DLADM_OPT_ACTIVE) { fields_str = all_active_fields; - else + } else { fields_str = all_inactive_fields; + } + } + + if (state.ls_mac) { + pf = phys_m_fields; + pfmax = PHYS_M_MAX_FIELDS; + } else if (state.ls_hwgrp) { + pf = phys_h_fields; + pfmax = PHYS_H_MAX_FIELDS; + } else { + pf = phys_fields; + pfmax = PHYS_MAX_FIELDS; } - fields = parse_output_fields(fields_str, phys_fields, - PHYS_MAX_FIELDS, CMD_TYPE_ANY, &nfields); + fields = parse_output_fields(fields_str, pf, + pfmax, CMD_TYPE_ANY, &nfields); if (fields == NULL) { die("invalid field(s) specified"); @@ -3267,6 +3704,661 @@ do_show_vlan(int argc, char *argv[], const char *use) } static void +do_create_vnic(int argc, char *argv[], const char *use) +{ + datalink_id_t linkid, dev_linkid; + char devname[MAXLINKNAMELEN]; + char name[MAXLINKNAMELEN]; + boolean_t l_arg = B_FALSE; + uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST; + char *altroot = NULL; + char option; + char *endp = NULL; + dladm_status_t status; + vnic_mac_addr_type_t mac_addr_type = VNIC_MAC_ADDR_TYPE_AUTO; + uchar_t *mac_addr; + int mac_slot = -1, maclen = 0, mac_prefix_len = 0; + dladm_arg_list_t *proplist = NULL; + uint16_t vid = 0; + + opterr = 0; + while ((option = getopt_long(argc, argv, ":tfR:l:m:n:p:r:v:H", + vnic_lopts, NULL)) != -1) { + switch (option) { + case 't': + flags &= ~DLADM_OPT_PERSIST; + break; + case 'R': + altroot = optarg; + break; + case 'l': + if (strlcpy(devname, optarg, MAXLINKNAMELEN) >= + MAXLINKNAMELEN) + die("link name 
too long"); + l_arg = B_TRUE; + break; + case 'm': + if (strcmp(optarg, "fixed") == 0) { + /* + * A fixed MAC address must be specified + * by its value, not by the keyword 'fixed'. + */ + die("'fixed' is not a valid MAC address"); + } + if (dladm_vnic_str2macaddrtype(optarg, + &mac_addr_type) != DLADM_STATUS_OK) { + mac_addr_type = VNIC_MAC_ADDR_TYPE_FIXED; + /* MAC address specified by value */ + mac_addr = _link_aton(optarg, &maclen); + if (mac_addr == NULL) { + if (maclen == -1) + die("invalid MAC address"); + else + die("out of memory"); + exit(1); + } + } + break; + case 'n': + errno = 0; + mac_slot = (int)strtol(optarg, &endp, 10); + if (errno != 0 || *endp != '\0') + die("invalid slot number"); + break; + case 'p': + if (dladm_parse_link_props(optarg, &proplist, B_FALSE) + != DLADM_STATUS_OK) + die("invalid vnic property"); + break; + case 'r': + mac_addr = _link_aton(optarg, &mac_prefix_len); + if (mac_addr == NULL) { + if (mac_prefix_len == -1) + die("invalid MAC address"); + else + die("out of memory"); + exit(1); + } + break; + case 'v': + vid = (int)strtol(optarg, &endp, 10); + if (errno != 0 || *endp != '\0' || vid == 0) + /* VID of 0 is invalid */ + die("invalid VLAN id"); + break; + case 'f': + flags |= DLADM_OPT_FORCE; + break; + case 'H': + flags |= DLADM_OPT_HWRINGS; + break; + default: + die_opterr(optopt, option, use); + } + } + + /* + * 'f' - force, flag can be specified only with 'v' - vlan. + */ + if ((flags & DLADM_OPT_FORCE) != 0 && vid == 0) + die("-f option can only be used with -v"); + + if (mac_prefix_len != 0 && mac_addr_type != VNIC_MAC_ADDR_TYPE_RANDOM && + mac_addr_type != VNIC_MAC_ADDR_TYPE_FIXED) + usage(); + + /* check required options */ + if (!l_arg) + usage(); + + if (mac_slot != -1 && mac_addr_type != VNIC_MAC_ADDR_TYPE_FACTORY) + usage(); + + /* the VNIC id is the required operand */ + if (optind != (argc - 1)) + usage(); + + if (strlcpy(name, argv[optind], MAXLINKNAMELEN) >= MAXLINKNAMELEN) + die("link name too long '%s'", argv[optind]); + + if (!dladm_valid_linkname(name)) + die("invalid link name '%s'", argv[optind]); + + if (altroot != NULL) + altroot_cmd(altroot, argc, argv); + + if (dladm_name2info(devname, &dev_linkid, NULL, NULL, NULL) != + DLADM_STATUS_OK) + die("invalid link name '%s'", devname); + + status = dladm_vnic_create(name, dev_linkid, mac_addr_type, mac_addr, + maclen, &mac_slot, mac_prefix_len, vid, &linkid, proplist, flags); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "vnic creation over %s failed", devname); + + dladm_free_props(proplist); +} + +static void +do_etherstub_check(const char *name, datalink_id_t linkid, boolean_t etherstub, + uint32_t flags) +{ + boolean_t is_etherstub; + dladm_vnic_attr_t attr; + + if (dladm_vnic_info(linkid, &attr, flags) != DLADM_STATUS_OK) { + /* + * Let the delete continue anyway. + */ + return; + } + is_etherstub = (attr.va_link_id == DATALINK_INVALID_LINKID); + if (is_etherstub != etherstub) { + die("'%s' is not %s", name, + (is_etherstub ? 
"a vnic" : "an etherstub")); + } +} + +static void +do_delete_vnic_common(int argc, char *argv[], const char *use, + boolean_t etherstub) +{ + char option; + uint32_t flags = DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST; + datalink_id_t linkid; + char *altroot = NULL; + dladm_status_t status; + + opterr = 0; + while ((option = getopt_long(argc, argv, ":R:t", lopts, + NULL)) != -1) { + switch (option) { + case 't': + flags &= ~DLADM_OPT_PERSIST; + break; + case 'R': + altroot = optarg; + break; + default: + die_opterr(optopt, option, use); + } + } + + /* get vnic name (required last argument) */ + if (optind != (argc - 1)) + usage(); + + if (altroot != NULL) + altroot_cmd(altroot, argc, argv); + + status = dladm_name2info(argv[optind], &linkid, NULL, NULL, NULL); + if (status != DLADM_STATUS_OK) + die("invalid link name '%s'", argv[optind]); + + if ((flags & DLADM_OPT_ACTIVE) != 0) { + do_etherstub_check(argv[optind], linkid, etherstub, + DLADM_OPT_ACTIVE); + } + if ((flags & DLADM_OPT_PERSIST) != 0) { + do_etherstub_check(argv[optind], linkid, etherstub, + DLADM_OPT_PERSIST); + } + + status = dladm_vnic_delete(linkid, flags); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "vnic deletion failed"); +} + +static void +do_delete_vnic(int argc, char *argv[], const char *use) +{ + do_delete_vnic_common(argc, argv, use, B_FALSE); +} + +/* ARGSUSED */ +static void +do_up_vnic_common(int argc, char *argv[], const char *use, boolean_t vlan) +{ + datalink_id_t linkid = DATALINK_ALL_LINKID; + dladm_status_t status; + char *type; + + type = vlan ? "vlan" : "vnic"; + + /* + * get the id or the name of the vnic/vlan (optional last argument) + */ + if (argc == 2) { + status = dladm_name2info(argv[1], &linkid, NULL, NULL, NULL); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (argc > 2) { + usage(); + } + + if (vlan) + status = dladm_vlan_up(linkid); + else + status = dladm_vnic_up(linkid, 0); + +done: + if (status != DLADM_STATUS_OK) { + if (argc == 2) { + die_dlerr(status, + "could not bring up %s '%s'", type, argv[1]); + } else { + die_dlerr(status, "could not bring %ss up", type); + } + } +} + +static void +do_up_vnic(int argc, char *argv[], const char *use) +{ + do_up_vnic_common(argc, argv, use, B_FALSE); +} + +static void +dump_vnics_head(const char *dev) +{ + if (strlen(dev)) + (void) printf("%s", dev); + + (void) printf("\tipackets rbytes opackets obytes "); + + if (strlen(dev)) + (void) printf("%%ipkts %%opkts\n"); + else + (void) printf("\n"); +} + +static void +dump_vnic_stat(const char *name, datalink_id_t vnic_id, + show_vnic_state_t *state, pktsum_t *vnic_stats, pktsum_t *tot_stats) +{ + pktsum_t diff_stats; + pktsum_t *old_stats = &state->vs_prevstats[vnic_id]; + + dladm_stats_diff(&diff_stats, vnic_stats, old_stats); + + (void) printf("%s", name); + + (void) printf("\t%-10llu", diff_stats.ipackets); + (void) printf("%-12llu", diff_stats.rbytes); + (void) printf("%-10llu", diff_stats.opackets); + (void) printf("%-12llu", diff_stats.obytes); + + if (tot_stats) { + if (tot_stats->ipackets == 0) { + (void) printf("\t-"); + } else { + (void) printf("\t%-6.1f", (double)diff_stats.ipackets/ + (double)tot_stats->ipackets * 100); + } + if (tot_stats->opackets == 0) { + (void) printf("\t-"); + } else { + (void) printf("\t%-6.1f", (double)diff_stats.opackets/ + (double)tot_stats->opackets * 100); + } + } + (void) printf("\n"); + + *old_stats = *vnic_stats; +} + +/* + * Called from the walker dladm_vnic_walk_sys() for each vnic to display + * vnic information or statistics. 
+ */ +static dladm_status_t +print_vnic(show_vnic_state_t *state, datalink_id_t linkid) +{ + dladm_vnic_attr_t attr, *vnic = &attr; + dladm_status_t status; + boolean_t is_etherstub; + char devname[MAXLINKNAMELEN]; + char vnic_name[MAXLINKNAMELEN]; + char mstr[MAXMACADDRLEN * 3]; + vnic_fields_buf_t vbuf; + + if ((status = dladm_vnic_info(linkid, vnic, state->vs_flags)) != + DLADM_STATUS_OK) + return (status); + + is_etherstub = (vnic->va_link_id == DATALINK_INVALID_LINKID); + if (state->vs_etherstub != is_etherstub) { + /* + * Want all etherstub but it's not one, or want + * non-etherstub and it's one. + */ + return (DLADM_STATUS_OK); + } + + if (state->vs_link_id != DATALINK_ALL_LINKID) { + if (state->vs_link_id != vnic->va_link_id) + return (DLADM_STATUS_OK); + } + + if (dladm_datalink_id2info(linkid, NULL, NULL, + NULL, vnic_name, sizeof (vnic_name)) != DLADM_STATUS_OK) + return (DLADM_STATUS_BADARG); + + bzero(devname, sizeof (devname)); + if (!is_etherstub && + dladm_datalink_id2info(vnic->va_link_id, NULL, NULL, + NULL, devname, sizeof (devname)) != DLADM_STATUS_OK) + return (DLADM_STATUS_BADARG); + + state->vs_found = B_TRUE; + if (state->vs_stats) { + /* print vnic statistics */ + pktsum_t vnic_stats; + + if (state->vs_firstonly) { + if (state->vs_donefirst) + return (0); + state->vs_donefirst = B_TRUE; + } + + if (!state->vs_printstats) { + /* + * get vnic statistics and add to the sum for the + * named device. + */ + get_link_stats(vnic_name, &vnic_stats); + dladm_stats_total(&state->vs_totalstats, &vnic_stats, + &state->vs_prevstats[vnic->va_vnic_id]); + } else { + /* get and print vnic statistics */ + get_link_stats(vnic_name, &vnic_stats); + dump_vnic_stat(vnic_name, linkid, state, &vnic_stats, + &state->vs_totalstats); + } + return (DLADM_STATUS_OK); + } else { + (void) snprintf(vbuf.vnic_link, sizeof (vbuf.vnic_link), + "%s", vnic_name); + + if (!is_etherstub) { + + (void) snprintf(vbuf.vnic_over, sizeof (vbuf.vnic_over), + "%s", devname); + (void) snprintf(vbuf.vnic_speed, + sizeof (vbuf.vnic_speed), "%u", + (uint_t)((get_ifspeed(vnic_name, B_TRUE)) + / 1000000ull)); + + switch (vnic->va_mac_addr_type) { + case VNIC_MAC_ADDR_TYPE_FIXED: + case VNIC_MAC_ADDR_TYPE_PRIMARY: + (void) snprintf(vbuf.vnic_macaddrtype, + sizeof (vbuf.vnic_macaddrtype), + gettext("fixed")); + break; + case VNIC_MAC_ADDR_TYPE_RANDOM: + (void) snprintf(vbuf.vnic_macaddrtype, + sizeof (vbuf.vnic_macaddrtype), + gettext("random")); + break; + case VNIC_MAC_ADDR_TYPE_FACTORY: + (void) snprintf(vbuf.vnic_macaddrtype, + sizeof (vbuf.vnic_macaddrtype), + gettext("factory, slot %d"), + vnic->va_mac_slot); + break; + } + + if (strlen(vbuf.vnic_macaddrtype) > 0) { + (void) snprintf(vbuf.vnic_macaddr, + sizeof (vbuf.vnic_macaddr), "%s", + dladm_aggr_macaddr2str(vnic->va_mac_addr, + mstr)); + } + + (void) snprintf(vbuf.vnic_vid, sizeof (vbuf.vnic_vid), + "%d", vnic->va_vid); + } + + if (!state->vs_parseable && !state->vs_printheader) { + print_header(&state->vs_print); + state->vs_printheader = B_TRUE; + } + + dladm_print_output(&state->vs_print, state->vs_parseable, + dladm_print_field, (void *)&vbuf); + + return (DLADM_STATUS_OK); + } +} + +static int +show_vnic(datalink_id_t linkid, void *arg) +{ + show_vnic_state_t *state = arg; + + state->vs_status = print_vnic(state, linkid); + return (DLADM_WALK_CONTINUE); +} + +static void +do_show_vnic_common(int argc, char *argv[], const char *use, + boolean_t etherstub) +{ + int option; + boolean_t s_arg = B_FALSE; + boolean_t i_arg = B_FALSE; + boolean_t l_arg = 
B_FALSE; + char *endp = NULL; + uint32_t interval = 0, flags = DLADM_OPT_ACTIVE; + datalink_id_t linkid = DATALINK_ALL_LINKID; + datalink_id_t dev_linkid = DATALINK_ALL_LINKID; + show_vnic_state_t state; + dladm_status_t status; + boolean_t o_arg = B_FALSE; + char *fields_str = NULL; + print_field_t **fields; + print_field_t *pf; + int pfmax; + uint_t nfields; + char *all_fields = + "link,over,speed,macaddr,macaddrtype,vid"; + char *all_e_fields = + "link"; + + bzero(&state, sizeof (state)); + opterr = 0; + while ((option = getopt_long(argc, argv, ":pPl:si:o:", lopts, + NULL)) != -1) { + switch (option) { + case 'p': + state.vs_parseable = B_TRUE; + break; + case 'P': + flags = DLADM_OPT_PERSIST; + break; + case 'l': + if (etherstub) + die("option not supported for this command"); + + if (strlcpy(state.vs_link, optarg, MAXLINKNAMELEN) >= + MAXLINKNAMELEN) + die("link name too long"); + + l_arg = B_TRUE; + break; + case 's': + if (s_arg) { + die("the option -s cannot be specified " + "more than once"); + } + s_arg = B_TRUE; + break; + case 'i': + if (i_arg) { + die("the option -i cannot be specified " + "more than once"); + } + i_arg = B_TRUE; + interval = (int)strtol(optarg, &endp, 10); + if (errno != 0 || interval == 0 || *endp != '\0') + die("invalid interval value '%s'", optarg); + break; + case 'o': + o_arg = B_TRUE; + fields_str = optarg; + break; + default: + die_opterr(optopt, option, use); + } + } + + if (i_arg && !s_arg) + die("the option -i can be used only with -s"); + + /* get vnic ID (optional last argument) */ + if (optind == (argc - 1)) { + status = dladm_name2info(argv[optind], &linkid, NULL, + NULL, NULL); + if (status != DLADM_STATUS_OK) { + die_dlerr(status, "invalid vnic name '%s'", + argv[optind]); + } + (void) strlcpy(state.vs_vnic, argv[optind], MAXLINKNAMELEN); + } else if (optind != argc) { + usage(); + } + + if (l_arg) { + status = dladm_name2info(state.vs_link, &dev_linkid, NULL, + NULL, NULL); + if (status != DLADM_STATUS_OK) { + die_dlerr(status, "invalid link name '%s'", + state.vs_link); + } + } + + state.vs_vnic_id = linkid; + state.vs_link_id = dev_linkid; + state.vs_etherstub = etherstub; + state.vs_found = B_FALSE; + state.vs_flags = flags; + + if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) { + if (etherstub) + fields_str = all_e_fields; + else + fields_str = all_fields; + } + + pf = vnic_fields; + pfmax = VNIC_MAX_FIELDS; + + fields = parse_output_fields(fields_str, pf, pfmax, CMD_TYPE_ANY, + &nfields); + + if (fields == NULL) { + die("invalid field(s) specified"); + return; + } + + state.vs_print.ps_fields = fields; + state.vs_print.ps_nfields = nfields; + + if (s_arg) { + /* Display vnic statistics */ + vnic_stats(&state, interval); + return; + } + + /* Display vnic information */ + state.vs_donefirst = B_FALSE; + + if (linkid == DATALINK_ALL_LINKID) { + (void) dladm_walk_datalink_id(show_vnic, &state, + DATALINK_CLASS_VNIC | DATALINK_CLASS_ETHERSTUB, + DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE); + } else { + (void) show_vnic(linkid, &state); + if (state.vs_status != DLADM_STATUS_OK) { + die_dlerr(state.vs_status, "failed to show vnic '%s'", + state.vs_vnic); + } + } +} + +static void +do_show_vnic(int argc, char *argv[], const char *use) +{ + do_show_vnic_common(argc, argv, use, B_FALSE); +} + +static void +do_create_etherstub(int argc, char *argv[], const char *use) +{ + uint32_t flags; + char *altroot = NULL; + char option; + dladm_status_t status; + char name[MAXLINKNAMELEN]; + uchar_t mac_addr[ETHERADDRL]; + + name[0] = '\0'; + 
bzero(mac_addr, sizeof (mac_addr)); + flags = DLADM_OPT_ANCHOR | DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST; + + opterr = 0; + while ((option = getopt_long(argc, argv, "tR:", + etherstub_lopts, NULL)) != -1) { + switch (option) { + case 't': + flags &= ~DLADM_OPT_PERSIST; + break; + case 'R': + altroot = optarg; + break; + default: + die_opterr(optopt, option, use); + } + } + + /* the etherstub id is the required operand */ + if (optind != (argc - 1)) + usage(); + + if (strlcpy(name, argv[optind], MAXLINKNAMELEN) >= MAXLINKNAMELEN) + die("link name too long '%s'", argv[optind]); + + if (!dladm_valid_linkname(name)) + die("invalid link name '%s'", argv[optind]); + + if (altroot != NULL) + altroot_cmd(altroot, argc, argv); + + status = dladm_vnic_create(name, DATALINK_INVALID_LINKID, + VNIC_MAC_ADDR_TYPE_AUTO, mac_addr, ETHERADDRL, NULL, 0, 0, NULL, + NULL, flags); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "etherstub creation failed"); + + +} + +static void +do_delete_etherstub(int argc, char *argv[], const char *use) +{ + do_delete_vnic_common(argc, argv, use, B_TRUE); +} + +/* ARGSUSED */ +static void +do_show_etherstub(int argc, char *argv[], const char *use) +{ + do_show_vnic_common(argc, argv, use, B_TRUE); +} + +static void link_stats(datalink_id_t linkid, uint_t interval, char *fields_str, show_state_t *state) { @@ -3333,147 +4425,134 @@ aggr_stats(datalink_id_t linkid, show_grp_state_t *state, uint_t interval) } } +/* ARGSUSED */ static void -dev_stats(const char *dev, uint32_t interval, char *fields_str, - show_state_t *state) +vnic_stats(show_vnic_state_t *sp, uint32_t interval) { - print_field_t **fields; - uint_t nfields; - - fields = parse_output_fields(fields_str, devs_fields, DEVS_MAX_FIELDS, - CMD_TYPE_ANY, &nfields); + show_vnic_state_t state; + boolean_t specific_link, specific_dev; - if (fields == NULL) { - die("invalid field(s) specified"); - return; - } - - state->ls_print.ps_fields = fields; - state->ls_print.ps_nfields = nfields; + /* Display vnic statistics */ + dump_vnics_head(sp->vs_link); + bzero(&state, sizeof (state)); + state.vs_stats = B_TRUE; + state.vs_vnic_id = sp->vs_vnic_id; + state.vs_link_id = sp->vs_link_id; /* - * If an interval is specified, continuously show the stats - * only for the first MAC port. + * If an interval is specified, and a vnic ID is not specified, + * continuously show the stats only for the first vnic. 
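The loop that follows is the standard interval-polling shape shared by all the stats subcommands: take a sample, print the delta against the previous sample, remember the sample, sleep. A toy but runnable version (sample() stands in for the kstat reads; the real loop runs until interrupted):

#include <stdio.h>
#include <unistd.h>

typedef struct {
	unsigned long long	ipackets;
	unsigned long long	opackets;
} sum_sketch_t;

/* Stand-in for reading the link's kstats. */
static void
sample(sum_sketch_t *s)
{
	s->ipackets += 100;
	s->opackets += 42;
}

int
main(void)
{
	sum_sketch_t	prev = { 0, 0 }, now = { 0, 0 };
	unsigned int	interval = 1;	/* seconds, as given by -i */
	int		round;

	for (round = 0; round < 3; round++) {
		sample(&now);
		/* Per-interval deltas, not running totals. */
		(void) printf("%llu %llu\n",
		    now.ipackets - prev.ipackets,
		    now.opackets - prev.opackets);
		prev = now;
		(void) sleep(interval);
	}
	return (0);
}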
*/ - state->ls_firstonly = (interval != 0); + specific_link = (sp->vs_vnic_id != DATALINK_ALL_LINKID); + specific_dev = (sp->vs_link_id != DATALINK_ALL_LINKID); for (;;) { + /* Get stats for each vnic */ + state.vs_found = B_FALSE; + state.vs_donefirst = B_FALSE; + state.vs_printstats = B_FALSE; + state.vs_flags = DLADM_OPT_ACTIVE; + + if (!specific_link) { + (void) dladm_walk_datalink_id(show_vnic, &state, + DATALINK_CLASS_VNIC, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } else { + (void) show_vnic(sp->vs_vnic_id, &state); + if (state.vs_status != DLADM_STATUS_OK) { + die_dlerr(state.vs_status, + "failed to show vnic '%s'", sp->vs_vnic); + } + } - if (!state->ls_parseable) - print_header(&state->ls_print); - state->ls_donefirst = B_FALSE; + if (specific_link && !state.vs_found) + die("non-existent vnic '%s'", sp->vs_vnic); + if (specific_dev && !state.vs_found) + die("device %s has no vnics", sp->vs_link); + + /* Show totals */ + if ((specific_link | specific_dev) && !interval) { + (void) printf("Total"); + (void) printf("\t%-10llu", + state.vs_totalstats.ipackets); + (void) printf("%-12llu", + state.vs_totalstats.rbytes); + (void) printf("%-10llu", + state.vs_totalstats.opackets); + (void) printf("%-12llu\n", + state.vs_totalstats.obytes); + } - if (dev == NULL) - (void) dladm_mac_walk(show_dev_stats, state); - else - (void) show_dev_stats(dev, state); + /* Show stats for each vnic */ + state.vs_donefirst = B_FALSE; + state.vs_printstats = B_TRUE; + + if (!specific_link) { + (void) dladm_walk_datalink_id(show_vnic, &state, + DATALINK_CLASS_VNIC, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } else { + (void) show_vnic(sp->vs_vnic_id, &state); + if (state.vs_status != DLADM_STATUS_OK) { + die_dlerr(state.vs_status, + "failed to show vnic '%s'", sp->vs_vnic); + } + } if (interval == 0) break; (void) sleep(interval); } - - if (dev != NULL && state->ls_status != DLADM_STATUS_OK) - die_dlerr(state->ls_status, "cannot show device '%s'", dev); } -/* accumulate stats (s1 += (s2 - s3)) */ static void -stats_total(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3) -{ - s1->ipackets += (s2->ipackets - s3->ipackets); - s1->opackets += (s2->opackets - s3->opackets); - s1->rbytes += (s2->rbytes - s3->rbytes); - s1->obytes += (s2->obytes - s3->obytes); - s1->ierrors += (s2->ierrors - s3->ierrors); - s1->oerrors += (s2->oerrors - s3->oerrors); -} - -/* compute stats differences (s1 = s2 - s3) */ -static void -stats_diff(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3) -{ - s1->ipackets = s2->ipackets - s3->ipackets; - s1->opackets = s2->opackets - s3->opackets; - s1->rbytes = s2->rbytes - s3->rbytes; - s1->obytes = s2->obytes - s3->obytes; - s1->ierrors = s2->ierrors - s3->ierrors; - s1->oerrors = s2->oerrors - s3->oerrors; -} - -static void -get_stats(char *module, int instance, const char *name, pktsum_t *stats) +get_mac_stats(const char *dev, pktsum_t *stats) { kstat_ctl_t *kcp; kstat_t *ksp; + char module[DLPI_LINKNAME_MAX]; + uint_t instance; - if ((kcp = kstat_open()) == NULL) { - warn("kstat open operation failed"); + + bzero(stats, sizeof (*stats)); + + if (dlpi_parselink(dev, module, &instance) != DLPI_SUCCESS) return; - } - if ((ksp = kstat_lookup(kcp, module, instance, (char *)name)) == NULL) { - /* - * The kstat query could fail if the underlying MAC - * driver was already detached. 
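Both the removed get_stats() and the new get_mac_stats()/get_link_stats() below are thin wrappers over the same libkstat sequence: open, lookup, read, close. For reference, the bare sequence looks like this; the module, instance, and statistic names are examples only, and error handling is trimmed to the minimum (link with -lkstat):

#include <stdio.h>
#include <kstat.h>

int
main(void)
{
	kstat_ctl_t	*kcp;
	kstat_t		*ksp;
	kstat_named_t	*kn;

	if ((kcp = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}
	/* e1000g instance 0's "mac" kstat; substitute any live NIC. */
	if ((ksp = kstat_lookup(kcp, "e1000g", 0, "mac")) != NULL &&
	    kstat_read(kcp, ksp, NULL) != -1 &&
	    (kn = kstat_data_lookup(ksp, "ipackets64")) != NULL) {
		(void) printf("ipackets64 = %llu\n",
		    (unsigned long long)kn->value.ui64);
	}
	(void) kstat_close(kcp);
	return (0);
}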
- */ - (void) kstat_close(kcp); + if ((kcp = kstat_open()) == NULL) { + warn("kstat open operation failed"); return; } - if (kstat_read(kcp, ksp, NULL) == -1) - goto bail; - - if (dladm_kstat_value(ksp, "ipackets64", KSTAT_DATA_UINT64, - &stats->ipackets) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "opackets64", KSTAT_DATA_UINT64, - &stats->opackets) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "rbytes64", KSTAT_DATA_UINT64, - &stats->rbytes) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "obytes64", KSTAT_DATA_UINT64, - &stats->obytes) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "ierrors", KSTAT_DATA_UINT32, - &stats->ierrors) < 0) - goto bail; - - if (dladm_kstat_value(ksp, "oerrors", KSTAT_DATA_UINT32, - &stats->oerrors) < 0) - goto bail; + ksp = dladm_kstat_lookup(kcp, module, instance, "mac", NULL); + if (ksp != NULL) + dladm_get_stats(kcp, ksp, stats); -bail: (void) kstat_close(kcp); - return; } static void -get_mac_stats(const char *dev, pktsum_t *stats) +get_link_stats(const char *link, pktsum_t *stats) { - char module[DLPI_LINKNAME_MAX]; - uint_t instance; + kstat_ctl_t *kcp; + kstat_t *ksp; bzero(stats, sizeof (*stats)); - if (dlpi_parselink(dev, module, &instance) != DLPI_SUCCESS) + + if ((kcp = kstat_open()) == NULL) { + warn("kstat_open operation failed"); return; + } - get_stats(module, instance, "mac", stats); -} + ksp = dladm_kstat_lookup(kcp, "link", 0, link, NULL); -static void -get_link_stats(const char *link, pktsum_t *stats) -{ - bzero(stats, sizeof (*stats)); - get_stats("link", 0, link, stats); + if (ksp != NULL) + dladm_get_stats(kcp, ksp, stats); + + (void) kstat_close(kcp); } static int @@ -3547,7 +4626,7 @@ get_linkstate(const char *name, boolean_t islink, char *buf) if (get_one_kstat(name, "link_state", KSTAT_DATA_UINT32, &linkstate, islink) != 0) { - (void) strlcpy(buf, "unknown", DLADM_STRSIZE); + (void) strlcpy(buf, "?", DLADM_STRSIZE); return (buf); } return (dladm_linkstate2str(linkstate, buf)); @@ -4271,92 +5350,6 @@ do_disconnect_wifi(int argc, char **argv, const char *use) die_dlerr(status, "cannot disconnect"); } - -static void -free_props(prop_list_t *list) -{ - if (list != NULL) { - free(list->pl_buf); - free(list); - } -} - -static int -parse_props(char *str, prop_list_t **listp, boolean_t novalues) -{ - prop_list_t *list; - prop_info_t *pip; - char *buf, *curr; - int len, i; - - list = malloc(sizeof (prop_list_t)); - if (list == NULL) - return (-1); - - list->pl_count = 0; - list->pl_buf = buf = strdup(str); - if (buf == NULL) - goto fail; - - /* - * buf is a string of form [<propname>=<value>][,<propname>=<value>]+ - * where each <value> string itself could be a comma-separated array. - * The loop below will count the number of propname assignments - * in pl_count; for each property, there is a pip entry with - * pi_name == <propname>, pi_count == # of elements in <value> array. - * pi_val[] contains the actual values. - * - * This could really be a combination of calls to - * strtok (token delimiter is ",") and strchr (chr '=') - * with appropriate null/string-bound-checks. 
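The aside in the removed comment is worth making concrete. Here is the strtok_r()/strchr() version it alludes to, with the caveat that motivated the hand-rolled loop: since ',' separates both properties and elements of a multi-valued property, this simple form only handles single-valued properties (an illustrative sketch, not the dladm_parse_link_props() parser that replaces this code):

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char	buf[] = "mtu=1500,zone=web1";
	char	*lasts, *pair, *eq;

	for (pair = strtok_r(buf, ",", &lasts); pair != NULL;
	    pair = strtok_r(NULL, ",", &lasts)) {
		if ((eq = strchr(pair, '=')) == NULL ||
		    eq == pair || eq[1] == '\0') {
			(void) fprintf(stderr, "malformed: %s\n", pair);
			return (1);
		}
		*eq = '\0';	/* split the pair at '=' in place */
		(void) printf("prop '%s' -> value '%s'\n", pair, eq + 1);
	}
	return (0);
}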
- */ - - curr = buf; - len = strlen(buf); - pip = NULL; - for (i = 0; i < len; i++) { - char c = buf[i]; - boolean_t match = (c == '=' || c == ','); - - if (!match && i != len - 1) - continue; - - if (match) { - buf[i] = '\0'; - if (*curr == '\0') - goto fail; - } - - if (pip != NULL && c != '=') { - if (pip->pi_count > DLADM_MAX_PROP_VALCNT) - goto fail; - - if (novalues) - goto fail; - - pip->pi_val[pip->pi_count] = curr; - pip->pi_count++; - } else { - if (list->pl_count > MAX_PROPS) - goto fail; - - pip = &list->pl_info[list->pl_count]; - pip->pi_name = curr; - pip->pi_count = 0; - list->pl_count++; - if (c == ',') - pip = NULL; - } - curr = buf + i + 1; - } - *listp = list; - return (0); - -fail: - free_props(list); - return (-1); -} - static void print_linkprop(datalink_id_t linkid, show_linkprop_state_t *statep, const char *propname, dladm_prop_type_t type, @@ -4365,7 +5358,7 @@ print_linkprop(datalink_id_t linkid, show_linkprop_state_t *statep, int i; char *ptr, *lim; char buf[DLADM_STRSIZE]; - char *unknown = "?", *notsup = ""; + char *unknown = "--", *notsup = ""; char **propvals = statep->ls_propvals; uint_t valcnt = DLADM_MAX_PROP_VALCNT; dladm_status_t status; @@ -4545,7 +5538,7 @@ static void do_show_linkprop(int argc, char **argv, const char *use) { int option; - prop_list_t *proplist = NULL; + dladm_arg_list_t *proplist = NULL; datalink_id_t linkid = DATALINK_ALL_LINKID; show_linkprop_state_t state; uint32_t flags = DLADM_OPT_ACTIVE; @@ -4570,7 +5563,8 @@ do_show_linkprop(int argc, char **argv, const char *use) prop_longopts, NULL)) != -1) { switch (option) { case 'p': - if (parse_props(optarg, &proplist, B_TRUE) < 0) + if (dladm_parse_link_props(optarg, &proplist, B_TRUE) + != DLADM_STATUS_OK) die("invalid link properties specified"); break; case 'c': @@ -4628,7 +5622,7 @@ do_show_linkprop(int argc, char **argv, const char *use) } else { (void) show_linkprop_onelink(linkid, &state); } - free_props(proplist); + dladm_free_props(proplist); if (state.ls_retstatus != DLADM_STATUS_OK) exit(EXIT_FAILURE); @@ -4640,7 +5634,7 @@ show_linkprop_onelink(datalink_id_t linkid, void *arg) int i; char *buf; uint32_t flags; - prop_list_t *proplist = NULL; + dladm_arg_list_t *proplist = NULL; show_linkprop_state_t *statep = arg; dlpi_handle_t dh = NULL; @@ -4689,9 +5683,9 @@ show_linkprop_onelink(datalink_id_t linkid, void *arg) (sizeof (char *) + DLADM_PROP_VAL_MAX) * DLADM_MAX_PROP_VALCNT; if (proplist != NULL) { - for (i = 0; i < proplist->pl_count; i++) { + for (i = 0; i < proplist->al_count; i++) { (void) show_linkprop(linkid, - proplist->pl_info[i].pi_name, statep); + proplist->al_info[i].ai_name, statep); } } else { (void) dladm_walk_linkprop(linkid, statep, show_linkprop); @@ -4712,30 +5706,58 @@ set_linkprop_persist(datalink_id_t linkid, const char *prop_name, DLADM_OPT_PERSIST); if (status != DLADM_STATUS_OK) { - warn_dlerr(status, "cannot persistently %s link property", - reset ? "reset" : "set"); + warn_dlerr(status, "cannot persistently %s link property '%s'", + reset ? 
"reset" : "set", prop_name); } return (status); } +static int +reset_one_linkprop(datalink_id_t linkid, const char *propname, void *arg) +{ + set_linkprop_state_t *statep = arg; + dladm_status_t status; + + status = dladm_set_linkprop(linkid, propname, NULL, 0, + DLADM_OPT_ACTIVE); + if (status != DLADM_STATUS_OK) { + warn_dlerr(status, "cannot reset link property '%s' on '%s'", + propname, statep->ls_name); + } + if (!statep->ls_temp) { + dladm_status_t s; + + s = set_linkprop_persist(linkid, propname, NULL, 0, + statep->ls_reset); + if (s != DLADM_STATUS_OK) + status = s; + } + if (status != DLADM_STATUS_OK) + statep->ls_status = status; + + return (DLADM_WALK_CONTINUE); +} + static void set_linkprop(int argc, char **argv, boolean_t reset, const char *use) { - int i, option; - char errmsg[DLADM_STRSIZE]; - char *altroot = NULL; - datalink_id_t linkid; - prop_list_t *proplist = NULL; - boolean_t temp = B_FALSE; - dladm_status_t status = DLADM_STATUS_OK; + int i, option; + char errmsg[DLADM_STRSIZE]; + char *altroot = NULL; + datalink_id_t linkid; + boolean_t temp = B_FALSE; + dladm_status_t status = DLADM_STATUS_OK; + dladm_arg_list_t *proplist = NULL; opterr = 0; while ((option = getopt_long(argc, argv, ":p:R:t", prop_longopts, NULL)) != -1) { switch (option) { case 'p': - if (parse_props(optarg, &proplist, reset) < 0) + if (dladm_parse_link_props(optarg, &proplist, reset) != + DLADM_STATUS_OK) { die("invalid link properties specified"); + } break; case 't': temp = B_TRUE; @@ -4757,7 +5779,7 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) die("link property must be specified"); if (altroot != NULL) { - free_props(proplist); + dladm_free_props(proplist); altroot_cmd(altroot, argc, argv); } @@ -4766,24 +5788,21 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) die_dlerr(status, "link %s is not valid", argv[optind]); if (proplist == NULL) { - status = dladm_set_linkprop(linkid, NULL, NULL, 0, - DLADM_OPT_ACTIVE); - if (status != DLADM_STATUS_OK) { - warn_dlerr(status, "cannot reset link property " - "on '%s'", argv[optind]); - } - if (!temp) { - dladm_status_t s; + set_linkprop_state_t state; - s = set_linkprop_persist(linkid, NULL, NULL, 0, reset); - if (s != DLADM_STATUS_OK) - status = s; - } + state.ls_name = argv[optind]; + state.ls_reset = reset; + state.ls_temp = temp; + state.ls_status = DLADM_STATUS_OK; + + (void) dladm_walk_linkprop(linkid, &state, reset_one_linkprop); + + status = state.ls_status; goto done; } - for (i = 0; i < proplist->pl_count; i++) { - prop_info_t *pip = &proplist->pl_info[i]; + for (i = 0; i < proplist->al_count; i++) { + dladm_arg_info_t *aip = &proplist->al_info[i]; char **val; uint_t count; dladm_status_t s; @@ -4792,21 +5811,21 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) val = NULL; count = 0; } else { - val = pip->pi_val; - count = pip->pi_count; + val = aip->ai_val; + count = aip->ai_count; if (count == 0) { warn("no value specified for '%s'", - pip->pi_name); + aip->ai_name); status = DLADM_STATUS_BADARG; continue; } } - s = dladm_set_linkprop(linkid, pip->pi_name, val, count, + s = dladm_set_linkprop(linkid, aip->ai_name, val, count, DLADM_OPT_ACTIVE); if (s == DLADM_STATUS_OK) { if (!temp) { s = set_linkprop_persist(linkid, - pip->pi_name, val, count, reset); + aip->ai_name, val, count, reset); if (s != DLADM_STATUS_OK) status = s; } @@ -4815,7 +5834,7 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) status = s; switch (s) { case 
DLADM_STATUS_NOTFOUND: - warn("invalid link property '%s'", pip->pi_name); + warn("invalid link property '%s'", aip->ai_name); break; case DLADM_STATUS_BADVAL: { int j; @@ -4837,12 +5856,12 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) j * DLADM_PROP_VAL_MAX; } s = dladm_get_linkprop(linkid, - DLADM_PROP_VAL_MODIFIABLE, pip->pi_name, propvals, + DLADM_PROP_VAL_MODIFIABLE, aip->ai_name, propvals, &valcnt); if (s != DLADM_STATUS_OK) { warn_dlerr(status, "cannot set link property " - "'%s' on '%s'", pip->pi_name, argv[optind]); + "'%s' on '%s'", aip->ai_name, argv[optind]); free(propvals); break; } @@ -4859,7 +5878,7 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) if (ptr > errmsg) { *(ptr - 1) = '\0'; warn("link property '%s' must be one of: %s", - pip->pi_name, errmsg); + aip->ai_name, errmsg); } else warn("invalid link property '%s'", *val); free(propvals); @@ -4868,16 +5887,16 @@ set_linkprop(int argc, char **argv, boolean_t reset, const char *use) default: if (reset) { warn_dlerr(status, "cannot reset link property " - "'%s' on '%s'", pip->pi_name, argv[optind]); + "'%s' on '%s'", aip->ai_name, argv[optind]); } else { warn_dlerr(status, "cannot set link property " - "'%s' on '%s'", pip->pi_name, argv[optind]); + "'%s' on '%s'", aip->ai_name, argv[optind]); } break; } } done: - free_props(proplist); + dladm_free_props(proplist); if (status != DLADM_STATUS_OK) exit(1); } @@ -5414,7 +6433,7 @@ i_dladm_init_linkprop(datalink_id_t linkid, void *arg) } /*ARGSUSED*/ -static void +void do_init_linkprop(int argc, char **argv, const char *use) { int option; @@ -5890,6 +6909,7 @@ show_ether_xprop(datalink_id_t linkid, void *arg) (void) snprintf(ebuf.eth_ptype, sizeof (ebuf.eth_ptype), "%s", "peeradv"); (void) snprintf(ebuf.eth_state, sizeof (ebuf.eth_state), ""); + (void) dladm_get_single_mac_stat(linkid, "lp_cap_autoneg", KSTAT_DATA_UINT32, &autoneg); (void) snprintf(ebuf.eth_autoneg, sizeof (ebuf.eth_autoneg), diff --git a/usr/src/cmd/dladm/dladm.xcl b/usr/src/cmd/dladm/dladm.xcl index b849b22f79..09192c7f4d 100644 --- a/usr/src/cmd/dladm/dladm.xcl +++ b/usr/src/cmd/dladm/dladm.xcl @@ -21,244 +21,343 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" # -msgid " %-9s\t%s" -msgid " %s" -msgid " Total" -msgid " address-type=%s\n" -msgid " address=%s" -msgid " device=%s address=%s" -msgid " duplex=%s" -msgid " duplex=%s\n" -msgid " lacp-mode=%s" -msgid " lacp-timer=%s\n" -msgid " link=%s" -msgid " policy=%s" -msgid " port=%s" -msgid " speed=%u" + msgid "" -msgid "%%ipkts %%opkts\n" -msgid "%-*s " +msgid "\t%-10llu" +msgid "\t%-6.1f" +msgid "\t-" +msgid "\tipackets rbytes opackets obytes " +msgid "\n" +msgid " " +msgid " " +msgid " %-18s" +msgid " MACADDRESS" +msgid " %-18s" +msgid " MACADDRTYPE" +msgid " dev=%s" +msgid " mac_addr=%s" +msgid " speed=%u" +msgid " vid=%d\n" +msgid "%%ipkts %%opkts\n" msgid "%-*s" msgid "%-10llu" msgid "%-12llu" msgid "%-12llu\n" -msgid "%-14s " -msgid "%-15s " -msgid "%-15s %-14s %-14s %-30s \n" -msgid "%-20s %-20s " -msgid "%-30s " -msgid "%-30s" -msgid "%-8u" -msgid "%-8u\n" -msgid "%s type=%s mtu=%d device=%s\n" -msgid "%s type=%s mtu=%d key=%u\n" -msgid "%s type=legacy mtu=%d device=%s\n" +msgid "%-12s" +msgid "%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n" +msgid "%-12s%-12s" +msgid "%-12s%-12s%10s%-20s%-19s%-6s\n" +msgid "%-12s%8d %-12s%-20s %6d\n" +msgid "%-12s%8s %-12s%-20s %6s\n" +msgid "%-6.1f" +msgid "%-6d\n" +msgid "%-8llu" +msgid "%-8llu\n" +msgid "%5u Mbps" +msgid "%c----" +msgid "%d" +msgid "%d%c-%c" +msgid "%llu" msgid "%s" +msgid "%s\n" +msgid "%s " msgid "%s," msgid "%s: " -msgid "%s=\"%s\" " msgid "%s=\"%s\"" -msgid "%s\n" +msgid "%sfdx" +msgid "%shdx" +msgid "%u" +msgid "%uMb" msgid "," -msgid "--" -msgid "--," -msgid "/dev/%s" +msgid "-R" +msgid "-f" +msgid "-fh" +msgid "-h" +msgid "/" +msgid "/%s" +msgid "/%s/%s" +msgid "/sbin/dladm " msgid "0x" -msgid "0x%-30s" -msgid ": %s (%s)\n" +msgid "100M" +msgid "10M" +msgid "1G" msgid ": %s\n" -msgid ":Lpsi:" +msgid ":L:l:P:R:tu:T:" +msgid ":LpPxsi:o:" +msgid ":R:" msgid ":R:t" msgid ":a" -msgid ":d:R:t" -msgid ":d:l:P:R:tu:T:" +msgid ":d:l:L:P:R:tfu:T:" +msgid ":d:l:R:t" +msgid ":d:l:R:tf" msgid ":e:i:a:m:b:s:k:T:c" msgid ":f:c:R:t" -msgid ":l:P:R:tu:T:" msgid ":o:p" msgid ":p:R:t" -msgid ":p:cP" -msgid ":pPd" -msgid ":psi:" +msgid ":p:cPo:" +msgid ":pPo:" +msgid ":pPo:m" +msgid ":pPsSi:o:" +msgid ":psi:o:" +msgid ":tl:v:p:" msgid "?" 
-msgid "100M" +msgid "ADDRESS" +msgid "ADDRPOLICY" msgid "ADT_dladm_create_secobj" msgid "ADT_dladm_delete_secobj" +msgid "AGGREGATABLE" msgid "AUTH" msgid "AUTO" msgid "BSSID/IBSSID" msgid "BSSTYPE" msgid "CLASS" +msgid "CLIENT" +msgid "COLL" msgid "DEFAULT" +msgid "DEFAULTED" +msgid "DEVICE" +msgid "DIST" msgid "DUPLEX" msgid "ESSID" +msgid "EXPIRED" +msgid "FLAGS" +msgid "IERRORS" +msgid "INUSE" +msgid "IPACKETS" +msgid "IPKTDIST" +msgid "LACPACTIVITY" +msgid "LACPTIMER" msgid "LINK" +msgid "LINK\n" +msgid "LINKID" +msgid "MEDIA" msgid "MODE" +msgid "MTU" msgid "Mb" +msgid "NAME" msgid "OBJECT" -msgid "OBJECT=\"%s\" CLASS=\"%s\" " +msgid "OBYTES" +msgid "OERRORS" +msgid "OPACKETS" +msgid "OPKTDIST" +msgid "OVER" msgid "PAUSE" +msgid "POLICY" +msgid "PORT" +msgid "PORTSTATE" msgid "POSSIBLE" msgid "PROPERTY" -msgid "PROPERTY=\"%s\" " +msgid "PTYPE" +msgid "RBYTES" msgid "REM_FAULT" msgid "SEC" +msgid "SLOT" msgid "SPEED" msgid "SPEED-DUPLEX" +msgid "STATE" msgid "STATUS" msgid "STRENGTH" +msgid "SYNC" +msgid "Total" msgid "VALUE" -msgid "VALUE=\"0x%s\"" -msgid "\n" -msgid "\t %5uMb" -msgid "\t%-10llu" -msgid "\t%-6.1f" -msgid "\t%s" -msgid "\t%s\n" -msgid "\t-" -msgid "\t\t%-10llu" -msgid "\t\tipackets rbytes ierrors " -msgid "\tipackets rbytes opackets obytes " -msgid "active" +msgid "VID" +msgid "a+" +msgid "add-aggr" +msgid "address" +msgid "addrpolicy" +msgid "adt_alloc_event (%s): %s" +msgid "adt_start_session: %s" +msgid "adv" msgid "adv_cap_10" msgid "adv_cap_100" msgid "adv_cap_1000" msgid "adv_cap_asmpause" msgid "adv_cap_autoneg" msgid "adv_cap_pause" -msgid "add-aggr" -msgid "adt_alloc_event (%s): %s" -msgid "adt_start_session: %s" -msgid "aggr key=%d" -msgid "aggr" +msgid "adv_rem_fault" +msgid "aggr%d" +msgid "aggregatable" msgid "all" msgid "all-links" -msgid "attached" msgid "auth" msgid "auto" +msgid "bi" msgid "bssid" msgid "bsstype" -msgid "cap_pause" +msgid "bw-limit" msgid "cap_10" +msgid "cap_100" msgid "cap_1000" +msgid "cap_asmpause" msgid "cap_autoneg" +msgid "cap_pause" +msgid "cap_rem_fault" msgid "capable" +msgid "class" +msgid "client" +msgid "coll" msgid "connect-wifi" +msgid "continuous" +msgid "cpus" msgid "create-aggr" +msgid "create-etherstub" msgid "create-ibss" msgid "create-secobj" msgid "create-vlan" +msgid "create-vnic" msgid "current" +msgid "default" +msgid "defaulted" msgid "delete-aggr" +msgid "delete-etherstub" msgid "delete-phys" msgid "delete-secobj" msgid "delete-vlan" -msgid "dev key=%d" +msgid "delete-vnic" msgid "dev" +msgid "device" msgid "disconnect-wifi" -msgid "down" -msgid "down-aggr" +msgid "dist" +msgid "down-vnic" msgid "duplex" msgid "essid" +msgid "expired" +msgid "extended" msgid "fault" msgid "file" +msgid "fixed" +msgid "fixed (%s)" +msgid "flags" +msgid "forcible" msgid "forever" -msgid "full" -msgid "half" msgid "ibssid" msgid "ierrors" msgid "ifspeed" msgid "init-linkprop" +msgid "init-phys" msgid "init-secobj" msgid "interval" -msgid "invalid input" -msgid "ipackets64" +msgid "inuse" +msgid "ipackets" +msgid "ipktdist" msgid "key" msgid "lacp" msgid "lacp-mode" msgid "lacp-timer" +msgid "lacpactivity" +msgid "lacptimer" msgid "link" msgid "link,class,mtu,state,over" +msgid "link,class,over" msgid "link,device,media,flags" msgid "link,essid,bssid,sec,strength,mode,speed" -msgid "link,essid,bssid,sec,strength,mode,speed,auth,bsstype" +msgid "link,essid,bssid,sec,strength,mode,speed,bsstype" msgid "link,ipackets,rbytes,ierrors,opackets,obytes,oerrors" msgid "link,media,state,speed,duplex,device" -msgid 
"link,property,value,default,possible" msgid "link,policy,addrpolicy,lacpactivity,lacptimer,flags" -msigd "link,port,aggregatable,sync,coll,dist,defaulted,expired" +msgid "link,port,aggregatable,sync,coll,dist,defaulted,expired" msgid "link,port,ipackets,rbytes,opackets,obytes,ipktdist,opktdist" msgid "link,port,speed,duplex,state,address,portstate" +msgid "link,property,value,default,possible" +msgid "link,ptype,state,auto,speed-duplex,pause" +msgid "link,ptype,state,auto,speed-duplex,pause,rem_fault" +msgid "link,slot,address,inuse,client" msgid "link,state,speed,duplex" -msgid "link,vid,over,flags" msgid "link,status,essid,sec,strength,mode,speed" msgid "link,status,essid,sec,strength,mode,speed,auth,bssid,bsstype" +msgid "link,vid,over,flags" +msgid "link=%s" msgid "link_asmpause" msgid "link_autoneg" msgid "link_duplex" msgid "link_pause" msgid "link_state" -msgid "long" msgid "lp_cap_10" msgid "lp_cap_100" msgid "lp_cap_1000" -msgid "lp_cap_autoneg" msgid "lp_cap_asmpause" +msgid "lp_cap_autoneg" msgid "lp_cap_pause" msgid "lp_rem_fault" msgid "mac" +msgid "mac-address" +msgid "mac-prefix" +msgid "media" msgid "mode" msgid "modify-aggr" -msgid "net_rawaccess" +msgid "mtu" msgid "no" -msgid "obytes64" +msgid "none" +msgid "o:px" +msgid "object" +msgid "object,class" +msgid "object,class,value" +msgid "obytes" msgid "oerrors" -msgid "opackets obytes oerrors\n" -msgid "opackets64" +msgid "opackets" +msgid "opktdist" msgid "output" +msgid "over" msgid "parseable" -msgid "passive" msgid "pause" +msgid "pd:si:" msgid "peeradv" msgid "persistent" msgid "policy" +msgid "port" +msgid "portstate" +msgid "possible" +msgid "primary" msgid "prop" +msgid "property" +msgid "ptype" msgid "r" -msgid "rbytes64" +msgid "random" +msgid "rbytes" msgid "rem_fault" msgid "remove-aggr" msgid "rename-link" +msgid "reset" msgid "reset-linkprop" msgid "root-dir" msgid "scan-wifi" msgid "sec" +msgid "set" msgid "set-linkprop" -msgid "short" msgid "show-aggr" msgid "show-dev" +msgid "show-ether" +msgid "show-etherstub" msgid "show-link" +msgid "show-linkmap" msgid "show-linkprop" msgid "show-phys" msgid "show-secobj" -msgid "show-wifi" +msgid "show-usage" msgid "show-vlan" -msgid "show-ether" -msgid "solaris.network.link.security" +msgid "show-vnic" +msgid "show-wifi" +msgid "slot" msgid "speed" msgid "speed-duplex" -msgid "standby" +msgid "state" msgid "statistics" msgid "status" msgid "strength" -msgid "sys_net_config" +msgid "sync" +msgid "tR:" +msgid "tR:d:m:n:p:r:v:" +msgid "tdps:e:f:" msgid "temporary" +msgid "timeout" +msgid "tx" msgid "unicast" msgid "unknown" -msgid "up" msgid "up-aggr" msgid "up-vlan" +msgid "up-vnic" +msgid "value" +msgid "vid" msgid "vlan-id" -msgid "wep" msgid "yes" diff --git a/usr/src/cmd/dladm/vnic.conf b/usr/src/cmd/dladm/vnic.conf new file mode 100644 index 0000000000..d156a65ec1 --- /dev/null +++ b/usr/src/cmd/dladm/vnic.conf @@ -0,0 +1,29 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# +# DO NOT EDIT OR PARSE THIS FILE! +# +# Use the dladm(1m) command to change the contents of this file. + diff --git a/usr/src/cmd/flowadm/Makefile b/usr/src/cmd/flowadm/Makefile new file mode 100644 index 0000000000..b6af8b2b79 --- /dev/null +++ b/usr/src/cmd/flowadm/Makefile @@ -0,0 +1,76 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +PROG=flowadm + +ROOTFS_PROG= $(PROG) + +POFILE= $(PROG).po +CONFIGFILES= flowadm.conf flowprop.conf + +include ../Makefile.cmd + +XGETFLAGS += -a -x $(PROG).xcl +LDLIBS += -L$(ROOT)/lib +LDLIBS += -ldladm -lkstat + +ROOTCFGDIR= $(ROOTETC)/dladm +ROOTCFGFILES= $(CONFIGFILES:%=$(ROOTCFGDIR)/%) + +$(ROOTCFGFILES):= FILEMODE= 644 +$(ROOTCFGFILES):= OWNER= dladm +$(ROOTCFGFILES):= GROUP= sys + +.KEEP_STATE: + +all: $(ROOTFS_PROG) + +# +# Message catalog +# +_msg: $(POFILE) + +$(POFILE): $(PROG).c + $(RM) $@ + $(COMPILE.cpp) $(PROG).c > $(POFILE).i + $(XGETTEXT) $(XGETFLAGS) $(POFILE).i + sed "/^domain/d" messages.po > $@ + $(RM) messages.po $(POFILE).i + +install: all $(ROOTSBINPROG) $(ROOTCFGDIR) $(ROOTCFGFILES) + $(RM) $(ROOTUSRSBINPROG) + -$(SYMLINK) ../../sbin/$(PROG) $(ROOTUSRSBINPROG) + +clean: + +lint: lint_PROG + +$(ROOTCFGDIR): + $(INS.dir) + +$(ROOTCFGDIR)/%: $(ROOTCFGDIR) % + $(INS.file) + +include ../Makefile.targ diff --git a/usr/src/cmd/flowadm/flowadm.c b/usr/src/cmd/flowadm/flowadm.c new file mode 100644 index 0000000000..f4c3859172 --- /dev/null +++ b/usr/src/cmd/flowadm/flowadm.c @@ -0,0 +1,1963 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <stdio.h>
+#include <locale.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stropts.h>
+#include <errno.h>
+#include <kstat.h>
+#include <strings.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <priv.h>
+#include <netdb.h>
+#include <libintl.h>
+#include <libdlflow.h>
+#include <libdllink.h>
+#include <libdlstat.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/ethernet.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <stddef.h>
+
+#define CMD_TYPE_ANY 0xffffffff
+#define STR_UNDEF_VAL "--"
+
+
+/*
+ * data structures and routines for printing output.
+ */
+
+typedef struct print_field_s {
+ const char *pf_name;
+ const char *pf_header;
+ uint_t pf_width;
+ union {
+ uint_t _pf_index;
+ size_t _pf_offset;
+ }_pf_un;
+#define pf_index _pf_un._pf_index
+#define pf_offset _pf_un._pf_offset
+ uint_t pf_cmdtype;
+} print_field_t;
+
+typedef struct print_state_s {
+ print_field_t **ps_fields;
+ uint_t ps_nfields;
+ boolean_t ps_lastfield;
+ uint_t ps_overflow;
+} print_state_t;
+
+typedef struct show_usage_state_s {
+ boolean_t us_plot;
+ boolean_t us_parseable;
+ boolean_t us_printheader;
+ boolean_t us_first;
+ print_state_t us_print;
+} show_usage_state_t;
+
+typedef char *(*print_callback_t)(print_field_t *, void *);
+static print_field_t **parse_output_fields(char *, print_field_t *, int,
+ uint_t, uint_t *);
+
+static void print_header(print_state_t *);
+static void print_field(print_state_t *, print_field_t *, const char *,
+ boolean_t);
+
+static void flowadm_print_output(print_state_t *, boolean_t,
+ print_callback_t, void *);
+
+/*
+ * helper function that, when invoked as flowadm_print_field(pf, buf),
+ * prints the string which is offset by pf->pf_offset within buf.
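+ *
+ * A minimal sketch (assuming fbuf is a flow_fields_buf_t already
+ * filled in by print_flow() below, and pf points at the flow_fields[]
+ * entry whose pf_offset is offsetof(flow_fields_buf_t, flow_link)):
+ *
+ *	char *val = flowadm_print_field(pf, &fbuf);
+ *
+ * val then points at fbuf.flow_link, the NUL-terminated link name.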
+ */ +static char *flowadm_print_field(print_field_t *, void *); + +#define MAX_FIELD_LEN 32 + +typedef void cmdfunc_t(int, char **); + +static cmdfunc_t do_add_flow, do_remove_flow, do_init_flow, do_show_flow; +static cmdfunc_t do_show_flowprop, do_set_flowprop, do_reset_flowprop; +static cmdfunc_t do_show_usage; + +static int show_flow(dladm_flow_attr_t *, void *); +static int show_flows_onelink(datalink_id_t, void *); + +static void flow_stats(const char *, datalink_id_t, uint_t); +static void get_flow_stats(const char *, pktsum_t *); +static int show_flow_stats(dladm_flow_attr_t *, void *); +static int show_link_flow_stats(datalink_id_t, void *); + +static int remove_flow(dladm_flow_attr_t *, void *); + +static int show_flowprop(dladm_flow_attr_t *, void *); +static void show_flowprop_one_flow(void *, const char *); +static int show_flowprop_onelink(datalink_id_t, void *); + +static void die(const char *, ...); +static void die_optdup(int); +static void die_opterr(int, int); +static void die_dlerr(dladm_status_t, const char *, ...); +static void warn(const char *, ...); +static void warn_dlerr(dladm_status_t, const char *, ...); + +typedef struct cmd { + char *c_name; + void (*c_fn)(int, char **); +} cmd_t; + +static cmd_t cmds[] = { + { "add-flow", do_add_flow }, + { "remove-flow", do_remove_flow }, + { "show-flowprop", do_show_flowprop }, + { "set-flowprop", do_set_flowprop }, + { "reset-flowprop", do_reset_flowprop }, + { "show-flow", do_show_flow }, + { "init-flow", do_init_flow }, + { "show-usage", do_show_usage } +}; + +static const struct option longopts[] = { + {"link", required_argument, 0, 'l'}, + {"parseable", no_argument, 0, 'p'}, + {"statistics", no_argument, 0, 's'}, + {"interval", required_argument, 0, 'i'}, + {"temporary", no_argument, 0, 't'}, + {"root-dir", required_argument, 0, 'R'}, + { 0, 0, 0, 0 } +}; + +static const struct option prop_longopts[] = { + {"link", required_argument, 0, 'l'}, + {"temporary", no_argument, 0, 't'}, + {"root-dir", required_argument, 0, 'R'}, + {"prop", required_argument, 0, 'p'}, + {"attr", required_argument, 0, 'a'}, + { 0, 0, 0, 0 } +}; + +/* + * structures for 'flowadm show-flow' + */ + +typedef struct show_flow_state { + boolean_t fs_firstonly; + boolean_t fs_donefirst; + pktsum_t fs_prevstats; + uint32_t fs_flags; + dladm_status_t fs_status; + print_state_t fs_print; + const char *fs_flow; + const char *fs_link; + boolean_t fs_parseable; + boolean_t fs_printheader; + boolean_t fs_persist; + boolean_t fs_stats; + uint64_t fs_mask; +} show_flow_state_t; + +/* + * structures for 'flowadm remove-flow' + */ + +typedef struct remove_flow_state { + boolean_t fs_tempop; + const char *fs_altroot; + dladm_status_t fs_status; +} remove_flow_state_t; + +typedef struct flow_args_s { + const char *fa_link; + int fa_attrno; /* -1 indicates flow itself */ + uint64_t fa_mask; + dladm_flow_attr_t *fa_finfop; + dladm_status_t *fa_status; + boolean_t fa_parseable; +} flow_args_t; + +#define PROTO_MAXSTR_LEN 7 +#define PORT_MAXSTR_LEN 6 +#define DSFIELD_MAXSTR_LEN 10 + +typedef struct flow_fields_buf_s +{ + char flow_name[MAXNAMELEN]; + char flow_link[MAXLINKNAMELEN]; + char flow_ipaddr[INET6_ADDRSTRLEN+4]; + char flow_proto[PROTO_MAXSTR_LEN]; + char flow_port[PORT_MAXSTR_LEN]; + char flow_dsfield[DSFIELD_MAXSTR_LEN]; +} flow_fields_buf_t; + +static print_field_t flow_fields[] = { +/* name, header, field width, index, cmdtype */ +{ "flow", "FLOW", 11, + offsetof(flow_fields_buf_t, flow_name), CMD_TYPE_ANY}, +{ "link", "LINK", 11, + 
offsetof(flow_fields_buf_t, flow_link), CMD_TYPE_ANY}, +{ "ipaddr", "IP ADDR", 30, + offsetof(flow_fields_buf_t, flow_ipaddr), CMD_TYPE_ANY}, +{ "transport", "PROTO", 6, + offsetof(flow_fields_buf_t, flow_proto), CMD_TYPE_ANY}, +{ "port", "PORT", 7, + offsetof(flow_fields_buf_t, flow_port), CMD_TYPE_ANY}, +{ "dsfield", "DSFLD", 9, + offsetof(flow_fields_buf_t, flow_dsfield), CMD_TYPE_ANY}} +; + +#define FLOW_MAX_FIELDS (sizeof (flow_fields) / sizeof (print_field_t)) + +/* + * structures for 'flowadm show-flowprop' + */ +typedef enum { + FLOWPROP_FLOW, + FLOWPROP_PROPERTY, + FLOWPROP_VALUE, + FLOWPROP_DEFAULT, + FLOWPROP_POSSIBLE +} flowprop_field_index_t; + +static print_field_t flowprop_fields[] = { +/* name, header, fieldwidth, index, cmdtype */ +{ "flow", "FLOW", 12, FLOWPROP_FLOW, CMD_TYPE_ANY}, +{ "property", "PROPERTY", 15, FLOWPROP_PROPERTY, CMD_TYPE_ANY}, +{ "value", "VALUE", 14, FLOWPROP_VALUE, CMD_TYPE_ANY}, +{ "default", "DEFAULT", 14, FLOWPROP_DEFAULT, CMD_TYPE_ANY}, +{ "possible", "POSSIBLE", 20, FLOWPROP_POSSIBLE, CMD_TYPE_ANY}} +; +#define FLOWPROP_MAX_FIELDS \ + (sizeof (flowprop_fields) / sizeof (print_field_t)) + +#define MAX_PROP_LINE 512 + +typedef struct show_flowprop_state { + const char *fs_flow; + datalink_id_t fs_linkid; + char *fs_line; + char **fs_propvals; + dladm_arg_list_t *fs_proplist; + boolean_t fs_parseable; + boolean_t fs_persist; + boolean_t fs_header; + dladm_status_t fs_status; + dladm_status_t fs_retstatus; + print_state_t fs_print; +} show_flowprop_state_t; + +typedef struct set_flowprop_state { + const char *fs_name; + boolean_t fs_reset; + boolean_t fs_temp; + dladm_status_t fs_status; +} set_flowprop_state_t; + +typedef struct flowprop_args_s { + show_flowprop_state_t *fs_state; + char *fs_propname; + char *fs_flowname; +} flowprop_args_t; + +/* + * structures for 'flow show-usage' + */ + +typedef struct usage_fields_buf_s { + char usage_flow[12]; + char usage_duration[10]; + char usage_ipackets[9]; + char usage_rbytes[10]; + char usage_opackets[9]; + char usage_obytes[10]; + char usage_bandwidth[14]; +} usage_fields_buf_t; + +static print_field_t usage_fields[] = { +/* name, header, field width, offset, cmdtype */ +{ "flow", "FLOW", 12, + offsetof(usage_fields_buf_t, usage_flow), CMD_TYPE_ANY}, +{ "duration", "DURATION", 10, + offsetof(usage_fields_buf_t, usage_duration), CMD_TYPE_ANY}, +{ "ipackets", "IPACKETS", 9, + offsetof(usage_fields_buf_t, usage_ipackets), CMD_TYPE_ANY}, +{ "rbytes", "RBYTES", 10, + offsetof(usage_fields_buf_t, usage_rbytes), CMD_TYPE_ANY}, +{ "opackets", "OPACKETS", 9, + offsetof(usage_fields_buf_t, usage_opackets), CMD_TYPE_ANY}, +{ "obytes", "OBYTES", 10, + offsetof(usage_fields_buf_t, usage_obytes), CMD_TYPE_ANY}, +{ "bandwidth", "BANDWIDTH", 14, + offsetof(usage_fields_buf_t, usage_bandwidth), CMD_TYPE_ANY}} +; + +#define USAGE_MAX_FIELDS (sizeof (usage_fields) / sizeof (print_field_t)) + +/* + * structures for 'dladm show-usage link' + */ + +typedef struct usage_l_fields_buf_s { + char usage_l_flow[12]; + char usage_l_stime[13]; + char usage_l_etime[13]; + char usage_l_rbytes[8]; + char usage_l_obytes[8]; + char usage_l_bandwidth[14]; +} usage_l_fields_buf_t; + +static print_field_t usage_l_fields[] = { +/* name, header, field width, offset, cmdtype */ +{ "flow", "FLOW", 12, + offsetof(usage_l_fields_buf_t, usage_l_flow), CMD_TYPE_ANY}, +{ "start", "START", 13, + offsetof(usage_l_fields_buf_t, usage_l_stime), CMD_TYPE_ANY}, +{ "end", "END", 13, + offsetof(usage_l_fields_buf_t, usage_l_etime), CMD_TYPE_ANY}, +{ 
"rbytes", "RBYTES", 8, + offsetof(usage_l_fields_buf_t, usage_l_rbytes), CMD_TYPE_ANY}, +{ "obytes", "OBYTES", 8, + offsetof(usage_l_fields_buf_t, usage_l_obytes), CMD_TYPE_ANY}, +{ "bandwidth", "BANDWIDTH", 14, + offsetof(usage_l_fields_buf_t, usage_l_bandwidth), CMD_TYPE_ANY}} +; + +#define USAGE_L_MAX_FIELDS \ + (sizeof (usage_l_fields) /sizeof (print_field_t)) + +#define PRI_HI 100 +#define PRI_LO 10 +#define PRI_NORM 50 + +#define FLOWADM_CONF "/etc/dladm/flowadm.conf" +#define BLANK_LINE(s) ((s[0] == '\0') || (s[0] == '#') || (s[0] == '\n')) + +static char *progname; + +boolean_t t_arg = B_FALSE; /* changes are persistent */ +char *altroot = NULL; + +static const char *attr_table[] = + {"local_ip", "remote_ip", "transport", "local_port", "dsfield"}; + +#define NATTR (sizeof (attr_table)/sizeof (char *)) + +static void +usage(void) +{ + (void) fprintf(stderr, gettext("usage: flowadm <subcommand>" + " <args>...\n" + "\tadd-flow [-t] [-R <root-dir>] -l <link>\n" + "\t\t-a attr=value[,...] [-p prop=value,...]\n" + "\t\tflow-name\n" + "\tremove-flow [-t] [-R <root-dir>] {-l <link> | flow-name}\n" + "\tset-flowprop [-t] [-R <root-dir>] \n" + "\t\t-p prop=value[,...] flowname\n" + "\treset-flowprop [-t] [-R <root-dir>] \n" + "\t\t[-p prop,...] flowname\n" + "\tshow-flowprop [-cP] [-l <link>] [-p prop,...] [flow-name]\n" + "\tshow-flow [-p] [-s [-i <interval>]] [-l <link>] [flow-name]\n" + "\tshow-usage [-d|-p -F <format>] [-s <DD/MM/YYYY,HH:MM:SS>]\n" + "\t\t[-e <DD/MM/YYYY,HH:MM:SS>]] -f <logfile> [<name>]\n")); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + int i, arglen, cmdlen; + cmd_t *cmdp; + + (void) setlocale(LC_ALL, ""); +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + (void) textdomain(TEXT_DOMAIN); + + progname = argv[0]; + + if (argc < 2) + usage(); + + for (i = 0; i < sizeof (cmds) / sizeof (cmds[0]); i++) { + cmdp = &cmds[i]; + arglen = strlen(argv[1]); + cmdlen = strlen(cmdp->c_name); + if ((arglen == cmdlen) && (strncmp(argv[1], cmdp->c_name, + cmdlen) == 0)) { + cmdp->c_fn(argc - 1, &argv[1]); + exit(0); + } + } + + (void) fprintf(stderr, gettext("%s: unknown subcommand '%s'\n"), + progname, argv[1]); + usage(); + + return (0); +} + +static const char * +match_attr(char *attr) +{ + int i; + + for (i = 0; i < NATTR; i++) { + if (strlen(attr) == strlen(attr_table[i]) && + strncmp(attr, attr_table[i], strlen(attr_table[i])) == 0) { + return (attr); + } + } + return (NULL); +} + +/* ARGSUSED */ +static void +do_init_flow(int argc, char *argv[]) +{ + dladm_status_t status; + + status = dladm_flow_init(); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "flows initialization failed"); +} + +/* ARGSUSED */ +static int +show_usage_date(dladm_usage_t *usage, void *arg) +{ + + time_t stime; + char timebuf[20]; + + stime = usage->du_stime; + (void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y", + localtime(&stime)); + (void) printf("%s\n", timebuf); + + return (DLADM_STATUS_OK); +} + +static int +show_usage_time(dladm_usage_t *usage, void *arg) +{ + show_usage_state_t *state = (show_usage_state_t *)arg; + char buf[DLADM_STRSIZE]; + usage_l_fields_buf_t ubuf; + time_t time; + double bw; + + if (state->us_plot) { + if (!state->us_printheader) { + if (state->us_first) { + (void) printf("# Time"); + state->us_first = B_FALSE; + } + (void) printf(" %s", usage->du_name); + if (usage->du_last) { + (void) printf("\n"); + state->us_first = B_TRUE; + state->us_printheader = B_TRUE; + } + } else { + if (state->us_first) { + time = usage->du_etime; + (void) 
strftime(buf, sizeof (buf), "%T", + localtime(&time)); + state->us_first = B_FALSE; + (void) printf("%s", buf); + } + bw = (double)usage->du_bandwidth/1000; + (void) printf(" %.2f", bw); + if (usage->du_last) { + (void) printf("\n"); + state->us_first = B_TRUE; + } + } + return (DLADM_STATUS_OK); + } + + bzero(&ubuf, sizeof (ubuf)); + + (void) snprintf(ubuf.usage_l_flow, sizeof (ubuf.usage_l_flow), "%s", + usage->du_name); + time = usage->du_stime; + (void) strftime(buf, sizeof (buf), "%T", localtime(&time)); + (void) snprintf(ubuf.usage_l_stime, sizeof (ubuf.usage_l_stime), "%s", + buf); + time = usage->du_etime; + (void) strftime(buf, sizeof (buf), "%T", localtime(&time)); + (void) snprintf(ubuf.usage_l_etime, sizeof (ubuf.usage_l_etime), "%s", + buf); + (void) snprintf(ubuf.usage_l_rbytes, sizeof (ubuf.usage_l_rbytes), + "%llu", usage->du_rbytes); + (void) snprintf(ubuf.usage_l_obytes, sizeof (ubuf.usage_l_obytes), + "%llu", usage->du_obytes); + (void) snprintf(ubuf.usage_l_bandwidth, sizeof (ubuf.usage_l_bandwidth), + "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf)); + + if (!state->us_parseable && !state->us_printheader) { + print_header(&state->us_print); + state->us_printheader = B_TRUE; + } + + flowadm_print_output(&state->us_print, state->us_parseable, + flowadm_print_field, (void *)&ubuf); + + return (DLADM_STATUS_OK); +} + +static int +show_usage_res(dladm_usage_t *usage, void *arg) +{ + show_usage_state_t *state = (show_usage_state_t *)arg; + char buf[DLADM_STRSIZE]; + usage_fields_buf_t ubuf; + + bzero(&ubuf, sizeof (ubuf)); + + (void) snprintf(ubuf.usage_flow, sizeof (ubuf.usage_flow), "%s", + usage->du_name); + (void) snprintf(ubuf.usage_duration, sizeof (ubuf.usage_duration), + "%llu", usage->du_duration); + (void) snprintf(ubuf.usage_ipackets, sizeof (ubuf.usage_ipackets), + "%llu", usage->du_ipackets); + (void) snprintf(ubuf.usage_rbytes, sizeof (ubuf.usage_rbytes), + "%llu", usage->du_rbytes); + (void) snprintf(ubuf.usage_opackets, sizeof (ubuf.usage_opackets), + "%llu", usage->du_opackets); + (void) snprintf(ubuf.usage_obytes, sizeof (ubuf.usage_obytes), + "%llu", usage->du_obytes); + (void) snprintf(ubuf.usage_bandwidth, sizeof (ubuf.usage_bandwidth), + "%s Mbps", dladm_bw2str(usage->du_bandwidth, buf)); + + if (!state->us_parseable && !state->us_printheader) { + print_header(&state->us_print); + state->us_printheader = B_TRUE; + } + + flowadm_print_output(&state->us_print, state->us_parseable, + flowadm_print_field, (void *)&ubuf); + + return (DLADM_STATUS_OK); +} + +static boolean_t +valid_formatspec(char *formatspec_str) +{ + if (strcmp(formatspec_str, "gnuplot") == 0) + return (B_TRUE); + return (B_FALSE); +} + +/* ARGSUSED */ +static void +do_show_usage(int argc, char *argv[]) +{ + char *file = NULL; + int opt; + dladm_status_t status; + boolean_t d_arg = B_FALSE; + boolean_t p_arg = B_FALSE; + char *stime = NULL; + char *etime = NULL; + char *resource = NULL; + show_usage_state_t state; + boolean_t o_arg = B_FALSE; + boolean_t F_arg = B_FALSE; + char *fields_str = NULL; + char *formatspec_str = NULL; + print_field_t **fields; + uint_t nfields; + char *all_fields = + "flow,duration,ipackets,rbytes,opackets,obytes,bandwidth"; + char *all_l_fields = + "flow,start,end,rbytes,obytes,bandwidth"; + + bzero(&state, sizeof (show_usage_state_t)); + state.us_parseable = B_FALSE; + state.us_printheader = B_FALSE; + state.us_plot = B_FALSE; + state.us_first = B_TRUE; + + while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) { + switch (opt) { + case 'd': + d_arg = 
B_TRUE;
+ break;
+ case 'p':
+ state.us_plot = p_arg = B_TRUE;
+ break;
+ case 'f':
+ file = optarg;
+ break;
+ case 's':
+ stime = optarg;
+ break;
+ case 'e':
+ etime = optarg;
+ break;
+ case 'o':
+ o_arg = B_TRUE;
+ fields_str = optarg;
+ break;
+ case 'F':
+ F_arg = B_TRUE;
+ formatspec_str = optarg;
+ break;
+ default:
+ die_opterr(optopt, opt);
+ }
+ }
+
+ if (file == NULL)
+ die("show-usage requires a file");
+
+ if (optind == (argc-1)) {
+ resource = argv[optind];
+ }
+
+ if (resource == NULL && stime == NULL && etime == NULL) {
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+ fields_str = all_fields;
+ fields = parse_output_fields(fields_str, usage_fields,
+ USAGE_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+ } else {
+ if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0))
+ fields_str = all_l_fields;
+ fields = parse_output_fields(fields_str, usage_l_fields,
+ USAGE_L_MAX_FIELDS, CMD_TYPE_ANY, &nfields);
+ }
+
+ if (fields == NULL) {
+ die("invalid field(s) specified");
+ return;
+ }
+ state.us_print.ps_fields = fields;
+ state.us_print.ps_nfields = nfields;
+
+ if (p_arg && d_arg)
+ die("plot and date options are incompatible");
+
+ if (p_arg && !F_arg)
+ die("specify format specifier: -F <format>");
+
+ if (F_arg && valid_formatspec(formatspec_str) == B_FALSE)
+ die("format specifier %s not supported", formatspec_str);
+
+ if (d_arg) {
+ /* Print log dates */
+ status = dladm_usage_dates(show_usage_date,
+ DLADM_LOGTYPE_FLOW, file, resource, &state);
+ } else if (resource == NULL && stime == NULL && etime == NULL &&
+ !p_arg) {
+ /* Print summary */
+ status = dladm_usage_summary(show_usage_res,
+ DLADM_LOGTYPE_FLOW, file, &state);
+ } else if (resource != NULL) {
+ /* Print log entries for named resource */
+ status = dladm_walk_usage_res(show_usage_time,
+ DLADM_LOGTYPE_FLOW, file, resource, stime, etime, &state);
+ } else {
+ /* Print time and information for each link */
+ status = dladm_walk_usage_time(show_usage_time,
+ DLADM_LOGTYPE_FLOW, file, stime, etime, &state);
+ }
+
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "show-usage");
+}
+
+static void
+do_add_flow(int argc, char *argv[])
+{
+ char devname[MAXNAMELEN];
+ char *name = NULL;
+ uint_t index;
+ datalink_id_t linkid;
+
+ char option;
+ boolean_t l_arg = B_FALSE;
+ dladm_arg_list_t *proplist = NULL;
+ dladm_arg_list_t *attrlist = NULL;
+ dladm_status_t status;
+
+ while ((option = getopt_long(argc, argv, "tR:l:a:p:",
+ prop_longopts, NULL)) != -1) {
+ switch (option) {
+ case 't':
+ t_arg = B_TRUE;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ case 'l':
+ if (strlcpy(devname, optarg,
+ MAXNAMELEN) >= MAXNAMELEN) {
+ die("link name too long");
+ }
+ if (dladm_name2info(devname, &linkid, NULL,
+ NULL, NULL) != DLADM_STATUS_OK)
+ die("invalid link '%s'", devname);
+ l_arg = B_TRUE;
+ break;
+ case 'a':
+ if (dladm_parse_flow_attrs(optarg, &attrlist, B_FALSE)
+ != DLADM_STATUS_OK)
+ die("invalid flow attribute specified");
+ break;
+ case 'p':
+ if (dladm_parse_flow_props(optarg, &proplist, B_FALSE)
+ != DLADM_STATUS_OK)
+ die("invalid flow property specified");
+ break;
+ default:
+ die_opterr(optopt, option);
+ }
+ }
+ if (!l_arg) {
+ die("link is required");
+ }
+
+ opterr = 0;
+ index = optind;
+
+ if ((index != (argc - 1)) || match_attr(argv[index]) != NULL) {
+ die("flow name is required");
+ } else {
+ /* get flow name; required last argument */
+ if (strlen(argv[index]) >= MAXFLOWNAME)
+ die("flow name too long");
+ name = argv[index];
+ }
+
+ status = dladm_flow_add(linkid,
attrlist, proplist, name,
+ t_arg, altroot);
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "add flow failed");
+
+ dladm_free_attrs(attrlist);
+ dladm_free_props(proplist);
+}
+
+static void
+do_remove_flow(int argc, char *argv[])
+{
+ char option;
+ char *flowname = NULL;
+ char linkname[MAXNAMELEN];
+ datalink_id_t linkid = DATALINK_ALL_LINKID;
+ boolean_t l_arg = B_FALSE;
+ remove_flow_state_t state;
+ dladm_status_t status;
+
+ bzero(&state, sizeof (state));
+
+ opterr = 0;
+ while ((option = getopt_long(argc, argv, ":tR:l:",
+ longopts, NULL)) != -1) {
+ switch (option) {
+ case 't':
+ t_arg = B_TRUE;
+ break;
+ case 'R':
+ altroot = optarg;
+ break;
+ case 'l':
+ if (strlcpy(linkname, optarg,
+ MAXLINKNAMELEN) >= MAXLINKNAMELEN) {
+ die("link name too long");
+ }
+ if (dladm_name2info(linkname, &linkid, NULL,
+ NULL, NULL) != DLADM_STATUS_OK) {
+ die("invalid link '%s'", linkname);
+ }
+ l_arg = B_TRUE;
+ break;
+ default:
+ die_opterr(optopt, option);
+ break;
+ }
+ }
+
+ /* when link not specified get flow name */
+ if (!l_arg) {
+ if (optind != (argc-1)) {
+ usage();
+ } else {
+ if (strlen(argv[optind]) >= MAXFLOWNAME)
+ die("flow name too long");
+ flowname = argv[optind];
+ }
+ status = dladm_flow_remove(flowname, t_arg, altroot);
+ } else {
+ /* if link is specified then flow name should not be there */
+ if (optind == argc-1)
+ usage();
+ /* walk the link to find flows and remove them */
+ state.fs_tempop = t_arg;
+ state.fs_altroot = altroot;
+ state.fs_status = DLADM_STATUS_OK;
+ status = dladm_walk_flow(remove_flow, linkid, &state, B_FALSE);
+ /*
+ * check if dladm_walk_flow terminated early and see if the
+ * walker function has any status for us
+ */
+ if (status == DLADM_STATUS_OK)
+ status = state.fs_status;
+ }
+
+ if (status != DLADM_STATUS_OK)
+ die_dlerr(status, "remove flow failed");
+}
+
+/*
+ * Walker function for removing a flow through dladm_walk_flow().
+ */
+static int
+remove_flow(dladm_flow_attr_t *attr, void *arg)
+{
+ remove_flow_state_t *state = (remove_flow_state_t *)arg;
+
+ state->fs_status = dladm_flow_remove(attr->fa_flowname,
+ state->fs_tempop, state->fs_altroot);
+
+ if (state->fs_status == DLADM_STATUS_OK)
+ return (DLADM_WALK_CONTINUE);
+ else
+ return (DLADM_WALK_TERMINATE);
+}
+
+static char *
+flowadm_print_field(print_field_t *pf, void *arg)
+{
+ char *value;
+
+ value = (char *)arg + pf->pf_offset;
+ return (value);
+}
+
+/*ARGSUSED*/
+static dladm_status_t
+print_flow(show_flow_state_t *state, dladm_flow_attr_t *attr,
+ flow_fields_buf_t *fbuf)
+{
+ char link[MAXLINKNAMELEN];
+ dladm_status_t status;
+
+ if ((status = dladm_datalink_id2info(attr->fa_linkid, NULL, NULL,
+ NULL, link, sizeof (link))) != DLADM_STATUS_OK) {
+ return (status);
+ }
+
+ (void) snprintf(fbuf->flow_name, sizeof (fbuf->flow_name),
+ "%s", attr->fa_flowname);
+ (void) snprintf(fbuf->flow_link, sizeof (fbuf->flow_link),
+ "%s", link);
+
+ (void) dladm_flow_attr_ip2str(attr, fbuf->flow_ipaddr,
+ sizeof (fbuf->flow_ipaddr));
+ (void) dladm_flow_attr_proto2str(attr, fbuf->flow_proto,
+ sizeof (fbuf->flow_proto));
+ (void) dladm_flow_attr_port2str(attr, fbuf->flow_port,
+ sizeof (fbuf->flow_port));
+ (void) dladm_flow_attr_dsfield2str(attr, fbuf->flow_dsfield,
+ sizeof (fbuf->flow_dsfield));
+
+ return (DLADM_STATUS_OK);
+}
+
+/*
+ * Walker function for showing flow attributes through dladm_walk_flow().
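+ *
+ * A minimal sketch of the call (it mirrors show_flows_onelink() below;
+ * state is a show_flow_state_t set up by do_show_flow()):
+ *
+ *	(void) dladm_walk_flow(show_flow, linkid, &state, B_FALSE);
+ *
+ * dladm_walk_flow() then calls show_flow() once for every flow on
+ * linkid; returning DLADM_WALK_CONTINUE keeps the walk going.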
+ */ +static int +show_flow(dladm_flow_attr_t *attr, void *arg) +{ + show_flow_state_t *statep = arg; + dladm_status_t status; + flow_fields_buf_t fbuf; + + /* + * first get all the flow attributes into fbuf; + */ + bzero(&fbuf, sizeof (fbuf)); + status = print_flow(statep, attr, &fbuf); + + if (status != DLADM_STATUS_OK) + goto done; + + if (!statep->fs_parseable && !statep->fs_printheader) { + print_header(&statep->fs_print); + statep->fs_printheader = B_TRUE; + } + + flowadm_print_output(&statep->fs_print, statep->fs_parseable, + flowadm_print_field, (void *)&fbuf); + +done: + statep->fs_status = status; + return (DLADM_WALK_CONTINUE); +} + +static void +show_one_flow(void *arg, const char *name) +{ + dladm_flow_attr_t attr; + dladm_status_t status; + + if (dladm_flow_info(name, &attr) != DLADM_STATUS_OK) + die("invalid flow: '%s'", name); + else + show_flow(&attr, arg); +} + +/* + * Wrapper of dladm_walk_flow(show_flow,...) to make it usable to + * dladm_walk_datalink_id(). Used for showing flow attributes for + * all flows on all links. + */ +static int +show_flows_onelink(datalink_id_t linkid, void *arg) +{ + show_flow_state_t *state = arg; + + (void) dladm_walk_flow(show_flow, linkid, arg, state->fs_persist); + + return (DLADM_WALK_CONTINUE); +} + +static void +get_flow_stats(const char *flowname, pktsum_t *stats) +{ + kstat_ctl_t *kcp; + kstat_t *ksp; + + bzero(stats, sizeof (*stats)); + + if ((kcp = kstat_open()) == NULL) { + warn("kstat open operation failed"); + return; + } + + ksp = dladm_kstat_lookup(kcp, NULL, -1, flowname, "flow"); + + if (ksp != NULL) + dladm_get_stats(kcp, ksp, stats); + + (void) kstat_close(kcp); +} + +/* ARGSUSED */ +static int +show_flow_stats(dladm_flow_attr_t *attr, void *arg) +{ + show_flow_state_t *state = (show_flow_state_t *)arg; + const char *name = attr->fa_flowname; + pktsum_t stats, diff_stats; + + if (state->fs_firstonly) { + if (state->fs_donefirst) + return (DLADM_WALK_TERMINATE); + state->fs_donefirst = B_TRUE; + } else { + bzero(&state->fs_prevstats, sizeof (state->fs_prevstats)); + } + + get_flow_stats(name, &stats); + dladm_stats_diff(&diff_stats, &stats, &state->fs_prevstats); + + (void) printf("%-12s", name); + (void) printf("%-10llu", diff_stats.ipackets); + (void) printf("%-12llu", diff_stats.rbytes); + (void) printf("%-8llu", diff_stats.ierrors); + (void) printf("%-10llu", diff_stats.opackets); + (void) printf("%-12llu", diff_stats.obytes); + (void) printf("%-8llu\n", diff_stats.oerrors); + + state->fs_prevstats = stats; + + return (DLADM_WALK_CONTINUE); +} + +/* + * Wrapper of dladm_walk_flow(show_flow,...) to make it usable for + * dladm_walk_datalink_id(). Used for showing flow stats for + * all flows on all links. + */ +static int +show_link_flow_stats(datalink_id_t linkid, void * arg) +{ + if (dladm_walk_flow(show_flow_stats, linkid, arg, B_FALSE) + == DLADM_STATUS_OK) + return (DLADM_WALK_CONTINUE); + else + return (DLADM_WALK_TERMINATE); +} + +/* ARGSUSED */ +static void +flow_stats(const char *flow, datalink_id_t linkid, uint_t interval) +{ + show_flow_state_t state; + dladm_flow_attr_t attr; + + if (flow != NULL && dladm_flow_info(flow, &attr) != DLADM_STATUS_OK) + die("invalid flow %s", flow); + + bzero(&state, sizeof (state)); + + /* + * If an interval is specified, continuously show the stats + * for only the first flow. 
+ */ + state.fs_firstonly = (interval != 0); + + for (;;) { + if (!state.fs_donefirst) + (void) printf("%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n", + "FLOW", "IPACKETS", "RBYTES", "IERRORS", + "OPACKETS", "OBYTES", "OERRORS"); + + state.fs_donefirst = B_FALSE; + + /* Show stats for named flow */ + if (flow != NULL) { + state.fs_flow = flow; + (void) show_flow_stats(&attr, &state); + + /* Show all stats on a link */ + } else if (linkid != DATALINK_INVALID_LINKID) { + (void) dladm_walk_flow(show_flow_stats, linkid, &state, + B_FALSE); + + /* Show all stats by datalink */ + } else { + (void) dladm_walk_datalink_id(show_link_flow_stats, + &state, DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } + + if (interval == 0) + break; + + (void) sleep(interval); + } +} + +static void +do_show_flow(int argc, char *argv[]) +{ + char flowname[MAXFLOWNAME]; + char linkname[MAXNAMELEN]; + datalink_id_t linkid = DATALINK_ALL_LINKID; + int option; + boolean_t s_arg = B_FALSE; + boolean_t S_arg = B_FALSE; + boolean_t i_arg = B_FALSE; + boolean_t l_arg = B_FALSE; + boolean_t o_arg = B_FALSE; + uint32_t interval = 0; + char *endp = NULL; + show_flow_state_t state; + char *fields_str = NULL; + print_field_t **fields; + uint_t nfields; + char *all_fields = + "flow,link,ipaddr,transport,port,dsfield"; + dladm_status_t status; + + bzero(&state, sizeof (state)); + + opterr = 0; + while ((option = getopt_long(argc, argv, ":pPsSi:l:o:", + longopts, NULL)) != -1) { + switch (option) { + case 'p': + state.fs_parseable = B_TRUE; + break; + case 'P': + state.fs_persist = B_TRUE; + break; + case 's': + if (s_arg) + die_optdup(option); + + s_arg = B_TRUE; + break; + case 'S': + if (S_arg) + die_optdup(option); + + S_arg = B_TRUE; + break; + case 'o': + if (o_arg) + die_optdup(option); + + o_arg = B_TRUE; + fields_str = optarg; + break; + case 'i': + if (i_arg) + die_optdup(option); + + i_arg = B_TRUE; + + errno = 0; + interval = (int)strtol(optarg, &endp, 10); + if (errno != 0 || interval == 0 || *endp != '\0') + die("invalid interval value" " '%d'\n", + interval); + break; + case 'l': + if (strlcpy(linkname, optarg, MAXLINKNAMELEN) + >= MAXLINKNAMELEN) + die("link name too long\n"); + if (dladm_name2info(linkname, &linkid, NULL, + NULL, NULL) != DLADM_STATUS_OK) + die("invalid link '%s'", linkname); + l_arg = B_TRUE; + break; + default: + die_opterr(optopt, option); + break; + } + } + if (i_arg && !(s_arg || S_arg)) + die("the -i option can be used only with -s or -S"); + + if (s_arg && S_arg) + die("the -s option cannot be used with -S"); + + /* get flow name (optional last argument */ + if (optind == (argc-1)) { + if (strlcpy(flowname, argv[optind], MAXFLOWNAME) + >= MAXFLOWNAME) + die("flow name too long"); + state.fs_flow = flowname; + } + + if (s_arg) { + flow_stats(state.fs_flow, linkid, interval); + return; + } + + if (S_arg) { + dladm_continuous(linkid, state.fs_flow, interval, FLOW_REPORT); + return; + } + + if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) + fields_str = all_fields; + + fields = parse_output_fields(fields_str, flow_fields, FLOW_MAX_FIELDS, + CMD_TYPE_ANY, &nfields); + + if (fields == NULL) { + die("invalid fields(s) specified"); + return; + } + + state.fs_print.ps_fields = fields; + state.fs_print.ps_nfields = nfields; + + /* Show attributes of one flow */ + if (state.fs_flow != NULL) { + show_one_flow(&state, state.fs_flow); + + /* Show attributes of flows on one link */ + } else if (l_arg) { + (void) show_flows_onelink(linkid, &state); + + /* Show attributes of all flows 
on all links */ + } else { + (void) dladm_walk_datalink_id(show_flows_onelink, &state, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } +} + +static dladm_status_t +set_flowprop_persist(const char *flow, const char *prop_name, char **prop_val, + uint_t val_cnt, boolean_t reset) +{ + dladm_status_t status; + char *errprop; + + status = dladm_set_flowprop(flow, prop_name, prop_val, val_cnt, + DLADM_OPT_PERSIST, &errprop); + + if (status != DLADM_STATUS_OK) { + warn_dlerr(status, "cannot persistently %s flow " + "property '%s' on '%s'", reset? "reset": "set", + errprop, flow); + } + return (status); +} + +static void +set_flowprop(int argc, char **argv, boolean_t reset) +{ + int i, option; + char errmsg[DLADM_STRSIZE]; + const char *flow = NULL; + dladm_arg_list_t *proplist = NULL; + boolean_t temp = B_FALSE; + dladm_status_t status = DLADM_STATUS_OK; + + opterr = 0; + while ((option = getopt_long(argc, argv, ":p:R:t", + prop_longopts, NULL)) != -1) { + switch (option) { + case 'p': + if (dladm_parse_flow_props(optarg, &proplist, reset) + != DLADM_STATUS_OK) + die("invalid flow property specified"); + break; + case 't': + temp = B_TRUE; + break; + case 'R': + status = dladm_set_rootdir(optarg); + if (status != DLADM_STATUS_OK) { + die_dlerr(status, "invalid directory " + "specified"); + } + break; + default: + die_opterr(optopt, option); + break; + } + } + + if (optind == (argc - 1)) { + if (strlen(argv[optind]) >= MAXFLOWNAME) + die("flow name too long"); + flow = argv[optind]; + } else if (optind != argc) { + usage(); + } + if (flow == NULL) + die("flow name must be specified"); + + if (proplist == NULL) { + char *errprop; + + if (!reset) + die("flow property must be specified"); + + status = dladm_set_flowprop(flow, NULL, NULL, 0, + DLADM_OPT_ACTIVE, &errprop); + if (status != DLADM_STATUS_OK) { + warn_dlerr(status, "cannot reset flow property '%s' " + "on '%s'", errprop, flow); + } + if (!temp) { + dladm_status_t s; + + s = set_flowprop_persist(flow, NULL, NULL, 0, reset); + if (s != DLADM_STATUS_OK) + status = s; + } + goto done; + } + + for (i = 0; i < proplist->al_count; i++) { + dladm_arg_info_t *aip = &proplist->al_info[i]; + char **val; + uint_t count; + dladm_status_t s; + + if (reset) { + val = NULL; + count = 0; + } else { + val = aip->ai_val; + count = aip->ai_count; + if (count == 0) { + warn("no value specified for '%s'", + aip->ai_name); + status = DLADM_STATUS_BADARG; + continue; + } + } + s = dladm_set_flowprop(flow, aip->ai_name, val, count, + DLADM_OPT_ACTIVE, NULL); + if (s == DLADM_STATUS_OK) { + if (!temp) { + s = set_flowprop_persist(flow, + aip->ai_name, val, count, reset); + if (s != DLADM_STATUS_OK) + status = s; + } + continue; + } + status = s; + switch (s) { + case DLADM_STATUS_NOTFOUND: + warn("invalid flow property '%s'", aip->ai_name); + break; + case DLADM_STATUS_BADVAL: { + int j; + char *ptr, *lim; + char **propvals = NULL; + uint_t valcnt = DLADM_MAX_PROP_VALCNT; + + ptr = malloc((sizeof (char *) + + DLADM_PROP_VAL_MAX) * DLADM_MAX_PROP_VALCNT + + MAX_PROP_LINE); + + if (ptr == NULL) + die("insufficient memory"); + propvals = (char **)(void *)ptr; + + for (j = 0; j < DLADM_MAX_PROP_VALCNT; j++) { + propvals[j] = ptr + sizeof (char *) * + DLADM_MAX_PROP_VALCNT + + j * DLADM_PROP_VAL_MAX; + } + s = dladm_get_flowprop(flow, DLADM_PROP_VAL_MODIFIABLE, + aip->ai_name, propvals, &valcnt); + + ptr = errmsg; + lim = ptr + DLADM_STRSIZE; + *ptr = '\0'; + for (j = 0; j < valcnt && s == DLADM_STATUS_OK; j++) { + ptr += snprintf(ptr, lim - ptr, 
"%s,", + propvals[j]); + if (ptr >= lim) + break; + } + if (ptr > errmsg) { + *(ptr - 1) = '\0'; + warn("flow property '%s' must be one of: %s", + aip->ai_name, errmsg); + } else + warn("%s is an invalid value for " + "flow property %s", *val, aip->ai_name); + free(propvals); + break; + } + default: + if (reset) { + warn_dlerr(status, "cannot reset flow property " + "'%s' on '%s'", aip->ai_name, flow); + } else { + warn_dlerr(status, "cannot set flow property " + "'%s' on '%s'", aip->ai_name, flow); + } + break; + } + } +done: + dladm_free_props(proplist); + if (status != DLADM_STATUS_OK) + exit(1); +} + +static void +do_set_flowprop(int argc, char **argv) +{ + set_flowprop(argc, argv, B_FALSE); +} + +static void +do_reset_flowprop(int argc, char **argv) +{ + set_flowprop(argc, argv, B_TRUE); +} + +static void +warn(const char *format, ...) +{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, "%s: warning: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + (void) putchar('\n'); +} + +/* PRINTFLIKE2 */ +static void +warn_dlerr(dladm_status_t err, const char *format, ...) +{ + va_list alist; + char errmsg[DLADM_STRSIZE]; + + format = gettext(format); + (void) fprintf(stderr, gettext("%s: warning: "), progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + (void) fprintf(stderr, ": %s\n", dladm_status2str(err, errmsg)); +} + +/* PRINTFLIKE1 */ +static void +die(const char *format, ...) +{ + va_list alist; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + + (void) putchar('\n'); + exit(EXIT_FAILURE); +} + +static void +die_optdup(int opt) +{ + die("the option -%c cannot be specified more than once", opt); +} + +static void +die_opterr(int opt, int opterr) +{ + switch (opterr) { + case ':': + die("option '-%c' requires a value", opt); + break; + case '?': + default: + die("unrecognized option '-%c'", opt); + break; + } +} + +/* PRINTFLIKE2 */ +static void +die_dlerr(dladm_status_t err, const char *format, ...) 
+{ + va_list alist; + char errmsg[DLADM_STRSIZE]; + + format = gettext(format); + (void) fprintf(stderr, "%s: ", progname); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); + (void) fprintf(stderr, ": %s\n", dladm_status2str(err, errmsg)); + + exit(EXIT_FAILURE); +} + +static void +print_flowprop(const char *flowname, show_flowprop_state_t *statep, + const char *propname, dladm_prop_type_t type, + const char *format, char **pptr) +{ + int i; + char *ptr, *lim; + char buf[DLADM_STRSIZE]; + char *unknown = "--", *notsup = ""; + char **propvals = statep->fs_propvals; + uint_t valcnt = DLADM_MAX_PROP_VALCNT; + dladm_status_t status; + + status = dladm_get_flowprop(flowname, type, propname, propvals, + &valcnt); + if (status != DLADM_STATUS_OK) { + if (status == DLADM_STATUS_TEMPONLY) { + if (type == DLADM_PROP_VAL_MODIFIABLE && + statep->fs_persist) { + valcnt = 1; + propvals = &unknown; + } else { + statep->fs_status = status; + statep->fs_retstatus = status; + return; + } + } else if (status == DLADM_STATUS_NOTSUP || + statep->fs_persist) { + valcnt = 1; + if (type == DLADM_PROP_VAL_CURRENT) + propvals = &unknown; + else + propvals = ¬sup; + } else { + if ((statep->fs_proplist != NULL) && + statep->fs_status == DLADM_STATUS_OK) { + warn("invalid flow property '%s'", propname); + } + statep->fs_status = status; + statep->fs_retstatus = status; + return; + } + } + + statep->fs_status = DLADM_STATUS_OK; + + ptr = buf; + lim = buf + DLADM_STRSIZE; + for (i = 0; i < valcnt; i++) { + if (propvals[i][0] == '\0' && !statep->fs_parseable) + ptr += snprintf(ptr, lim - ptr, STR_UNDEF_VAL","); + else + ptr += snprintf(ptr, lim - ptr, "%s,", propvals[i]); + if (ptr >= lim) + break; + } + if (valcnt > 0) + buf[strlen(buf) - 1] = '\0'; + + lim = statep->fs_line + MAX_PROP_LINE; + if (statep->fs_parseable) { + *pptr += snprintf(*pptr, lim - *pptr, + "%s", buf); + } else { + *pptr += snprintf(*pptr, lim - *pptr, format, buf); + } +} + +static char * +flowprop_callback(print_field_t *pf, void *fs_arg) +{ + flowprop_args_t *arg = fs_arg; + char *propname = arg->fs_propname; + show_flowprop_state_t *statep = arg->fs_state; + char *ptr = statep->fs_line; + char *lim = ptr + MAX_PROP_LINE; + char *flowname = arg->fs_flowname; + + switch (pf->pf_index) { + case FLOWPROP_FLOW: + (void) snprintf(ptr, lim - ptr, "%s", statep->fs_flow); + break; + case FLOWPROP_PROPERTY: + (void) snprintf(ptr, lim - ptr, "%s", propname); + break; + case FLOWPROP_VALUE: + print_flowprop(flowname, statep, propname, + statep->fs_persist ? DLADM_PROP_VAL_PERSISTENT : + DLADM_PROP_VAL_CURRENT, "%s", &ptr); + /* + * If we failed to query the flow property, for example, query + * the persistent value of a non-persistable flow property, + * simply skip the output. 
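+ *
+ * (Contract sketch: on success print_flowprop() leaves
+ * statep->fs_status at DLADM_STATUS_OK and has appended the
+ * formatted value at ptr; on any other status nothing was
+ * printed, so the field is skipped via the NULL return below.)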
+ */ + if (statep->fs_status != DLADM_STATUS_OK) + goto skip; + ptr = statep->fs_line; + break; + case FLOWPROP_DEFAULT: + print_flowprop(flowname, statep, propname, + DLADM_PROP_VAL_DEFAULT, "%s", &ptr); + if (statep->fs_status != DLADM_STATUS_OK) + goto skip; + ptr = statep->fs_line; + break; + case FLOWPROP_POSSIBLE: + print_flowprop(flowname, statep, propname, + DLADM_PROP_VAL_MODIFIABLE, "%s ", &ptr); + if (statep->fs_status != DLADM_STATUS_OK) + goto skip; + ptr = statep->fs_line; + break; + default: + die("invalid input"); + break; + } + return (ptr); +skip: + if (statep->fs_status != DLADM_STATUS_OK) + return (NULL); + else + return (""); +} + +static int +show_one_flowprop(void *arg, const char *propname) +{ + show_flowprop_state_t *statep = arg; + flowprop_args_t fs_arg; + + bzero(&fs_arg, sizeof (fs_arg)); + fs_arg.fs_state = statep; + fs_arg.fs_propname = (char *)propname; + fs_arg.fs_flowname = (char *)statep->fs_flow; + + if (statep->fs_header) { + statep->fs_header = B_FALSE; + if (!statep ->fs_parseable) + print_header(&statep->fs_print); + } + flowadm_print_output(&statep->fs_print, statep->fs_parseable, + flowprop_callback, (void *)&fs_arg); + + return (DLADM_WALK_CONTINUE); +} + +/* Walker function called by dladm_walk_flow to display flow properties */ +static int +show_flowprop(dladm_flow_attr_t *attr, void *arg) +{ + show_flowprop_one_flow(arg, attr->fa_flowname); + return (DLADM_WALK_CONTINUE); +} + +/* + * Wrapper of dladm_walk_flow(show_walk_fn,...) to make it + * usable to dladm_walk_datalink_id() + */ +static int +show_flowprop_onelink(datalink_id_t linkid, void *arg) +{ + char name[MAXLINKNAMELEN]; + + if (dladm_datalink_id2info(linkid, NULL, NULL, NULL, + name, sizeof (name)) != DLADM_STATUS_OK) + return (DLADM_WALK_TERMINATE); + + (void) dladm_walk_flow(show_flowprop, linkid, arg, B_FALSE); + + return (DLADM_WALK_CONTINUE); +} + +static void +do_show_flowprop(int argc, char **argv) +{ + int option; + dladm_arg_list_t *proplist = NULL; + show_flowprop_state_t state; + char *fields_str = NULL; + print_field_t **fields; + uint_t nfields; + char *all_fields = + "flow,property,value,default,possible"; + + fields_str = all_fields; + opterr = 0; + state.fs_propvals = NULL; + state.fs_line = NULL; + state.fs_parseable = B_FALSE; + state.fs_persist = B_FALSE; + state.fs_header = B_TRUE; + state.fs_retstatus = DLADM_STATUS_OK; + state.fs_linkid = DATALINK_INVALID_LINKID; + state.fs_flow = NULL; + + while ((option = getopt_long(argc, argv, ":p:cPl:o:", + prop_longopts, NULL)) != -1) { + switch (option) { + case 'p': + if (dladm_parse_flow_props(optarg, &proplist, B_TRUE) + != DLADM_STATUS_OK) + die("invalid flow properties specified"); + break; + case 'c': + state.fs_parseable = B_TRUE; + break; + case 'P': + state.fs_persist = B_TRUE; + break; + case 'l': + if (dladm_name2info(optarg, &state.fs_linkid, + NULL, NULL, NULL) != DLADM_STATUS_OK) + die("invalid link '%s'", optarg); + break; + case 'o': + if (strcasecmp(optarg, "all") == 0) + fields_str = all_fields; + else + fields_str = optarg; + break; + default: + die_opterr(optopt, option); + break; + } + } + + if (optind == (argc - 1)) { + if (strlen(argv[optind]) >= MAXFLOWNAME) + die("flow name too long"); + state.fs_flow = argv[optind]; + } else if (optind != argc) { + usage(); + } + bzero(&state.fs_print, sizeof (print_state_t)); + state.fs_proplist = proplist; + state.fs_status = DLADM_STATUS_OK; + + fields = parse_output_fields(fields_str, flowprop_fields, + FLOWPROP_MAX_FIELDS, CMD_TYPE_ANY, &nfields); + + if 
(fields == NULL) { + die("invalid field(s) specified"); + return; + } + + state.fs_print.ps_fields = fields; + state.fs_print.ps_nfields = nfields; + + /* Show properties for one flow */ + if (state.fs_flow != NULL) { + show_flowprop_one_flow(&state, state.fs_flow); + + /* Show properties for all flows on one link */ + } else if (state.fs_linkid != DATALINK_INVALID_LINKID) { + (void) show_flowprop_onelink(state.fs_linkid, &state); + + /* Show properties for all flows on all links */ + } else { + (void) dladm_walk_datalink_id(show_flowprop_onelink, &state, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE, + DLADM_OPT_ACTIVE); + } + + dladm_free_props(proplist); +} + +static void +show_flowprop_one_flow(void *arg, const char *flow) +{ + int i; + char *buf; + dladm_status_t status; + dladm_arg_list_t *proplist = NULL; + show_flowprop_state_t *statep = arg; + dladm_flow_attr_t attr; + const char *savep; + + /* + * Do not print flow props for invalid flows. + */ + if ((status = dladm_flow_info(flow, &attr)) != DLADM_STATUS_OK) { + die("invalid flow: '%s'", flow); + } + + savep = statep->fs_flow; + statep->fs_flow = flow; + + proplist = statep->fs_proplist; + + buf = malloc((sizeof (char *) + DLADM_PROP_VAL_MAX) + * DLADM_MAX_PROP_VALCNT + MAX_PROP_LINE); + if (buf == NULL) + die("insufficient memory"); + + statep->fs_propvals = (char **)(void *)buf; + for (i = 0; i < DLADM_MAX_PROP_VALCNT; i++) { + statep->fs_propvals[i] = buf + + sizeof (char *) * DLADM_MAX_PROP_VALCNT + + i * DLADM_PROP_VAL_MAX; + } + statep->fs_line = buf + + (sizeof (char *) + DLADM_PROP_VAL_MAX) * DLADM_MAX_PROP_VALCNT; + + /* show only specified flow properties */ + if (proplist != NULL) { + for (i = 0; i < proplist->al_count; i++) { + if (show_one_flowprop(statep, + proplist->al_info[i].ai_name) != DLADM_STATUS_OK) + break; + } + + /* show all flow properties */ + } else { + status = dladm_walk_flowprop(show_one_flowprop, flow, statep); + if (status != DLADM_STATUS_OK) + die_dlerr(status, "show-flowprop"); + } + free(buf); + statep->fs_flow = savep; +} + +typedef struct { + char *s_buf; + char **s_fields; /* array of pointer to the fields in s_buf */ + uint_t s_nfields; /* the number of fields in s_buf */ +} split_t; + +/* + * Free the split_t structure pointed to by `sp'. + */ +static void +splitfree(split_t *sp) +{ + free(sp->s_buf); + free(sp->s_fields); + free(sp); +} + +/* + * Split `str' into at most `maxfields' fields, each field at most `maxlen' in + * length. Return a pointer to a split_t containing the split fields, or NULL + * on failure. 
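+ *
+ * For example (sketch):
+ *
+ *	split_t *sp = split("flow,link,ipaddr", 10, MAX_FIELD_LEN);
+ *
+ * yields sp->s_nfields == 3 with s_fields[] pointing at "flow",
+ * "link" and "ipaddr" inside the copied buffer; the caller releases
+ * it with splitfree(sp).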
+ */ +static split_t * +split(const char *str, uint_t maxfields, uint_t maxlen) +{ + char *field, *token, *lasts = NULL; + split_t *sp; + + if (*str == '\0' || maxfields == 0 || maxlen == 0) + return (NULL); + + sp = calloc(sizeof (split_t), 1); + if (sp == NULL) + return (NULL); + + sp->s_buf = strdup(str); + sp->s_fields = malloc(sizeof (char *) * maxfields); + if (sp->s_buf == NULL || sp->s_fields == NULL) + goto fail; + + token = sp->s_buf; + while ((field = strtok_r(token, ",", &lasts)) != NULL) { + if (sp->s_nfields == maxfields || strlen(field) > maxlen) + goto fail; + token = NULL; + sp->s_fields[sp->s_nfields++] = field; + } + return (sp); +fail: + splitfree(sp); + return (NULL); +} + +static print_field_t ** +parse_output_fields(char *str, print_field_t *template, int max_fields, + uint_t cmdtype, uint_t *countp) +{ + split_t *sp; + boolean_t good_match = B_FALSE; + uint_t i, j; + print_field_t **pf = NULL; + + sp = split(str, max_fields, MAX_FIELD_LEN); + + if (sp == NULL) + return (NULL); + + pf = malloc(sp->s_nfields * sizeof (print_field_t *)); + if (pf == NULL) + goto fail; + + for (i = 0; i < sp->s_nfields; i++) { + for (j = 0; j < max_fields; j++) { + if (strcasecmp(sp->s_fields[i], + template[j].pf_name) == 0) { + good_match = template[j]. pf_cmdtype & cmdtype; + break; + } + } + if (!good_match) + goto fail; + + good_match = B_FALSE; + pf[i] = &template[j]; + } + *countp = i; + splitfree(sp); + return (pf); +fail: + free(pf); + splitfree(sp); + return (NULL); +} + +static void +flowadm_print_output(print_state_t *statep, boolean_t parseable, + print_callback_t fn, void *arg) +{ + int i; + char *value; + print_field_t **pf; + + pf = statep->ps_fields; + for (i = 0; i < statep->ps_nfields; i++) { + statep->ps_lastfield = (i + 1 == statep->ps_nfields); + value = (*fn)(pf[i], arg); + if (value != NULL) + print_field(statep, pf[i], value, parseable); + } + (void) putchar('\n'); +} + +static void +print_header(print_state_t *ps) +{ + int i; + print_field_t **pf; + + pf = ps->ps_fields; + for (i = 0; i < ps->ps_nfields; i++) { + ps->ps_lastfield = (i + 1 == ps->ps_nfields); + print_field(ps, pf[i], pf[i]->pf_header, B_FALSE); + } + (void) putchar('\n'); +} + +static void +print_field(print_state_t *statep, print_field_t *pfp, const char *value, + boolean_t parseable) +{ + uint_t width = pfp->pf_width; + uint_t valwidth = strlen(value); + uint_t compress; + + if (parseable) { + (void) printf("%s=\"%s\"", pfp->pf_header, value); + } else { + if (value[0] == '\0') + value = STR_UNDEF_VAL; + if (statep->ps_lastfield) { + (void) printf("%s", value); + return; + } + + if (valwidth > width) { + statep->ps_overflow += valwidth - width; + } else if (valwidth < width && statep->ps_overflow > 0) { + compress = min(statep->ps_overflow, width - valwidth); + statep->ps_overflow -= compress; + width -= compress; + } + (void) printf("%-*s", width, value); + } + + if (!statep->ps_lastfield) + (void) putchar(' '); +} diff --git a/usr/src/cmd/flowadm/flowadm.conf b/usr/src/cmd/flowadm/flowadm.conf new file mode 100644 index 0000000000..3977ddf645 --- /dev/null +++ b/usr/src/cmd/flowadm/flowadm.conf @@ -0,0 +1,28 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# DO NOT EDIT OR PARSE THIS FILE! +# +# Use the flowadm(1m) command to change the contents of this file. + diff --git a/usr/src/cmd/flowadm/flowadm.xcl b/usr/src/cmd/flowadm/flowadm.xcl new file mode 100644 index 0000000000..856a788ed6 --- /dev/null +++ b/usr/src/cmd/flowadm/flowadm.xcl @@ -0,0 +1,113 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# + +msgid "--" +msgid "--," +msgid "" +msgid " " +msgid "%-*s" +msgid "%-10llu" +msgid "%-12llu" +msgid "%-12s" +msgid "%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n" +msgid "%-8llu" +msgid "%-8llu\n" +msgid "%d" +msgid "%s" +msgid "%s " +msgid "%s," +msgid "%s/%d " +msgid "%s: " +msgid "%s=\"%s\"" +msgid "," +msgid "/" +msgid "0x%x" +msgid ": %s\n" +msgid ":d:R:t" +msgid ":p:R:t" +msgid ":p:cPl:o:" +msgid "?" 
+msgid "ATTR" +msgid "DEFAULT" +msgid "FLOW" +msgid "ICMPV6" +msgid "ICMPv6" +msgid "IERRORS" +msgid "IPACKETS" +msgid "LINK" +msgid "NAME" +msgid "OBYTES" +msgid "OERRORS" +msgid "OPACKETS" +msgid "POSSIBLE" +msgid "PROPERTY" +msgid "RBYTES" +msgid "SCTP" +msgid "TCP" +msgid "UDP" +msgid "VALUE" +msgid "add-flow" +msgid "all" +msgid "attr" +msgid "default" +msgid "dsfield" +msgid "dsfield_mask" +msgid "flow" +msgid "flow,property,value,default,possible" +msgid "icmp" +msgid "icmpv6" +msgid "init-flow" +msgid "interval" +msgid "link" +msgid "local_ip" +msgid "local_port" +msgid "name" +msgid "name,link,attr,value" +msgid "net_rawaccess" +msgid "parseable" +msgid "possible" +msgid "prop" +msgid "property" +msgid "psSi:l:o:" +msgid "remote_ip" +msgid "remove-flow" +msgid "reset" +msgid "reset-flowprop" +msgid "root-dir" +msgid "sctp" +msgid "set" +msgid "set-flowprop" +msgid "show-flow" +msgid "show-flowprop" +msgid "show-usage" +msgid "statistics" +msgid "sys_net_config" +msgid "tR:l:a:p:" +msgid "tcp" +msgid "tdps:e:f:" +msgid "temporary" +msgid "transport" +msgid "udp" +msgid "value" diff --git a/usr/src/cmd/flowadm/flowprop.conf b/usr/src/cmd/flowadm/flowprop.conf new file mode 100644 index 0000000000..ad6f802040 --- /dev/null +++ b/usr/src/cmd/flowadm/flowprop.conf @@ -0,0 +1,29 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# +# DO NOT EDIT OR PARSE THIS FILE! +# +# Use the flowadm(1m) command to change the contents of this file. + diff --git a/usr/src/cmd/mdb/Makefile.common b/usr/src/cmd/mdb/Makefile.common index 5677289bc9..ed27426b8d 100644 --- a/usr/src/cmd/mdb/Makefile.common +++ b/usr/src/cmd/mdb/Makefile.common @@ -24,7 +24,8 @@ # # MDB modules used for debugging user processes that every ISA's build # subdirectory will need to build. -# +# + COMMON_MODULES_PROC = \ dof \ libavl \ @@ -70,6 +71,7 @@ COMMON_MODULES_KVM = \ krtld \ lofs \ logindmux \ + mac \ md \ nca \ nsctl \ diff --git a/usr/src/cmd/mdb/common/modules/mac/mac.c b/usr/src/cmd/mdb/common/modules/mac/mac.c new file mode 100644 index 0000000000..0f1effb4b2 --- /dev/null +++ b/usr/src/cmd/mdb/common/modules/mac/mac.c @@ -0,0 +1,685 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/mdb_modapi.h> +#include <sys/types.h> +#include <inet/ip.h> +#include <inet/ip6.h> + +#include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_flow_impl.h> +#include <sys/mac_soft_ring.h> + +#define STRSIZE 64 +#define MAC_RX_SRS_SIZE (MAX_RINGS_PER_GROUP * sizeof (uintptr_t)) + +#define LAYERED_WALKER_FOR_FLOW "flow_entry_cache" +#define LAYERED_WALKER_FOR_SRS "mac_srs_cache" +#define LAYERED_WALKER_FOR_RING "mac_ring_cache" + +/* arguments passed to mac_flow dee-command */ +#define MAC_FLOW_NONE 0x01 +#define MAC_FLOW_ATTR 0x02 +#define MAC_FLOW_PROP 0x04 +#define MAC_FLOW_RX 0x08 +#define MAC_FLOW_TX 0x10 +#define MAC_FLOW_USER 0x20 +#define MAC_FLOW_STATS 0x40 +#define MAC_FLOW_MISC 0x80 + +/* arguments passed to mac_srs dee-command */ +#define MAC_SRS_RX 0x01 +#define MAC_SRS_TX 0x02 + +static char * +mac_flow_proto2str(uint8_t protocol) +{ + switch (protocol) { + case IPPROTO_TCP: + return ("tcp"); + case IPPROTO_UDP: + return ("udp"); + case IPPROTO_SCTP: + return ("sctp"); + case IPPROTO_ICMP: + return ("icmp"); + case IPPROTO_ICMPV6: + return ("icmpv6"); + default: + return ("--"); + } +} + +static char * +mac_flow_priority2str(mac_priority_level_t prio) +{ + switch (prio) { + case MPL_LOW: + return ("low"); + case MPL_MEDIUM: + return ("medium"); + case MPL_HIGH: + return ("high"); + case MPL_RESET: + return ("reset"); + default: + return ("--"); + } +} + +/* + * Convert bandwidth in bps to a string in mpbs. 
+ */ +static char * +mac_flow_bw2str(uint64_t bw, char *buf, ssize_t len) +{ + int kbps, mbps; + + kbps = (bw % 1000000)/1000; + mbps = bw/1000000; + if ((mbps == 0) && (kbps != 0)) + mdb_snprintf(buf, len, "0.%03u", kbps); + else + mdb_snprintf(buf, len, "%5u", mbps); + return (buf); +} + +static void +mac_flow_print_header(uint_t args) +{ + switch (args) { + case MAC_FLOW_NONE: + mdb_printf("%<u>%?s %-32s %-6s %?s %?s %-20s%</u>\n", + "ADDR", "FLOW NAME", "LINKID", "MCIP", "MIP", + "MIP NAME"); + break; + case MAC_FLOW_ATTR: + mdb_printf("%<u>%?s %-32s %-7s %6s " + "%-9s %s%</u>\n", + "ADDR", "FLOW NAME", "PROTO", "PORT", + "DSFLD:MSK", "IPADDR"); + break; + case MAC_FLOW_PROP: + mdb_printf("%<u>%?s %-32s %8s %9s%</u>\n", + "ADDR", "FLOW NAME", "MAXBW(M)", "PRIORITY"); + break; + case MAC_FLOW_MISC: + mdb_printf("%<u>%?s %-32s %10s %10s " + "%32s %s%</u>\n", + "ADDR", "FLOW NAME", "TYPE", "FLAGS", + "MATCH_FN", "ZONE"); + break; + case MAC_FLOW_RX: + mdb_printf("%<u>%?s %-24s %-30s %?s " + "%?s %7s %s%</u>\n", + "ADDR", "FLOW NAME", "CB_FUNC", "CB_ARG1", + "CB_ARG2", "SRS_CNT", "RX_SRS"); + break; + case MAC_FLOW_TX: + mdb_printf("%<u>%?s %-32s %?s %</u>\n", + "ADDR", "FLOW NAME", "TX_SRS"); + break; + case MAC_FLOW_STATS: + mdb_printf("%<u>%?s %-32s %?s %?s%</u>\n", + "ADDR", "FLOW NAME", "RBYTES", "OBYTES"); + break; + } +} + +/* + * Display selected fields of the flow_entry_t structure + */ +static int +mac_flow_dcmd_output(uintptr_t addr, uint_t flags, uint_t args) +{ + static const mdb_bitmask_t flow_type_bits[] = { + {"P", FLOW_PRIMARY_MAC, FLOW_PRIMARY_MAC}, + {"V", FLOW_VNIC_MAC, FLOW_VNIC_MAC}, + {"M", FLOW_MCAST, FLOW_MCAST}, + {"O", FLOW_OTHER, FLOW_OTHER}, + {"U", FLOW_USER, FLOW_USER}, + {"V", FLOW_VNIC, FLOW_VNIC}, + {"NS", FLOW_NO_STATS, FLOW_NO_STATS}, + { NULL, 0, 0 } + }; +#define FLOW_MAX_TYPE (sizeof (flow_type_bits) / sizeof (mdb_bitmask_t)) + + static const mdb_bitmask_t flow_flag_bits[] = { + {"Q", FE_QUIESCE, FE_QUIESCE}, + {"W", FE_WAITER, FE_WAITER}, + {"T", FE_FLOW_TAB, FE_FLOW_TAB}, + {"G", FE_G_FLOW_HASH, FE_G_FLOW_HASH}, + {"I", FE_INCIPIENT, FE_INCIPIENT}, + {"C", FE_CONDEMNED, FE_CONDEMNED}, + {"NU", FE_UF_NO_DATAPATH, FE_UF_NO_DATAPATH}, + {"NC", FE_MC_NO_DATAPATH, FE_MC_NO_DATAPATH}, + { NULL, 0, 0 } + }; +#define FLOW_MAX_FLAGS (sizeof (flow_flag_bits) / sizeof (mdb_bitmask_t)) + flow_entry_t fe; + mac_client_impl_t mcip; + mac_impl_t mip; + + if (mdb_vread(&fe, sizeof (fe), addr) == -1) { + mdb_warn("failed to read struct flow_entry_s at %p", addr); + return (DCMD_ERR); + } + if (args & MAC_FLOW_USER) { + args &= ~MAC_FLOW_USER; + if (fe.fe_type & FLOW_MCAST) { + if (DCMD_HDRSPEC(flags)) + mac_flow_print_header(args); + return (DCMD_OK); + } + } + if (DCMD_HDRSPEC(flags)) + mac_flow_print_header(args); + bzero(&mcip, sizeof (mcip)); + bzero(&mip, sizeof (mip)); + if (fe.fe_mcip != NULL && mdb_vread(&mcip, sizeof (mcip), + (uintptr_t)fe.fe_mcip) == sizeof (mcip)) { + (void) mdb_vread(&mip, sizeof (mip), (uintptr_t)mcip.mci_mip); + } + switch (args) { + case MAC_FLOW_NONE: { + mdb_printf("%?p %-32s %6d %?p " + "%?p %-20s\n", + addr, fe.fe_flow_name, fe.fe_link_id, fe.fe_mcip, + mcip.mci_mip, mip.mi_name); + break; + } + case MAC_FLOW_ATTR: { + struct in_addr in4; + uintptr_t desc_addr; + flow_desc_t fdesc; + + desc_addr = addr + OFFSETOF(flow_entry_t, fe_flow_desc); + if (mdb_vread(&fdesc, sizeof (fdesc), desc_addr) == -1) { + mdb_warn("failed to read struct flow_description at %p", + desc_addr); + return (DCMD_ERR); + } + mdb_printf("%?p %-32s " + "%-7s 
%6d"
+ "%4d:%-4d ",
+ addr, fe.fe_flow_name,
+ mac_flow_proto2str(fdesc.fd_protocol), fdesc.fd_local_port,
+ fdesc.fd_dsfield, fdesc.fd_dsfield_mask);
+ if (fdesc.fd_ipversion == IPV4_VERSION) {
+ IN6_V4MAPPED_TO_INADDR(&fdesc.fd_local_addr, &in4);
+ mdb_printf("%I", in4.s_addr);
+ } else if (fdesc.fd_ipversion == IPV6_VERSION) {
+ mdb_printf("%N", &fdesc.fd_local_addr);
+ } else {
+ mdb_printf("%s", "--");
+ }
+ mdb_printf("\n");
+ break;
+ }
+ case MAC_FLOW_PROP: {
+ uintptr_t prop_addr;
+ char bwstr[STRSIZE];
+ mac_resource_props_t fprop;
+
+ prop_addr = addr + OFFSETOF(flow_entry_t, fe_resource_props);
+ if (mdb_vread(&fprop, sizeof (fprop), prop_addr) == -1) {
+ mdb_warn("failed to read struct mac_resource_props "
+ "at %p", prop_addr);
+ return (DCMD_ERR);
+ }
+ mdb_printf("%?p %-32s "
+ "%8s %9s\n",
+ addr, fe.fe_flow_name,
+ mac_flow_bw2str(fprop.mrp_maxbw, bwstr, STRSIZE),
+ mac_flow_priority2str(fprop.mrp_priority));
+ break;
+ }
+ case MAC_FLOW_MISC: {
+ char flow_flags[2 * FLOW_MAX_FLAGS];
+ char flow_type[2 * FLOW_MAX_TYPE];
+ GElf_Sym sym;
+ char func_name[MDB_SYM_NAMLEN] = "";
+ uintptr_t func, match_addr;
+
+ match_addr = addr + OFFSETOF(flow_entry_t, fe_match);
+ (void) mdb_vread(&func, sizeof (func), match_addr);
+ (void) mdb_lookup_by_addr(func, MDB_SYM_EXACT, func_name,
+ MDB_SYM_NAMLEN, &sym);
+ mdb_snprintf(flow_flags, 2 * FLOW_MAX_FLAGS, "%hb",
+ fe.fe_flags, flow_flag_bits);
+ mdb_snprintf(flow_type, 2 * FLOW_MAX_TYPE, "%hb",
+ fe.fe_type, flow_type_bits);
+ mdb_printf("%?p %-32s %10s %10s "
+ "%32s %-d\n",
+ addr, fe.fe_flow_name, flow_type, flow_flags,
+ func_name, fe.fe_zoneid);
+ break;
+ }
+ case MAC_FLOW_RX: {
+ uintptr_t rx_srs[MAX_RINGS_PER_GROUP] = {0};
+ char cb_fn[MDB_SYM_NAMLEN] = "";
+ uintptr_t cb_fnaddr, fnaddr, rxaddr;
+ int i;
+ GElf_Sym sym;
+
+ rxaddr = addr + OFFSETOF(flow_entry_t, fe_rx_srs);
+ (void) mdb_vread(rx_srs, MAC_RX_SRS_SIZE, rxaddr);
+ fnaddr = addr + OFFSETOF(flow_entry_t, fe_cb_fn);
+ (void) mdb_vread(&cb_fnaddr, sizeof (cb_fnaddr), fnaddr);
+ (void) mdb_lookup_by_addr(cb_fnaddr, MDB_SYM_EXACT, cb_fn,
+ MDB_SYM_NAMLEN, &sym);
+ mdb_printf("%?p %-24s %-30s %?p "
+ "%?p %7d ",
+ addr, fe.fe_flow_name, cb_fn, fe.fe_cb_arg1,
+ fe.fe_cb_arg2, fe.fe_rx_srs_cnt);
+ for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
+ if (rx_srs[i] == 0)
+ continue;
+ mdb_printf("%p ", rx_srs[i]);
+ }
+ mdb_printf("\n");
+ break;
+ }
+ case MAC_FLOW_TX: {
+ uintptr_t tx_srs = 0, txaddr;
+
+ txaddr = addr + OFFSETOF(flow_entry_t, fe_tx_srs);
+ (void) mdb_vread(&tx_srs, sizeof (uintptr_t), txaddr);
+ mdb_printf("%?p %-32s %?p\n",
+ addr, fe.fe_flow_name, tx_srs);
+ break;
+ }
+ case MAC_FLOW_STATS: {
+ mdb_printf("%?p %-32s %16llu %16llu\n",
+ addr, fe.fe_flow_name, fe.fe_flowstats.fs_rbytes,
+ fe.fe_flowstats.fs_obytes);
+ break;
+ }
+ }
+ return (DCMD_OK);
+}
+
+/*
+ * Parse the arguments passed to the dcmd and print one or all flow_entry_t
+ * structures.
+ */
+static int
+mac_flow_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ uint_t args = 0;
+
+ if (!(flags & DCMD_ADDRSPEC)) {
+ if (mdb_walk_dcmd("mac_flow", "mac_flow", argc, argv) == -1) {
+ mdb_warn("failed to walk 'mac_flow'");
+ return (DCMD_ERR);
+ }
+ return (DCMD_OK);
+ }
+ if ((mdb_getopts(argc, argv,
+ 'a', MDB_OPT_SETBITS, MAC_FLOW_ATTR, &args,
+ 'p', MDB_OPT_SETBITS, MAC_FLOW_PROP, &args,
+ 'm', MDB_OPT_SETBITS, MAC_FLOW_MISC, &args,
+ 'r', MDB_OPT_SETBITS, MAC_FLOW_RX, &args,
+ 't', MDB_OPT_SETBITS, MAC_FLOW_TX, &args,
+ 's', MDB_OPT_SETBITS, MAC_FLOW_STATS, &args,
+ 
'u', MDB_OPT_SETBITS, MAC_FLOW_USER, &args) != argc)) {
+ return (DCMD_USAGE);
+ }
+ if (argc > 2 || (argc == 2 && !(args & MAC_FLOW_USER)))
+ return (DCMD_USAGE);
+ /*
+ * If no arguments were specified, or only "-u" was specified,
+ * default to printing basic flow information.
+ */
+ if (args == 0 || args == MAC_FLOW_USER)
+ args |= MAC_FLOW_NONE;
+
+ return (mac_flow_dcmd_output(addr, flags, args));
+}
+
+static void
+mac_flow_help(void)
+{
+ mdb_printf("If an address is specified, then flow_entry structure at "
+ "that address is printed. Otherwise all the flows in the system "
+ "are printed.\n");
+ mdb_printf("Options:\n"
+ "\t-u\tdisplay user defined link & vnic flows.\n"
+ "\t-a\tdisplay flow attributes\n"
+ "\t-p\tdisplay flow properties\n"
+ "\t-r\tdisplay rx side information\n"
+ "\t-t\tdisplay tx side information\n"
+ "\t-s\tdisplay flow statistics\n"
+ "\t-m\tdisplay miscellaneous flow information\n\n");
+ mdb_printf("%<u>Interpreting Flow type and Flow flags output.%</u>\n");
+ mdb_printf("Flow Types:\n");
+ mdb_printf("\t P --> FLOW_PRIMARY_MAC\n");
+ mdb_printf("\t V --> FLOW_VNIC_MAC\n");
+ mdb_printf("\t M --> FLOW_MCAST\n");
+ mdb_printf("\t O --> FLOW_OTHER\n");
+ mdb_printf("\t U --> FLOW_USER\n");
+ mdb_printf("\t NS --> FLOW_NO_STATS\n\n");
+ mdb_printf("Flow Flags:\n");
+ mdb_printf("\t Q --> FE_QUIESCE\n");
+ mdb_printf("\t W --> FE_WAITER\n");
+ mdb_printf("\t T --> FE_FLOW_TAB\n");
+ mdb_printf("\t G --> FE_G_FLOW_HASH\n");
+ mdb_printf("\t I --> FE_INCIPIENT\n");
+ mdb_printf("\t C --> FE_CONDEMNED\n");
+ mdb_printf("\t NU --> FE_UF_NO_DATAPATH\n");
+ mdb_printf("\t NC --> FE_MC_NO_DATAPATH\n");
+}
+
+/*
+ * Called once by the debugger when the mac_flow walk begins.
+ */
+static int
+mac_flow_walk_init(mdb_walk_state_t *wsp)
+{
+ if (mdb_layered_walk(LAYERED_WALKER_FOR_FLOW, wsp) == -1) {
+ mdb_warn("failed to walk 'mac_flow'");
+ return (WALK_ERR);
+ }
+ return (WALK_NEXT);
+}
+
+/*
+ * Common walker step function for flow_entry_t, mac_soft_ring_set_t and
+ * mac_ring_t.
+ *
+ * Steps through each flow_entry_t and calls the callback function. If the
+ * user executed ::walk mac_flow, it just prints the address; if the user
+ * executed ::mac_flow, it displays selected fields of the flow_entry_t
+ * structure by calling mac_flow_dcmd().
+ */
+static int
+mac_common_walk_step(mdb_walk_state_t *wsp)
+{
+ int status;
+
+ if (wsp->walk_addr == NULL)
+ return (WALK_DONE);
+
+ status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data,
+ wsp->walk_cbdata);
+
+ return (status);
+}
+
+static char *
+mac_srs_txmode2str(mac_tx_srs_mode_t mode)
+{
+ switch (mode) {
+ case SRS_TX_DEFAULT:
+ return ("default");
+ case SRS_TX_SERIALIZE:
+ return ("serialize");
+ case SRS_TX_FANOUT:
+ return ("fanout");
+ case SRS_TX_BW:
+ return ("bw");
+ case SRS_TX_BW_FANOUT:
+ return ("bw fanout");
+ }
+ return ("--");
+}
+
+static void
+mac_srs_help(void)
+{
+ mdb_printf("If an address is specified, then mac_soft_ring_set "
+ "structure at that address is printed. 
Otherwise all the "
+ "SRS in the system are printed.\n");
+ mdb_printf("Options:\n"
+ "\t-r\tdisplay receive side SRS structures\n"
+ "\t-t\tdisplay transmit side SRS structures\n");
+}
+
+static int
+mac_srs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ uint_t args = 0;
+ mac_soft_ring_set_t srs;
+
+ if (!(flags & DCMD_ADDRSPEC)) {
+ if (mdb_walk_dcmd("mac_srs", "mac_srs", argc, argv) == -1) {
+ mdb_warn("failed to walk 'mac_srs'");
+ return (DCMD_ERR);
+ }
+ return (DCMD_OK);
+ }
+ if ((mdb_getopts(argc, argv,
+ 'r', MDB_OPT_SETBITS, MAC_SRS_RX, &args,
+ 't', MDB_OPT_SETBITS, MAC_SRS_TX, &args) != argc)) {
+ return (DCMD_USAGE);
+ }
+ if (argc > 1)
+ return (DCMD_USAGE);
+
+ if (mdb_vread(&srs, sizeof (srs), addr) == -1) {
+ mdb_warn("failed to read struct mac_soft_ring_set_s at %p",
+ addr);
+ return (DCMD_ERR);
+ }
+
+ switch (args) {
+ case MAC_SRS_RX: {
+ GElf_Sym sym;
+ char func_name[MDB_SYM_NAMLEN] = "";
+ char l_proc_name[MDB_SYM_NAMLEN] = "";
+ uintptr_t func, lproc, funcaddr, lprocaddr, rxaddr;
+
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %8s %-8s "
+ "%8s %-20s %-s%</u>\n",
+ "ADDR", "MBLK_CNT", "Q_BYTES",
+ "POLL_CNT", "SR_FUNC", "SR_LOWER_FUNC");
+ }
+ if (srs.srs_type & SRST_TX)
+ return (DCMD_OK);
+ rxaddr = addr + OFFSETOF(mac_soft_ring_set_t, srs_rx);
+ funcaddr = rxaddr + OFFSETOF(mac_srs_rx_t, sr_func);
+ lprocaddr = rxaddr + OFFSETOF(mac_srs_rx_t, sr_lower_proc);
+ (void) mdb_vread(&func, sizeof (func), funcaddr);
+ (void) mdb_vread(&lproc, sizeof (lproc), lprocaddr);
+ (void) mdb_lookup_by_addr(func, MDB_SYM_EXACT, func_name,
+ MDB_SYM_NAMLEN, &sym);
+ (void) mdb_lookup_by_addr(lproc, MDB_SYM_EXACT, l_proc_name,
+ MDB_SYM_NAMLEN, &sym);
+ mdb_printf("%?p %-8d %-8d "
+ "%-8d %-20s %-s\n",
+ addr, srs.srs_count, srs.srs_size,
+ srs.srs_rx.sr_poll_count, func_name, l_proc_name);
+ break;
+ }
+ case MAC_SRS_TX: {
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %-10s %-5s %-7s %-7s "
+ "%-7s %-7s %-7s%</u>\n",
+ "ADDR", "TX_MODE", "WOKEN", "DROP", "BLOCK",
+ "UNBLOCK", "MBLK", "SR_CNT");
+ }
+ if (!(srs.srs_type & SRST_TX))
+ return (DCMD_OK);
+
+ mdb_printf("%?p %-10s "
+ "%-5d %-7d "
+ "%-7d %-7d "
+ "%-7d %-7d\n",
+ addr, mac_srs_txmode2str(srs.srs_tx.st_mode),
+ srs.srs_tx.st_woken_up, srs.srs_tx.st_drop_count,
+ srs.srs_tx.st_blocked_cnt, srs.srs_tx.st_unblocked_cnt,
+ srs.srs_count, srs.srs_oth_ring_count);
+ break;
+ }
+ default: {
+ if (DCMD_HDRSPEC(flags)) {
+ mdb_printf("%<u>%?s %?s %?s %?s %-3s "
+ "%-8s %-8s %-7s %</u>\n",
+ "ADDR", "MCIP", "FLENT", "RING", "DIR",
+ "TYPE", "STATE", "SR_CNT");
+ }
+ mdb_printf("%?p %?p %?p %?p "
+ "%-3s "
+ "%08x %08x %-7d \n",
+ addr, srs.srs_mcip, srs.srs_flent, srs.srs_ring,
+ (srs.srs_type & SRST_TX ? 
"TX" : "RX"), + srs.srs_type, srs.srs_state, srs.srs_soft_ring_count); + break; + } + } + return (DCMD_OK); +} + +static int +mac_srs_walk_init(mdb_walk_state_t *wsp) +{ + if (mdb_layered_walk(LAYERED_WALKER_FOR_SRS, wsp) == -1) { + mdb_warn("failed to walk 'mac_srs'"); + return (WALK_ERR); + } + return (WALK_NEXT); +} + +static char * +mac_ring_state2str(mac_ring_state_t state) +{ + switch (state) { + case MR_FREE: + return ("free"); + case MR_NEWLY_ADDED: + return ("new"); + case MR_INUSE: + return ("inuse"); + } + return ("--"); +} + +static char * +mac_ring_classify2str(mac_classify_type_t classify) +{ + switch (classify) { + case MAC_NO_CLASSIFIER: + return ("no"); + case MAC_SW_CLASSIFIER: + return ("sw"); + case MAC_HW_CLASSIFIER: + return ("hw"); + } + return ("--"); +} + +static int +mac_ring_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + mac_ring_t ring; + mac_group_t group; + flow_entry_t flent; + mac_soft_ring_set_t srs; + + if (!(flags & DCMD_ADDRSPEC)) { + if (mdb_walk_dcmd("mac_ring", "mac_ring", argc, argv) == -1) { + mdb_warn("failed to walk 'mac_ring'"); + return (DCMD_ERR); + } + return (DCMD_OK); + } + if (mdb_vread(&ring, sizeof (ring), addr) == -1) { + mdb_warn("failed to read struct mac_ring_s at %p", addr); + return (DCMD_ERR); + } + bzero(&flent, sizeof (flent)); + if (mdb_vread(&srs, sizeof (srs), (uintptr_t)ring.mr_srs) != -1) { + (void) mdb_vread(&flent, sizeof (flent), + (uintptr_t)srs.srs_flent); + } + (void) mdb_vread(&group, sizeof (group), (uintptr_t)ring.mr_gh); + if (DCMD_HDRSPEC(flags)) { + mdb_printf("%<u>%?s %4s %5s %4s %?s " + "%5s %?s %?s %s %</u>\n", + "ADDR", "TYPE", "STATE", "FLAG", "GROUP", + "CLASS", "MIP", "SRS", "FLOW NAME"); + } + mdb_printf("%?p %-4s " + "%5s %04x " + "%?p %-5s " + "%?p %?p %s\n", + addr, ((ring.mr_type == 1)? "RX" : "TX"), + mac_ring_state2str(ring.mr_state), ring.mr_flag, + ring.mr_gh, mac_ring_classify2str(ring.mr_classify_type), + group.mrg_mh, ring.mr_srs, flent.fe_flow_name); + return (DCMD_OK); +} + +static int +mac_ring_walk_init(mdb_walk_state_t *wsp) +{ + if (mdb_layered_walk(LAYERED_WALKER_FOR_RING, wsp) == -1) { + mdb_warn("failed to walk `mac_ring`"); + return (WALK_ERR); + } + return (WALK_NEXT); +} + +static void +mac_ring_help(void) +{ + mdb_printf("If an address is specified, then mac_ring_t " + "structure at that address is printed. 
Otherwise all the " + "hardware rings in the system are printed.\n"); +} + +/* Supported dee-commands */ +static const mdb_dcmd_t dcmds[] = { + {"mac_flow", "?[-u] [-aprtsm]", "display Flow Entry structures", + mac_flow_dcmd, mac_flow_help}, + {"mac_srs", "?[-rt]", "display MAC Soft Ring Set structures", + mac_srs_dcmd, mac_srs_help}, + {"mac_ring", "?", "display MAC ring (hardware) structures", + mac_ring_dcmd, mac_ring_help}, + { NULL } +}; + +/* Supported walkers */ +static const mdb_walker_t walkers[] = { + {"mac_flow", "walk list of flow entry structures", mac_flow_walk_init, + mac_common_walk_step, NULL, NULL}, + {"mac_srs", "walk list of mac soft ring set structures", + mac_srs_walk_init, mac_common_walk_step, NULL, NULL}, + {"mac_ring", "walk list of mac ring structures", mac_ring_walk_init, + mac_common_walk_step, NULL, NULL}, + { NULL } +}; + +static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, walkers }; + +const mdb_modinfo_t * +_mdb_init(void) +{ + return (&modinfo); +} diff --git a/usr/src/cmd/mdb/intel/amd64/mac/Makefile b/usr/src/cmd/mdb/intel/amd64/mac/Makefile new file mode 100644 index 0000000000..6f24b28ea6 --- /dev/null +++ b/usr/src/cmd/mdb/intel/amd64/mac/Makefile @@ -0,0 +1,34 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +MODULE = mac.so +MDBTGT = kvm + +MODSRCS = mac.c + +include ../../../../Makefile.cmd +include ../../../../Makefile.cmd.64 +include ../../Makefile.amd64 +include ../../../Makefile.module diff --git a/usr/src/cmd/mdb/intel/ia32/mac/Makefile b/usr/src/cmd/mdb/intel/ia32/mac/Makefile new file mode 100644 index 0000000000..69c8c97b19 --- /dev/null +++ b/usr/src/cmd/mdb/intel/ia32/mac/Makefile @@ -0,0 +1,33 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +MODULE = mac.so +MDBTGT = kvm + +MODSRCS = mac.c + +include ../../../../Makefile.cmd +include ../../Makefile.ia32 +include ../../../Makefile.module diff --git a/usr/src/cmd/mdb/sparc/v9/mac/Makefile b/usr/src/cmd/mdb/sparc/v9/mac/Makefile new file mode 100644 index 0000000000..1456211245 --- /dev/null +++ b/usr/src/cmd/mdb/sparc/v9/mac/Makefile @@ -0,0 +1,34 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +MODULE = mac.so +MDBTGT = kvm + +MODSRCS = mac.c + +include ../../../../Makefile.cmd +include ../../../../Makefile.cmd.64 +include ../../Makefile.sparcv9 +include ../../../Makefile.module diff --git a/usr/src/cmd/rcm_daemon/Makefile.com b/usr/src/cmd/rcm_daemon/Makefile.com index a7293e76f1..365371c45c 100644 --- a/usr/src/cmd/rcm_daemon/Makefile.com +++ b/usr/src/cmd/rcm_daemon/Makefile.com @@ -22,8 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# include ../../Makefile.cmd @@ -51,6 +49,7 @@ COMMON_MOD_SRC = \ $(COMMON)/swap_rcm.c \ $(COMMON)/network_rcm.c \ $(COMMON)/vlan_rcm.c \ + $(COMMON)/vnic_rcm.c \ $(COMMON)/aggr_rcm.c \ $(COMMON)/ip_rcm.c \ $(COMMON)/cluster_rcm.c \ @@ -71,6 +70,7 @@ COMMON_MOD_OBJ = \ swap_rcm.o \ network_rcm.o \ vlan_rcm.o \ + vnic_rcm.o \ aggr_rcm.o \ ip_rcm.o \ cluster_rcm.o \ @@ -89,6 +89,7 @@ COMMON_RCM_MODS = \ SUNW_swap_rcm.so \ SUNW_network_rcm.so \ SUNW_vlan_rcm.so \ + SUNW_vnic_rcm.so \ SUNW_aggr_rcm.so \ SUNW_ip_rcm.so \ SUNW_cluster_rcm.so \ @@ -121,6 +122,7 @@ SUNW_pool_rcm.so := LDLIBS_MODULES += -L$(ROOT)/usr/lib -lpool SUNW_svm_rcm.so := LDLIBS_MODULES += -L$(ROOT)/usr/lib -lmeta SUNW_network_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_vlan_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm +SUNW_vnic_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_aggr_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -ldladm SUNW_ip_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil -ldladm SUNW_ip_anon_rcm.so := LDLIBS_MODULES += -L$(ROOT)/lib -linetutil diff --git a/usr/src/cmd/rcm_daemon/common/vlan_rcm.c b/usr/src/cmd/rcm_daemon/common/vlan_rcm.c index 1177d5e384..a657baa2d4 100644 --- a/usr/src/cmd/rcm_daemon/common/vlan_rcm.c +++ b/usr/src/cmd/rcm_daemon/common/vlan_rcm.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This RCM module adds support to the RCM framework for VLAN links */ @@ -68,7 +66,6 @@ typedef struct dl_vlan { struct dl_vlan *dv_next; /* next VLAN on the same link */ struct dl_vlan *dv_prev; /* prev VLAN on the same link */ datalink_id_t dv_vlanid; - boolean_t dv_implicit; vlan_flag_t dv_flags; /* VLAN link flags */ } dl_vlan_t; @@ -399,7 +396,6 @@ vlan_online_vlan(link_cache_t *node) if (!(vlan->dv_flags & VLAN_OFFLINED)) continue; - assert(!vlan->dv_implicit); if ((status = dladm_vlan_up(vlan->dv_vlanid)) != DLADM_STATUS_OK) { /* @@ -429,10 +425,6 @@ vlan_offline_vlan(link_cache_t *node, uint32_t flags, cache_node_state_t state) * Try to delete all explicit created VLAN */ for (vlan = node->vc_vlan; vlan != NULL; vlan = vlan->dv_next) { - - if (vlan->dv_implicit) - continue; - if ((status = dladm_vlan_delete(vlan->dv_vlanid, DLADM_OPT_ACTIVE)) != DLADM_STATUS_OK) { rcm_log_message(RCM_WARNING, @@ -918,7 +910,6 @@ vlan_update(datalink_id_t vlanid, void *arg) node->vc_vlan = vlan; } - vlan->dv_implicit = vlan_attr.dv_implicit; node->vc_state &= ~CACHE_NODE_STALE; if (newnode) @@ -1186,18 +1177,16 @@ vlan_notify_new_vlan(rcm_handle_t *hd, char *rsrc) } for (vlan = node->vc_vlan; vlan != NULL; vlan = vlan->dv_next) { - if (!vlan->dv_implicit) { - rcm_log_message(RCM_TRACE2, - "VLAN: vlan_notify_new_vlan add (%u)\n", - vlan->dv_vlanid); + rcm_log_message(RCM_TRACE2, + "VLAN: vlan_notify_new_vlan add (%u)\n", + vlan->dv_vlanid); - id = vlan->dv_vlanid; - if (nvlist_add_uint64(nvl, RCM_NV_LINKID, id) != 0) { - rcm_log_message(RCM_ERROR, - _("VLAN: failed to construct nvlist\n")); - (void) mutex_unlock(&cache_lock); - goto done; - } + id = vlan->dv_vlanid; + if (nvlist_add_uint64(nvl, RCM_NV_LINKID, id) != 0) { + rcm_log_message(RCM_ERROR, + _("VLAN: failed to construct nvlist\n")); + (void) mutex_unlock(&cache_lock); + goto done; } } (void) mutex_unlock(&cache_lock); diff --git a/usr/src/cmd/rcm_daemon/common/vnic_rcm.c b/usr/src/cmd/rcm_daemon/common/vnic_rcm.c new file mode 100644 index 0000000000..178d3b44a8 --- /dev/null +++ b/usr/src/cmd/rcm_daemon/common/vnic_rcm.c @@ -0,0 +1,1329 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * This RCM module adds support to the RCM framework for VNIC links + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> +#include <synch.h> +#include <assert.h> +#include <strings.h> +#include "rcm_module.h" +#include <libintl.h> +#include <libdllink.h> +#include <libdlvnic.h> +#include <libdlpi.h> + +/* + * Definitions + */ +#ifndef lint +#define _(x) gettext(x) +#else +#define _(x) x +#endif + +/* Some generic well-knowns and defaults used in this module */ +#define RCM_LINK_PREFIX "SUNW_datalink" /* RCM datalink name prefix */ +#define RCM_LINK_RESOURCE_MAX (13 + LINKID_STR_WIDTH) + +/* VNIC link flags */ +typedef enum { + VNIC_OFFLINED = 0x1, + VNIC_CONSUMER_OFFLINED = 0x2, + VNIC_STALE = 0x4 +} vnic_flag_t; + +/* link representation */ +typedef struct dl_vnic { + struct dl_vnic *dlv_next; /* next VNIC on the same link */ + struct dl_vnic *dlv_prev; /* prev VNIC on the same link */ + datalink_id_t dlv_vnic_id; + vnic_flag_t dlv_flags; /* VNIC link flags */ +} dl_vnic_t; + +/* VNIC Cache state flags */ +typedef enum { + CACHE_NODE_STALE = 0x1, /* stale cached data */ + CACHE_NODE_NEW = 0x2, /* new cached nodes */ + CACHE_NODE_OFFLINED = 0x4 /* nodes offlined */ +} cache_node_state_t; + +/* Network Cache lookup options */ +#define CACHE_NO_REFRESH 0x1 /* cache refresh not needed */ +#define CACHE_REFRESH 0x2 /* refresh cache */ + +/* Cache element */ +typedef struct link_cache { + struct link_cache *vc_next; /* next cached resource */ + struct link_cache *vc_prev; /* prev cached resource */ + char *vc_resource; /* resource name */ + datalink_id_t vc_linkid; /* linkid */ + dl_vnic_t *vc_vnic; /* VNIC list on this link */ + cache_node_state_t vc_state; /* cache state flags */ +} link_cache_t; + +/* + * Global cache for network VNICs + */ +static link_cache_t cache_head; +static link_cache_t cache_tail; +static mutex_t cache_lock; +static int events_registered = 0; + +/* + * RCM module interface prototypes + */ +static int vnic_register(rcm_handle_t *); +static int vnic_unregister(rcm_handle_t *); +static int vnic_get_info(rcm_handle_t *, char *, id_t, uint_t, + char **, char **, nvlist_t *, rcm_info_t **); +static int vnic_suspend(rcm_handle_t *, char *, id_t, + timespec_t *, uint_t, char **, rcm_info_t **); +static int vnic_resume(rcm_handle_t *, char *, id_t, uint_t, + char **, rcm_info_t **); +static int vnic_offline(rcm_handle_t *, char *, id_t, uint_t, + char **, rcm_info_t **); +static int vnic_undo_offline(rcm_handle_t *, char *, id_t, uint_t, + char **, rcm_info_t **); +static int vnic_remove(rcm_handle_t *, char *, id_t, uint_t, + char **, rcm_info_t **); +static int vnic_notify_event(rcm_handle_t *, char *, id_t, uint_t, + char **, nvlist_t *, rcm_info_t **); +static int vnic_configure(rcm_handle_t *, datalink_id_t); + +/* Module private routines */ +static void cache_free(); +static int cache_update(rcm_handle_t *); +static void cache_remove(link_cache_t *); +static void node_free(link_cache_t *); +static void cache_insert(link_cache_t *); +static link_cache_t *cache_lookup(rcm_handle_t *, char *, char); +static int vnic_consumer_offline(rcm_handle_t *, link_cache_t *, + char **, uint_t, rcm_info_t **); +static void vnic_consumer_online(rcm_handle_t *, link_cache_t *, + char **, uint_t, rcm_info_t **); +static int vnic_offline_vnic(link_cache_t *, uint32_t, + cache_node_state_t); +static void vnic_online_vnic(link_cache_t *); +static char *vnic_usage(link_cache_t *); +static void vnic_log_err(datalink_id_t, 
char **, char *);
+static int vnic_consumer_notify(rcm_handle_t *, datalink_id_t,
+ char **, uint_t, rcm_info_t **);
+
+/* Module-Private data */
+static struct rcm_mod_ops vnic_ops =
+{
+ RCM_MOD_OPS_VERSION,
+ vnic_register,
+ vnic_unregister,
+ vnic_get_info,
+ vnic_suspend,
+ vnic_resume,
+ vnic_offline,
+ vnic_undo_offline,
+ vnic_remove,
+ NULL,
+ NULL,
+ vnic_notify_event
+};
+
+/*
+ * rcm_mod_init() - Update registrations, and return the ops structure.
+ */
+struct rcm_mod_ops *
+rcm_mod_init(void)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: mod_init\n");
+
+ cache_head.vc_next = &cache_tail;
+ cache_head.vc_prev = NULL;
+ cache_tail.vc_prev = &cache_head;
+ cache_tail.vc_next = NULL;
+ (void) mutex_init(&cache_lock, 0, NULL);
+
+ /* Return the ops vectors */
+ return (&vnic_ops);
+}
+
+/*
+ * rcm_mod_info() - Return a string describing this module.
+ */
+const char *
+rcm_mod_info(void)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: mod_info\n");
+
+ return ("VNIC module");
+}
+
+/*
+ * rcm_mod_fini() - Destroy the network VNIC cache.
+ */
+int
+rcm_mod_fini(void)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: mod_fini\n");
+
+ /*
+ * Note that vnic_unregister() does not seem to be called anywhere,
+ * therefore we free the cache nodes here. In theory we should call
+ * rcm_unregister_interest() for each node before we free it, but the
+ * framework does not provide the rcm_handle to allow us to do so.
+ */
+ cache_free();
+ (void) mutex_destroy(&cache_lock);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_register() - Make sure the cache is properly sync'ed, and its
+ * registrations are in order.
+ */
+static int
+vnic_register(rcm_handle_t *hd)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: register\n");
+
+ if (cache_update(hd) < 0)
+ return (RCM_FAILURE);
+
+ /*
+ * Need to register interest in all new resources
+ * getting attached, so we get attach event notifications
+ */
+ if (!events_registered) {
+ if (rcm_register_event(hd, RCM_RESOURCE_LINK_NEW, 0, NULL)
+ != RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to register %s\n"),
+ RCM_RESOURCE_LINK_NEW);
+ return (RCM_FAILURE);
+ } else {
+ rcm_log_message(RCM_DEBUG, "VNIC: registered %s\n",
+ RCM_RESOURCE_LINK_NEW);
+ events_registered++;
+ }
+ }
+
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_unregister() - Walk the cache, unregistering all the networks.
+ */
+static int
+vnic_unregister(rcm_handle_t *hd)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: unregister\n");
+
+ /* Walk the cache, unregistering everything */
+ (void) mutex_lock(&cache_lock);
+ node = cache_head.vc_next;
+ while (node != &cache_tail) {
+ if (rcm_unregister_interest(hd, node->vc_resource, 0)
+ != RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to unregister %s\n"),
+ node->vc_resource);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_FAILURE);
+ }
+ cache_remove(node);
+ node_free(node);
+ node = cache_head.vc_next;
+ }
+ (void) mutex_unlock(&cache_lock);
+
+ /*
+ * Unregister interest in all new resources
+ */
+ if (events_registered) {
+ if (rcm_unregister_event(hd, RCM_RESOURCE_LINK_NEW, 0)
+ != RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to unregister %s\n"),
+ RCM_RESOURCE_LINK_NEW);
+ return (RCM_FAILURE);
+ } else {
+ rcm_log_message(RCM_DEBUG, "VNIC: unregistered %s\n",
+ RCM_RESOURCE_LINK_NEW);
+ events_registered--;
+ }
+ }
+
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_offline() - Offline VNICs on a specific node. 
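+ *
+ * In outline (a sketch of the flow implemented below): the RCM framework
+ * first makes a query pass, with RCM_QUERY set in flags, that only asks
+ * the consumers (e.g. IP interfaces) whether the offline is acceptable;
+ * the commit pass then actually deletes the VNICs:
+ *
+ *	vnic_consumer_offline()	- ask consumers to release the VNICs
+ *	flags & RCM_QUERY	- return after a successful query
+ *	vnic_offline_vnic()	- dladm_vnic_delete() each VNIC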
+ */
+static int
+vnic_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: offline(%s)\n", rsrc);
+
+ /* Lock the cache and lookup the resource */
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_REFRESH);
+ if (node == NULL) {
+ /* should not happen because the resource is registered. */
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "unrecognized resource");
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+ }
+
+ /*
+ * Inform consumers (IP interfaces) of associated VNICs to be offlined
+ */
+ if (vnic_consumer_offline(hd, node, errorp, flags, info) ==
+ RCM_SUCCESS) {
+ rcm_log_message(RCM_DEBUG,
+ "VNIC: consumers agreed on offline\n");
+ } else {
+ vnic_log_err(node->vc_linkid, errorp,
+ "consumers failed to offline");
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_FAILURE);
+ }
+
+ /* Check if it's a query */
+ if (flags & RCM_QUERY) {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: offline query succeeded(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+ }
+
+ if (vnic_offline_vnic(node, VNIC_OFFLINED, CACHE_NODE_OFFLINED) !=
+ RCM_SUCCESS) {
+ vnic_online_vnic(node);
+ vnic_log_err(node->vc_linkid, errorp, "offline failed");
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_FAILURE);
+ }
+
+ rcm_log_message(RCM_TRACE1, "VNIC: Offline succeeded(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_undo_offline() - Undo offline of a previously offlined node.
+ */
+/*ARGSUSED*/
+static int
+vnic_undo_offline(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: online(%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH);
+ if (node == NULL) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp, "no such link");
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOENT;
+ return (RCM_FAILURE);
+ }
+
+ /* If the link was never offlined by us, there is nothing to undo */
+ if (!(node->vc_state & CACHE_NODE_OFFLINED)) {
+ vnic_log_err(node->vc_linkid, errorp, "link not offlined");
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOTSUP;
+ return (RCM_SUCCESS);
+ }
+
+ vnic_online_vnic(node);
+
+ /*
+ * Inform IP interfaces on associated VNICs to be onlined
+ */
+ vnic_consumer_online(hd, node, errorp, flags, info);
+
+ node->vc_state &= ~CACHE_NODE_OFFLINED;
+ rcm_log_message(RCM_TRACE1, "VNIC: online succeeded(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (RCM_SUCCESS);
+}
+
+static void
+vnic_online_vnic(link_cache_t *node)
+{
+ dl_vnic_t *vnic;
+ dladm_status_t status;
+ char errmsg[DLADM_STRSIZE];
+
+ /*
+ * Try to bring up all offlined VNICs
+ */
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ if (!(vnic->dlv_flags & VNIC_OFFLINED))
+ continue;
+
+ if ((status = dladm_vnic_up(vnic->dlv_vnic_id, 0)) !=
+ DLADM_STATUS_OK) {
+ /*
+ * Print a warning message and continue to online
+ * other VNICs. 
+ */
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: VNIC online failed (%u): %s\n"),
+ vnic->dlv_vnic_id,
+ dladm_status2str(status, errmsg));
+ } else {
+ vnic->dlv_flags &= ~VNIC_OFFLINED;
+ }
+ }
+}
+
+static int
+vnic_offline_vnic(link_cache_t *node, uint32_t flags, cache_node_state_t state)
+{
+ dl_vnic_t *vnic;
+ dladm_status_t status;
+ char errmsg[DLADM_STRSIZE];
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_offline_vnic (%s %u %u)\n",
+ node->vc_resource, flags, state);
+
+ /*
+ * Try to delete all explicitly created VNICs
+ */
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+
+ if ((status = dladm_vnic_delete(vnic->dlv_vnic_id,
+ DLADM_OPT_ACTIVE)) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: VNIC offline failed (%u): %s\n"),
+ vnic->dlv_vnic_id,
+ dladm_status2str(status, errmsg));
+ return (RCM_FAILURE);
+ } else {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: VNIC offline succeeded(%u)\n",
+ vnic->dlv_vnic_id);
+ vnic->dlv_flags |= flags;
+ }
+ }
+
+ node->vc_state |= state;
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_get_info() - Gather usage information for this resource.
+ */
+/*ARGSUSED*/
+int
+vnic_get_info(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **usagep, char **errorp, nvlist_t *props, rcm_info_t **info)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: get_info(%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_REFRESH);
+ if (node == NULL) {
+ rcm_log_message(RCM_INFO,
+ _("VNIC: get_info(%s) unrecognized resource\n"), rsrc);
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOENT;
+ return (RCM_FAILURE);
+ }
+
+ *usagep = vnic_usage(node);
+ (void) mutex_unlock(&cache_lock);
+ if (*usagep == NULL) {
+ /* most likely malloc failure */
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: get_info(%s) malloc failure\n"), rsrc);
+ errno = ENOMEM;
+ return (RCM_FAILURE);
+ }
+
+ /* Set client/role properties */
+ (void) nvlist_add_string(props, RCM_CLIENT_NAME, "VNIC");
+
+ rcm_log_message(RCM_TRACE1, "VNIC: get_info(%s) info = %s\n",
+ rsrc, *usagep);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_suspend() - Nothing to do, always okay
+ */
+/*ARGSUSED*/
+static int
+vnic_suspend(rcm_handle_t *hd, char *rsrc, id_t id, timespec_t *interval,
+ uint_t flags, char **errorp, rcm_info_t **info)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: suspend(%s)\n", rsrc);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_resume() - Nothing to do, always okay
+ */
+/*ARGSUSED*/
+static int
+vnic_resume(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ rcm_log_message(RCM_TRACE1, "VNIC: resume(%s)\n", rsrc);
+ return (RCM_SUCCESS);
+}
+
+/*
+ * vnic_consumer_remove()
+ *
+ * Notify VNIC consumers to remove their cached state.
+ */
+static int
+vnic_consumer_remove(rcm_handle_t *hd, link_cache_t *node, uint_t flags,
+ rcm_info_t **info)
+{
+ dl_vnic_t *vnic = NULL;
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+ int ret = RCM_SUCCESS;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_remove (%s)\n",
+ node->vc_resource);
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+
+ /*
+ * This will only be called when the offline operation
+ * succeeds, so the VNIC consumers must have been offlined
+ * at this point. 
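+ *
+ * Each consumer is addressed by the RCM resource name
+ * "SUNW_datalink/<vnic-linkid>" built from RCM_LINK_PREFIX (the
+ * linkid value is illustrative, e.g. "SUNW_datalink/5"); the first
+ * rcm_notify_remove() failure aborts the walk.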
+ */
+ assert(vnic->dlv_flags & VNIC_CONSUMER_OFFLINED);
+
+ (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u",
+ RCM_LINK_PREFIX, vnic->dlv_vnic_id);
+
+ ret = rcm_notify_remove(hd, rsrc, flags, info);
+ if (ret != RCM_SUCCESS) {
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: notify remove failed (%s)\n"), rsrc);
+ break;
+ }
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_remove done\n");
+ return (ret);
+}
+
+/*
+ * vnic_remove() - remove a resource from cache
+ */
+/*ARGSUSED*/
+static int
+vnic_remove(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, rcm_info_t **info)
+{
+ link_cache_t *node;
+ int rv;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: remove(%s)\n", rsrc);
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH);
+ if (node == NULL) {
+ rcm_log_message(RCM_INFO,
+ _("VNIC: remove(%s) unrecognized resource\n"), rsrc);
+ (void) mutex_unlock(&cache_lock);
+ errno = ENOENT;
+ return (RCM_FAILURE);
+ }
+
+ /* remove the cached entry for the resource */
+ cache_remove(node);
+ (void) mutex_unlock(&cache_lock);
+
+ rv = vnic_consumer_remove(hd, node, flags, info);
+ node_free(node);
+ return (rv);
+}
+
+/*
+ * vnic_notify_event - Project-private implementation to receive new resource
+ * events. It intercepts all new resource events. If the
+ * new resource is a network resource, pass up a notify
+ * for it too. The new resource need not be cached here,
+ * since caching is redone at register time.
+ */
+/*ARGSUSED*/
+static int
+vnic_notify_event(rcm_handle_t *hd, char *rsrc, id_t id, uint_t flags,
+ char **errorp, nvlist_t *nvl, rcm_info_t **info)
+{
+ nvpair_t *nvp = NULL;
+ datalink_id_t linkid;
+ uint64_t id64;
+ int rv = RCM_SUCCESS;
+
+ rcm_log_message(RCM_TRACE1, "VNIC: notify_event(%s)\n", rsrc);
+
+ if (strcmp(rsrc, RCM_RESOURCE_LINK_NEW) != 0) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "unrecognized event");
+ errno = EINVAL;
+ return (RCM_FAILURE);
+ }
+
+ /* Update cache to reflect latest VNICs */
+ if (cache_update(hd) < 0) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "private cache update failed");
+ return (RCM_FAILURE);
+ }
+
+ /*
+ * Try our best to recover the configuration.
+ */
+ rcm_log_message(RCM_DEBUG, "VNIC: process_nvlist\n");
+ while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
+ if (strcmp(nvpair_name(nvp), RCM_NV_LINKID) != 0)
+ continue;
+
+ if (nvpair_value_uint64(nvp, &id64) != 0) {
+ vnic_log_err(DATALINK_INVALID_LINKID, errorp,
+ "cannot get linkid");
+ rv = RCM_FAILURE;
+ continue;
+ }
+
+ linkid = (datalink_id_t)id64;
+ if (vnic_configure(hd, linkid) != 0) {
+ vnic_log_err(linkid, errorp, "configuring failed");
+ rv = RCM_FAILURE;
+ continue;
+ }
+
+ /* Notify all VNIC consumers */
+ if (vnic_consumer_notify(hd, linkid, errorp, flags,
+ info) != 0) {
+ vnic_log_err(linkid, errorp, "consumer notify failed");
+ rv = RCM_FAILURE;
+ }
+ }
+
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: notify_event: link configuration complete\n");
+ return (rv);
+}
+
+/*
+ * vnic_usage - Determine the usage of a link.
+ * The returned buffer is owned by the caller, who
+ * must free it when done. 
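+ *
+ * For example (illustrative link names): a link net0 carrying two
+ * VNICs produces a string like "net0 VNICs: vnic1, vnic2", while an
+ * offlined node produces "net0 offlined".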
+ */
+static char *
+vnic_usage(link_cache_t *node)
+{
+ dl_vnic_t *vnic;
+ int nvnic;
+ char *buf;
+ const char *fmt;
+ char *sep;
+ char errmsg[DLADM_STRSIZE];
+ char name[MAXLINKNAMELEN];
+ dladm_status_t status;
+ size_t bufsz;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: usage(%s)\n", node->vc_resource);
+
+ assert(MUTEX_HELD(&cache_lock));
+ if ((status = dladm_datalink_id2info(node->vc_linkid, NULL, NULL, NULL,
+ name, sizeof (name))) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: usage(%s) get link name failure(%s)\n"),
+ node->vc_resource, dladm_status2str(status, errmsg));
+ return (NULL);
+ }
+
+ if (node->vc_state & CACHE_NODE_OFFLINED)
+ fmt = _("%1$s offlined");
+ else
+ fmt = _("%1$s VNICs: ");
+
+ /* TRANSLATION_NOTE: separator used between VNIC linkids */
+ sep = _(", ");
+
+ nvnic = 0;
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next)
+ nvnic++;
+
+ /* space for VNICs and separators, plus message */
+ bufsz = nvnic * (MAXLINKNAMELEN + strlen(sep)) +
+ strlen(fmt) + MAXLINKNAMELEN + 1;
+ if ((buf = malloc(bufsz)) == NULL) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: usage(%s) malloc failure(%s)\n"),
+ node->vc_resource, strerror(errno));
+ return (NULL);
+ }
+ (void) snprintf(buf, bufsz, fmt, name);
+
+ if (node->vc_state & CACHE_NODE_OFFLINED) {
+ /* Nothing else to do */
+ rcm_log_message(RCM_TRACE2, "VNIC: usage (%s) info = %s\n",
+ node->vc_resource, buf);
+ return (buf);
+ }
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ rcm_log_message(RCM_DEBUG, "VNIC:= %u\n", vnic->dlv_vnic_id);
+
+ if ((status = dladm_datalink_id2info(vnic->dlv_vnic_id, NULL,
+ NULL, NULL, name, sizeof (name))) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: usage(%s) get vnic %u name failure(%s)\n"),
+ node->vc_resource, vnic->dlv_vnic_id,
+ dladm_status2str(status, errmsg));
+ free(buf);
+ return (NULL);
+ }
+
+ (void) strlcat(buf, name, bufsz);
+ if (vnic->dlv_next != NULL)
+ (void) strlcat(buf, sep, bufsz);
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: usage (%s) info = %s\n",
+ node->vc_resource, buf);
+
+ return (buf);
+}
+
+/*
+ * Cache management routines. All cache management functions must be
+ * called with cache_lock held.
+ */
+
+/*
+ * cache_lookup() - Get a cache node for a resource.
+ * Call with cache lock held.
+ *
+ * This ensures that the cache is consistent with the system state and
+ * returns a pointer to the cache element corresponding to the resource. 
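+ *
+ * Note that CACHE_REFRESH temporarily drops cache_lock so that
+ * cache_update() can acquire it again; callers must not assume the
+ * cache is unchanged across this call.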
+ */
+static link_cache_t *
+cache_lookup(rcm_handle_t *hd, char *rsrc, char options)
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: cache lookup(%s)\n", rsrc);
+
+ assert(MUTEX_HELD(&cache_lock));
+ if (options & CACHE_REFRESH) {
+ /* drop lock since update locks cache again */
+ (void) mutex_unlock(&cache_lock);
+ (void) cache_update(hd);
+ (void) mutex_lock(&cache_lock);
+ }
+
+ node = cache_head.vc_next;
+ for (; node != &cache_tail; node = node->vc_next) {
+ if (strcmp(rsrc, node->vc_resource) == 0) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: cache lookup succeeded(%s)\n", rsrc);
+ return (node);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * node_free - Free a node from the cache
+ */
+static void
+node_free(link_cache_t *node)
+{
+ dl_vnic_t *vnic, *next;
+
+ if (node != NULL) {
+ free(node->vc_resource);
+
+ /* free the VNIC list */
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = next) {
+ next = vnic->dlv_next;
+ free(vnic);
+ }
+ free(node);
+ }
+}
+
+/*
+ * cache_insert - Insert a resource node in cache
+ */
+static void
+cache_insert(link_cache_t *node)
+{
+ assert(MUTEX_HELD(&cache_lock));
+
+ /* insert at the head for best performance */
+ node->vc_next = cache_head.vc_next;
+ node->vc_prev = &cache_head;
+
+ node->vc_next->vc_prev = node;
+ node->vc_prev->vc_next = node;
+}
+
+/*
+ * cache_remove() - Remove a resource node from cache.
+ */
+static void
+cache_remove(link_cache_t *node)
+{
+ assert(MUTEX_HELD(&cache_lock));
+ node->vc_next->vc_prev = node->vc_prev;
+ node->vc_prev->vc_next = node->vc_next;
+ node->vc_next = NULL;
+ node->vc_prev = NULL;
+}
+
+typedef struct vnic_update_arg_s {
+ rcm_handle_t *hd;
+ int retval;
+} vnic_update_arg_t;
+
+/*
+ * vnic_update() - Update the cache entry for the given VNIC
+ */
+static int
+vnic_update(datalink_id_t vnicid, void *arg)
+{
+ vnic_update_arg_t *vnic_update_argp = arg;
+ rcm_handle_t *hd = vnic_update_argp->hd;
+ link_cache_t *node;
+ dl_vnic_t *vnic;
+ char *rsrc;
+ dladm_vnic_attr_t vnic_attr;
+ dladm_status_t status;
+ char errmsg[DLADM_STRSIZE];
+ boolean_t newnode = B_FALSE;
+ int ret = -1;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_update(%u)\n", vnicid);
+
+ assert(MUTEX_HELD(&cache_lock));
+ status = dladm_vnic_info(vnicid, &vnic_attr, DLADM_OPT_ACTIVE);
+ if (status != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: vnic_update() cannot get vnic information for "
+ "%u(%s)\n", vnicid, dladm_status2str(status, errmsg));
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ if (vnic_attr.va_link_id == DATALINK_INVALID_LINKID) {
+ /*
+ * Skip the etherstubs. 
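+ * An etherstub is a VNIC-class datalink with no underlying link
+ * (its va_link_id is DATALINK_INVALID_LINKID), so there is no
+ * physical resource whose removal would need to be tracked.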
+ */
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: vnic_update(): skip the etherstub %u\n", vnicid);
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ rsrc = malloc(RCM_LINK_RESOURCE_MAX);
+ if (rsrc == NULL) {
+ rcm_log_message(RCM_ERROR, _("VNIC: malloc error(%s): %u\n"),
+ strerror(errno), vnicid);
+ goto done;
+ }
+
+ (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u",
+ RCM_LINK_PREFIX, vnic_attr.va_link_id);
+
+ node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH);
+ if (node != NULL) {
+ rcm_log_message(RCM_DEBUG,
+ "VNIC: %s already registered (vnicid:%d)\n",
+ rsrc, vnic_attr.va_vnic_id);
+ free(rsrc);
+ } else {
+ rcm_log_message(RCM_DEBUG,
+ "VNIC: %s is a new resource (vnicid:%d)\n",
+ rsrc, vnic_attr.va_vnic_id);
+ if ((node = calloc(1, sizeof (link_cache_t))) == NULL) {
+ free(rsrc);
+ rcm_log_message(RCM_ERROR, _("VNIC: calloc: %s\n"),
+ strerror(errno));
+ goto done;
+ }
+
+ node->vc_resource = rsrc;
+ node->vc_vnic = NULL;
+ node->vc_linkid = vnic_attr.va_link_id;
+ node->vc_state |= CACHE_NODE_NEW;
+ newnode = B_TRUE;
+ }
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) {
+ if (vnic->dlv_vnic_id == vnicid) {
+ vnic->dlv_flags &= ~VNIC_STALE;
+ break;
+ }
+ }
+
+ if (vnic == NULL) {
+ if ((vnic = calloc(1, sizeof (dl_vnic_t))) == NULL) {
+ rcm_log_message(RCM_ERROR, _("VNIC: calloc: %s\n"),
+ strerror(errno));
+ if (newnode) {
+ free(rsrc);
+ free(node);
+ }
+ goto done;
+ }
+ vnic->dlv_vnic_id = vnicid;
+ vnic->dlv_next = node->vc_vnic;
+ vnic->dlv_prev = NULL;
+ if (node->vc_vnic != NULL)
+ node->vc_vnic->dlv_prev = vnic;
+ node->vc_vnic = vnic;
+ }
+
+ node->vc_state &= ~CACHE_NODE_STALE;
+
+ if (newnode)
+ cache_insert(node);
+
+ rcm_log_message(RCM_TRACE3, "VNIC: vnic_update: succeeded(%u)\n",
+ vnicid);
+ ret = 0;
+done:
+ vnic_update_argp->retval = ret;
+ return (ret == 0 ? DLADM_WALK_CONTINUE : DLADM_WALK_TERMINATE);
+}
+
+/*
+ * vnic_update_all() - Determine all VNIC links in the system
+ */
+static int
+vnic_update_all(rcm_handle_t *hd)
+{
+ vnic_update_arg_t arg = {NULL, 0};
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_update_all\n");
+
+ assert(MUTEX_HELD(&cache_lock));
+ arg.hd = hd;
+ (void) dladm_walk_datalink_id(vnic_update, &arg, DATALINK_CLASS_VNIC,
+ DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+ return (arg.retval);
+}
+
+/*
+ * cache_update() - Update cache with latest interface info
+ */
+static int
+cache_update(rcm_handle_t *hd)
+{
+ link_cache_t *node, *nnode;
+ dl_vnic_t *vnic;
+ int rv;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: cache_update\n");
+
+ (void) mutex_lock(&cache_lock);
+
+ /* first we walk the entire cache, marking each entry stale */
+ node = cache_head.vc_next;
+ for (; node != &cache_tail; node = node->vc_next) {
+ node->vc_state |= CACHE_NODE_STALE;
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next)
+ vnic->dlv_flags |= VNIC_STALE;
+ }
+
+ rv = vnic_update_all(hd);
+
+ /*
+ * Continue to delete all stale nodes from the cache even if
+ * vnic_update_all() failed. 
Unregister links that are not offlined
+ * and are still in the cache.
+ */
+ for (node = cache_head.vc_next; node != &cache_tail; node = nnode) {
+ dl_vnic_t *vnic, *next;
+
+ for (vnic = node->vc_vnic; vnic != NULL; vnic = next) {
+ next = vnic->dlv_next;
+
+ /* clear stale VNICs */
+ if (vnic->dlv_flags & VNIC_STALE) {
+ if (vnic->dlv_prev != NULL)
+ vnic->dlv_prev->dlv_next = next;
+ else
+ node->vc_vnic = next;
+
+ if (next != NULL)
+ next->dlv_prev = vnic->dlv_prev;
+ free(vnic);
+ }
+ }
+
+ nnode = node->vc_next;
+ if (node->vc_state & CACHE_NODE_STALE) {
+ (void) rcm_unregister_interest(hd, node->vc_resource,
+ 0);
+ rcm_log_message(RCM_DEBUG, "VNIC: unregistered %s\n",
+ node->vc_resource);
+ assert(node->vc_vnic == NULL);
+ cache_remove(node);
+ node_free(node);
+ continue;
+ }
+
+ if (!(node->vc_state & CACHE_NODE_NEW))
+ continue;
+
+ if (rcm_register_interest(hd, node->vc_resource, 0, NULL) !=
+ RCM_SUCCESS) {
+ rcm_log_message(RCM_ERROR,
+ _("VNIC: failed to register %s\n"),
+ node->vc_resource);
+ rv = -1;
+ } else {
+ rcm_log_message(RCM_DEBUG, "VNIC: registered %s\n",
+ node->vc_resource);
+ node->vc_state &= ~CACHE_NODE_NEW;
+ }
+ }
+
+ (void) mutex_unlock(&cache_lock);
+ return (rv);
+}
+
+/*
+ * cache_free() - Empty the cache
+ */
+static void
+cache_free()
+{
+ link_cache_t *node;
+
+ rcm_log_message(RCM_TRACE2, "VNIC: cache_free\n");
+
+ (void) mutex_lock(&cache_lock);
+ node = cache_head.vc_next;
+ while (node != &cache_tail) {
+ cache_remove(node);
+ node_free(node);
+ node = cache_head.vc_next;
+ }
+ (void) mutex_unlock(&cache_lock);
+}
+
+/*
+ * vnic_log_err() - RCM error log wrapper
+ */
+static void
+vnic_log_err(datalink_id_t linkid, char **errorp, char *errmsg)
+{
+ char link[MAXLINKNAMELEN];
+ char errstr[DLADM_STRSIZE];
+ dladm_status_t status;
+ int len;
+ const char *errfmt;
+ char *error;
+
+ link[0] = '\0';
+ if (linkid != DATALINK_INVALID_LINKID) {
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+
+ (void) snprintf(rsrc, sizeof (rsrc), "%s/%u",
+ RCM_LINK_PREFIX, linkid);
+
+ rcm_log_message(RCM_ERROR, _("VNIC: %s(%s)\n"), errmsg, rsrc);
+ if ((status = dladm_datalink_id2info(linkid, NULL, NULL,
+ NULL, link, sizeof (link))) != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: cannot get link name for (%s) %s\n"),
+ rsrc, dladm_status2str(status, errstr));
+ }
+ } else {
+ rcm_log_message(RCM_ERROR, _("VNIC: %s\n"), errmsg);
+ }
+
+ errfmt = strlen(link) > 0 ? _("VNIC: %s(%s)") : _("VNIC: %s");
+ len = strlen(errfmt) + strlen(errmsg) + MAXLINKNAMELEN + 1;
+ if ((error = malloc(len)) != NULL) {
+ if (strlen(link) > 0)
+ (void) snprintf(error, len, errfmt, errmsg, link);
+ else
+ (void) snprintf(error, len, errfmt, errmsg);
+ }
+
+ if (errorp != NULL)
+ *errorp = error;
+}
+
+/*
+ * vnic_consumer_online()
+ *
+ * Notify VNIC consumers that their VNIC links are back online. 
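+ *
+ * Only VNICs marked VNIC_CONSUMER_OFFLINED are notified, and the flag
+ * is cleared for each consumer that acknowledges the online.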
+ */ +/* ARGSUSED */ +static void +vnic_consumer_online(rcm_handle_t *hd, link_cache_t *node, char **errorp, + uint_t flags, rcm_info_t **info) +{ + dl_vnic_t *vnic; + char rsrc[RCM_LINK_RESOURCE_MAX]; + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_online (%s)\n", + node->vc_resource); + + for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) { + if (!(vnic->dlv_flags & VNIC_CONSUMER_OFFLINED)) + continue; + + (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u", + RCM_LINK_PREFIX, vnic->dlv_vnic_id); + + if (rcm_notify_online(hd, rsrc, flags, info) == RCM_SUCCESS) + vnic->dlv_flags &= ~VNIC_CONSUMER_OFFLINED; + } + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_online done\n"); +} + +/* + * vnic_consumer_offline() + * + * Offline VNIC consumers. + */ +static int +vnic_consumer_offline(rcm_handle_t *hd, link_cache_t *node, char **errorp, + uint_t flags, rcm_info_t **info) +{ + dl_vnic_t *vnic; + char rsrc[RCM_LINK_RESOURCE_MAX]; + int ret = RCM_SUCCESS; + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_offline (%s)\n", + node->vc_resource); + + for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) { + (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u", + RCM_LINK_PREFIX, vnic->dlv_vnic_id); + + ret = rcm_request_offline(hd, rsrc, flags, info); + if (ret != RCM_SUCCESS) + break; + + vnic->dlv_flags |= VNIC_CONSUMER_OFFLINED; + } + + if (vnic != NULL) + vnic_consumer_online(hd, node, errorp, flags, info); + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_offline done\n"); + return (ret); +} + +/* + * Send RCM_RESOURCE_LINK_NEW events to other modules about new VNICs. + * Return 0 on success, -1 on failure. + */ +static int +vnic_notify_new_vnic(rcm_handle_t *hd, char *rsrc) +{ + link_cache_t *node; + dl_vnic_t *vnic; + nvlist_t *nvl = NULL; + uint64_t id; + int ret = -1; + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_notify_new_vnic (%s)\n", rsrc); + + (void) mutex_lock(&cache_lock); + if ((node = cache_lookup(hd, rsrc, CACHE_REFRESH)) == NULL) { + (void) mutex_unlock(&cache_lock); + return (0); + } + + if (nvlist_alloc(&nvl, 0, 0) != 0) { + (void) mutex_unlock(&cache_lock); + rcm_log_message(RCM_WARNING, + _("VNIC: failed to allocate nvlist\n")); + goto done; + } + + for (vnic = node->vc_vnic; vnic != NULL; vnic = vnic->dlv_next) { + rcm_log_message(RCM_TRACE2, + "VNIC: vnic_notify_new_vnic add (%u)\n", vnic->dlv_vnic_id); + + id = vnic->dlv_vnic_id; + if (nvlist_add_uint64(nvl, RCM_NV_LINKID, id) != 0) { + rcm_log_message(RCM_ERROR, + _("VNIC: failed to construct nvlist\n")); + (void) mutex_unlock(&cache_lock); + goto done; + } + } + (void) mutex_unlock(&cache_lock); + + if (rcm_notify_event(hd, RCM_RESOURCE_LINK_NEW, 0, nvl, NULL) != + RCM_SUCCESS) { + rcm_log_message(RCM_ERROR, + _("VNIC: failed to notify %s event for %s\n"), + RCM_RESOURCE_LINK_NEW, node->vc_resource); + goto done; + } + + ret = 0; +done: + if (nvl != NULL) + nvlist_free(nvl); + return (ret); +} + +/* + * vnic_consumer_notify() - Notify consumers of VNICs coming back online. + */ +static int +vnic_consumer_notify(rcm_handle_t *hd, datalink_id_t linkid, char **errorp, + uint_t flags, rcm_info_t **info) +{ + char rsrc[RCM_LINK_RESOURCE_MAX]; + link_cache_t *node; + + /* Check for the interface in the cache */ + (void) snprintf(rsrc, RCM_LINK_RESOURCE_MAX, "%s/%u", RCM_LINK_PREFIX, + linkid); + + rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_notify(%s)\n", rsrc); + + /* + * Inform IP consumers of the new link. 
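+ * If the notification fails, the VNICs on this link are marked stale
+ * and torn down via vnic_offline_vnic(VNIC_STALE, CACHE_NODE_STALE) so
+ * that a later attach can retry from a clean state.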
+ */
+ if (vnic_notify_new_vnic(hd, rsrc) != 0) {
+ (void) mutex_lock(&cache_lock);
+ if ((node = cache_lookup(hd, rsrc, CACHE_NO_REFRESH)) != NULL) {
+ (void) vnic_offline_vnic(node, VNIC_STALE,
+ CACHE_NODE_STALE);
+ }
+ (void) mutex_unlock(&cache_lock);
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: vnic_notify_new_vnic failed(%s)\n", rsrc);
+ return (-1);
+ }
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_consumer_notify succeeded\n");
+ return (0);
+}
+
+typedef struct vnic_up_arg_s {
+ datalink_id_t linkid;
+ int retval;
+} vnic_up_arg_t;
+
+static int
+vnic_up(datalink_id_t vnicid, void *arg)
+{
+ vnic_up_arg_t *vnic_up_argp = arg;
+ dladm_status_t status;
+ dladm_vnic_attr_t vnic_attr;
+ char errmsg[DLADM_STRSIZE];
+
+ status = dladm_vnic_info(vnicid, &vnic_attr, DLADM_OPT_PERSIST);
+ if (status != DLADM_STATUS_OK) {
+ rcm_log_message(RCM_TRACE1,
+ "VNIC: vnic_up(): cannot get information for VNIC %u "
+ "(%s)\n", vnicid, dladm_status2str(status, errmsg));
+ return (DLADM_WALK_CONTINUE);
+ }
+
+ if (vnic_attr.va_link_id != vnic_up_argp->linkid)
+ return (DLADM_WALK_CONTINUE);
+
+ rcm_log_message(RCM_TRACE3, "VNIC: vnic_up(%u)\n", vnicid);
+ if ((status = dladm_vnic_up(vnicid, 0)) == DLADM_STATUS_OK)
+ return (DLADM_WALK_CONTINUE);
+
+ /*
+ * Print a warning message and continue to bring up the other VNICs.
+ */
+ rcm_log_message(RCM_WARNING,
+ _("VNIC: VNIC up failed (%u): %s\n"),
+ vnicid, dladm_status2str(status, errmsg));
+
+ vnic_up_argp->retval = -1;
+ return (DLADM_WALK_CONTINUE);
+}
+
+/*
+ * vnic_configure() - Configure VNICs over a physical link after it attaches
+ */
+static int
+vnic_configure(rcm_handle_t *hd, datalink_id_t linkid)
+{
+ char rsrc[RCM_LINK_RESOURCE_MAX];
+ link_cache_t *node;
+ vnic_up_arg_t arg = {DATALINK_INVALID_LINKID, 0};
+
+ /* Check for the VNICs in the cache */
+ (void) snprintf(rsrc, sizeof (rsrc), "%s/%u", RCM_LINK_PREFIX, linkid);
+
+ rcm_log_message(RCM_TRACE2, "VNIC: vnic_configure(%s)\n", rsrc);
+
+ /* Check if the link is new or was previously offlined */
+ (void) mutex_lock(&cache_lock);
+ if (((node = cache_lookup(hd, rsrc, CACHE_REFRESH)) != NULL) &&
+ (!(node->vc_state & CACHE_NODE_OFFLINED))) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: Skipping configured interface(%s)\n", rsrc);
+ (void) mutex_unlock(&cache_lock);
+ return (0);
+ }
+ (void) mutex_unlock(&cache_lock);
+
+ arg.linkid = linkid;
+ (void) dladm_walk_datalink_id(vnic_up, &arg, DATALINK_CLASS_VNIC,
+ DATALINK_ANY_MEDIATYPE, DLADM_OPT_PERSIST);
+
+ if (arg.retval == 0) {
+ rcm_log_message(RCM_TRACE2,
+ "VNIC: vnic_configure succeeded(%s)\n", rsrc);
+ }
+ return (arg.retval);
+}
diff --git a/usr/src/cmd/svc/milestone/net-physical b/usr/src/cmd/svc/milestone/net-physical
index bcee0c9818..8530806768 100644
--- a/usr/src/cmd/svc/milestone/net-physical
+++ b/usr/src/cmd/svc/milestone/net-physical
@@ -26,8 +26,6 @@
 # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T.
 # All rights reserved.
 #
-#
-# ident "%Z%%M% %I% %E% SMI"
 . /lib/svc/share/smf_include.sh
 . /lib/svc/share/net_include.sh
@@ -81,6 +79,14 @@ if smf_is_globalzone; then
 /sbin/dladm up-aggr
 /sbin/dladm up-vlan
 /sbin/dladm init-secobj
+ #
+ # Bring up VNICs
+ #
+ /sbin/dladm up-vnic
+ #
+ # Create flows via flowadm. 
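+ #
+ # init-flow re-creates flows recorded in the persistent flow
+ # configuration. For example (illustrative link and flow names),
+ # a flow created earlier with
+ #     flowadm add-flow -l net0 -a transport=tcp,local_port=80 \
+ #         -p maxbw=100M http-flow
+ # is re-instantiated here at boot.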
+ # + /sbin/flowadm init-flow fi # diff --git a/usr/src/cmd/svc/profile/generic_limited_net.xml b/usr/src/cmd/svc/profile/generic_limited_net.xml index 449d06bf1e..5fed0e86bf 100644 --- a/usr/src/cmd/svc/profile/generic_limited_net.xml +++ b/usr/src/cmd/svc/profile/generic_limited_net.xml @@ -62,6 +62,7 @@ <instance name='flow' enabled='false'/> <instance name='process' enabled='false'/> <instance name='task' enabled='false'/> + <instance name='net' enabled='false'/> </service> <service name='system/hal' version='1' type='service'> <instance name='default' enabled='true'/> diff --git a/usr/src/cmd/svc/profile/generic_open.xml b/usr/src/cmd/svc/profile/generic_open.xml index 7d837f4b53..34b600cca1 100644 --- a/usr/src/cmd/svc/profile/generic_open.xml +++ b/usr/src/cmd/svc/profile/generic_open.xml @@ -59,6 +59,7 @@ <instance name='flow' enabled='false'/> <instance name='process' enabled='false'/> <instance name='task' enabled='false'/> + <instance name='net' enabled='false'/> </service> <service name='system/hal' version='1' type='service'> <instance name='default' enabled='true'/> diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 3869b370c1..46b2b5a958 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -84,6 +84,7 @@ #include <sys/ptms.h> #include <sys/aggr.h> #include <sys/dld.h> +#include <sys/vnic.h> #include <sys/fs/zfs.h> #include <inet/kssl/kssl.h> #include <sys/dkio.h> @@ -844,18 +845,38 @@ const struct ioc { { (uint_t)DLDIOC_ATTR, "DLDIOC_ATTR", "dld_ioc_attr"}, { (uint_t)DLDIOC_PHYS_ATTR, "DLDIOC_PHYS_ATTR", "dld_ioc_phys_attr"}, - { (uint_t)DLDIOC_VLAN_ATTR, "DLDIOC_VLAN_ATTR", - "dld_ioc_vlan_attr"}, - { (uint_t)DLDIOC_CREATE_VLAN, "DLDIOC_CREATE_VLAN", - "dld_ioc_create_vlan"}, - { (uint_t)DLDIOC_DELETE_VLAN, "DLDIOC_DELETE_VLAN", - "dld_ioc_delete_vlan"}, - { (uint_t)DLDIOC_DOORSERVER, "DLDIOC_DOORSERVER", "dld_ioc_door"}, - { (uint_t)DLDIOC_RENAME, "DLDIOC_RENAME", "dld_ioc_rename"}, - { (uint_t)DLDIOC_SETMACPROP, "DLDIOC_SETMACPROP", + { (uint_t)DLDIOC_DOORSERVER, "DLDIOC_DOORSERVER", "dld_ioc_door"}, + { (uint_t)DLDIOC_RENAME, "DLDIOC_RENAME", "dld_ioc_rename"}, + { (uint_t)DLDIOC_SECOBJ_GET, "DLDIOC_SECOBJ_GET", + "dld_ioc_secobj_get"}, + { (uint_t)DLDIOC_SECOBJ_SET, "DLDIOC_SECOBJ_SET", + "dld_ioc_secobj_set"}, + { (uint_t)DLDIOC_SECOBJ_UNSET, "DLDIOC_SECOBJ_UNSET", + "dld_ioc_secobj_unset"}, + { (uint_t)DLDIOC_MACADDRGET, "DLDIOC_MACADDRGET", + "dld_ioc_macaddrget"}, + { (uint_t)DLDIOC_SETMACPROP, "DLDIOC_SETMACPROP", "dld_ioc_macprop_s"}, - { (uint_t)DLDIOC_GETMACPROP, "DLDIOC_GETMACPROP", + { (uint_t)DLDIOC_GETMACPROP, "DLDIOC_GETMACPROP", "dld_ioc_macprop_s"}, + { (uint_t)DLDIOC_ADDFLOW, "DLDIOC_ADDFLOW", + "dld_ioc_addflow"}, + { (uint_t)DLDIOC_REMOVEFLOW, "DLDIOC_REMOVEFLOW", + "dld_ioc_removeflow"}, + { (uint_t)DLDIOC_MODIFYFLOW, "DLDIOC_MODIFYFLOW", + "dld_ioc_modifyflow"}, + { (uint_t)DLDIOC_WALKFLOW, "DLDIOC_WALKFLOW", + "dld_ioc_walkflow"}, + { (uint_t)DLDIOC_USAGELOG, "DLDIOC_USAGELOG", + "dld_ioc_usagelog"}, + + /* vnic ioctls */ + { (uint_t)VNIC_IOC_CREATE, "VNIC_IOC_CREATE", + "vnic_ioc_create"}, + { (uint_t)VNIC_IOC_DELETE, "VNIC_IOC_DELETE", + "vnic_ioc_delete"}, + { (uint_t)VNIC_IOC_INFO, "VNIC_IOC_INFO", + "vnic_ioc_info"}, /* ZFS ioctls */ { (uint_t)ZFS_IOC_POOL_CREATE, "ZFS_IOC_POOL_CREATE", diff --git a/usr/src/cmd/vna/Makefile b/usr/src/cmd/vna/Makefile index 4e5e25e85b..6b608e0126 100644 --- a/usr/src/cmd/vna/Makefile +++ b/usr/src/cmd/vna/Makefile @@ -22,15 +22,16 @@ # Copyright 2008 Sun Microsystems, 
Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident	"%Z%%M%	%I%	%E% SMI"
#

PROG =		vna

include ../Makefile.cmd

+LDLIBS += -L$(ROOT)/lib
LDLIBS += -ldladm -lsocket -ldlpi
+
.KEEP_STATE:

all: $(PROG)
diff --git a/usr/src/cmd/vna/vna.c b/usr/src/cmd/vna/vna.c
index 6262de5959..6a05cf1777 100644
--- a/usr/src/cmd/vna/vna.c
+++ b/usr/src/cmd/vna/vna.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
 */

-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
/*
 * This utility constitutes a private interface - it will be removed
 * in a future release of Solaris. Neither users nor other software
@@ -40,7 +38,7 @@
#include <libdlpi.h>

typedef struct vnic_attr {
-	dladm_vnic_attr_sys_t	attr;
+	dladm_vnic_attr_t	attr;
	char	*name;
} vnic_attr_t;

@@ -48,7 +46,7 @@ typedef struct vnic_attr {
static int
v_print(datalink_id_t vnic_id, void *arg)
{
-	dladm_vnic_attr_sys_t	attr;
+	dladm_vnic_attr_t	attr;
	char vnic[MAXLINKNAMELEN];
	char link[MAXLINKNAMELEN];

@@ -87,8 +85,8 @@ static int
v_find(datalink_id_t vnic_id, void *arg)
{
	vnic_attr_t	*vattr = arg;
-	dladm_vnic_attr_sys_t	*specp = &vattr->attr;
-	dladm_vnic_attr_sys_t	attr;
+	dladm_vnic_attr_t	*specp = &vattr->attr;
+	dladm_vnic_attr_t	attr;
	char	linkname[MAXLINKNAMELEN];

	if (dladm_vnic_info(vnic_id, &attr, DLADM_OPT_ACTIVE) !=
@@ -221,7 +219,8 @@ v_add(char *link, char *addr, char *name)
	 */
	status = dladm_vnic_create(name, linkid,
	    VNIC_MAC_ADDR_TYPE_FIXED, (uchar_t *)ea->ether_addr_octet,
-	    ETHERADDRL, &vnic_id, DLADM_OPT_ACTIVE);
+	    ETHERADDRL, NULL, 0, 0, &vnic_id, NULL, DLADM_OPT_ACTIVE);
+
	if (status != DLADM_STATUS_OK) {
		(void) fprintf(stderr, "dladm_vnic_create: %s\n",
		    dladm_status2str(status, buf));
diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile
index b7a9d26795..8b3fb0aaf9 100644
--- a/usr/src/lib/Makefile
+++ b/usr/src/lib/Makefile
@@ -546,7 +546,7 @@ libdevinfo:	libnvpair libsec
libdhcpagent:	libsocket libdhcputil libuuid libdlpi
libdhcpsvc:	libinetutil libdhcputil
libdhcputil:	libnsl libgen libinetutil libdlpi
-libdladm:	libdevinfo libinetutil libsocket
+libdladm:	libdevinfo libinetutil libsocket libnsl libexacct libscf
libdll:		libast
libdlpi:	libinetutil libdladm
libdscfg:	libnsctl libunistat libsocket libnsl
diff --git a/usr/src/lib/libdladm/Makefile b/usr/src/lib/libdladm/Makefile
index 630a7e2e19..ebe6c51eee 100644
--- a/usr/src/lib/libdladm/Makefile
+++ b/usr/src/lib/libdladm/Makefile
@@ -22,14 +22,14 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident	"%Z%%M%	%I%	%E% SMI"
#

include $(SRC)/lib/Makefile.lib

HDRS =		libdladm.h libdladm_impl.h libdllink.h libdlaggr.h \
		libdlwlan.h libdlwlan_impl.h libdlvnic.h libdlvlan.h \
-		libdlmgmt.h
+		libdlmgmt.h libdlflow.h libdlflow_impl.h libdlstat.h
+
HDRDIR =	common
SUBDIRS =	$(MACH)
@@ -39,7 +39,11 @@ POFILE =	libdladm.po
MSGFILES =	common/libdladm.c common/linkprop.c common/secobj.c	\
		common/libdllink.c common/libdlaggr.c	\
		common/libdlwlan.c common/libdlvnic.c	\
-		common/libdlvlan.c common/libdlmgmt.c
+		common/libdlvlan.c common/libdlmgmt.c	\
+		common/flowattr.c common/flowprop.c	\
+		common/propfuncs.c common/libdlflow.c	\
+		common/libdlstat.c
+
XGETFLAGS =	-a -x libdladm.xcl

all :=		TARGET = all
diff --git a/usr/src/lib/libdladm/Makefile.com b/usr/src/lib/libdladm/Makefile.com
index 0f6419bd29..50aa57e710 100644
--- a/usr/src/lib/libdladm/Makefile.com
+++ b/usr/src/lib/libdladm/Makefile.com
@@ -22,13 +22,13 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
# -# ident "%Z%%M% %I% %E% SMI" -# LIBRARY = libdladm.a VERS = .1 OBJECTS = libdladm.o secobj.o linkprop.o libdllink.o libdlaggr.o \ - libdlwlan.o libdlvnic.o libdlmgmt.o libdlvlan.o + libdlwlan.o libdlvnic.o libdlmgmt.o libdlvlan.o \ + flowattr.o flowprop.o propfuncs.o libdlflow.o libdlstat.o \ + usage.o include ../../Makefile.lib @@ -36,8 +36,8 @@ include ../../Makefile.lib include ../../Makefile.rootfs LIBS = $(DYNLIB) $(LINTLIB) -LDLIBS += -ldevinfo -lc -linetutil -lsocket -lscf -lrcm \ - -lnvpair -lkstat +LDLIBS += -ldevinfo -lc -linetutil -lsocket -lscf -lrcm -lnvpair \ + -lexacct -lnsl -lkstat -lcurses SRCDIR = ../common $(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) diff --git a/usr/src/lib/libdladm/common/flowattr.c b/usr/src/lib/libdladm/common/flowattr.c new file mode 100644 index 0000000000..4fb578e5bc --- /dev/null +++ b/usr/src/lib/libdladm/common/flowattr.c @@ -0,0 +1,411 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <errno.h> +#include <stdlib.h> +#include <strings.h> +#include <sys/mac_flow.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <netdb.h> +#include <net/if_types.h> +#include <net/if_dl.h> +#include <inet/ip.h> +#include <inet/ip6.h> + +#include <libdladm.h> +#include <libdlflow.h> +#include <libdlflow_impl.h> + +#define V4_PART_OF_V6(v6) ((v6)._S6_un._S6_u32[3]) + +/* max port number for UDP, TCP & SCTP */ +#define MAX_PORT 65535 + +static fad_checkf_t do_check_local_ip; +static fad_checkf_t do_check_remote_ip; +static fad_checkf_t do_check_protocol; +static fad_checkf_t do_check_local_port; + +static dladm_status_t do_check_port(char *, boolean_t, flow_desc_t *); + +static fattr_desc_t attr_table[] = { + { "local_ip", do_check_local_ip }, + { "remote_ip", do_check_remote_ip }, + { "transport", do_check_protocol }, + { "local_port", do_check_local_port }, + { "dsfield", do_check_dsfield }, +}; + +#define DLADM_MAX_FLOWATTRS (sizeof (attr_table) / sizeof (fattr_desc_t)) + +static dladm_status_t +do_check_local_ip(char *attr_val, flow_desc_t *fdesc) +{ + return (do_check_ip_addr(attr_val, B_TRUE, fdesc)); +} + +static dladm_status_t +do_check_remote_ip(char *attr_val, flow_desc_t *fdesc) +{ + return (do_check_ip_addr(attr_val, B_FALSE, fdesc)); +} + +dladm_status_t +do_check_ip_addr(char *addr_str, boolean_t local, flow_desc_t *fd) +{ + struct addrinfo *info = NULL; + dladm_status_t status; + int err, prefix_max, prefix_len = 0; + char *prefix_str, *endp = NULL; + flow_mask_t mask; + in6_addr_t *addr; + uchar_t *netmask; + + if ((prefix_str = strchr(addr_str, '/')) != NULL) { + *prefix_str++ = '\0'; + errno = 0; + prefix_len = (int)strtol(prefix_str, &endp, 10); + if (errno != 0 || prefix_len == 0 || *endp != '\0') + return (DLADM_STATUS_INVALID_PREFIXLEN); + } + + err = getaddrinfo(addr_str, NULL, NULL, &info); + if (err != 0) + return (DLADM_STATUS_INVALID_IP); + + mask = FLOW_IP_VERSION; + if (local) { + mask |= FLOW_IP_LOCAL; + addr = &fd->fd_local_addr; + netmask = (uchar_t *)&fd->fd_local_netmask; + } else { + mask |= FLOW_IP_REMOTE; + addr = &fd->fd_remote_addr; + netmask = (uchar_t *)&fd->fd_remote_netmask; + } + + if (info->ai_family == AF_INET) { + IN6_INADDR_TO_V4MAPPED(&(((struct sockaddr_in *) + (void *)info->ai_addr)->sin_addr), addr); + prefix_max = IP_ABITS; + fd->fd_ipversion = IPV4_VERSION; + netmask = (uchar_t *) + &(V4_PART_OF_V6((*((in6_addr_t *)(void *)netmask)))); + } else if (info->ai_family == AF_INET6) { + *addr = ((struct sockaddr_in6 *) + (void *)info->ai_addr)->sin6_addr; + prefix_max = IPV6_ABITS; + fd->fd_ipversion = IPV6_VERSION; + } else { + freeaddrinfo(info); + return (DLADM_STATUS_INVALID_IP); + } + + if (prefix_len == 0) + prefix_len = prefix_max; + + status = dladm_prefixlen2mask(prefix_len, prefix_max, netmask); + + if (status != DLADM_STATUS_OK) { + freeaddrinfo(info); + return (DLADM_STATUS_INVALID_PREFIXLEN); + } + + fd->fd_mask |= mask; + freeaddrinfo(info); + return (DLADM_STATUS_OK); +} + +dladm_status_t +do_check_protocol(char *attr_val, flow_desc_t *fdesc) +{ + uint8_t protocol; + + protocol = dladm_str2proto(attr_val); + + if (protocol != 0) { + fdesc->fd_mask |= FLOW_IP_PROTOCOL; + fdesc->fd_protocol = protocol; + return (DLADM_STATUS_OK); + } else { + return (DLADM_STATUS_INVALID_PROTOCOL); + } +} + +dladm_status_t +do_check_local_port(char *attr_val, flow_desc_t *fdesc) +{ + return (do_check_port(attr_val, B_TRUE, fdesc)); +} + +dladm_status_t +do_check_port(char 
*attr_val, boolean_t local, flow_desc_t *fdesc)
+{
+	char	*endp = NULL;
+	long	val;
+
+	if (local) {
+		fdesc->fd_mask |= FLOW_ULP_PORT_LOCAL;
+		val = strtol(attr_val, &endp, 10);
+		if (val < 1 || val > MAX_PORT)
+			return (DLADM_STATUS_INVALID_PORT);
+		fdesc->fd_local_port = htons((uint16_t)val);
+	} else {
+		return (DLADM_STATUS_BADVAL);
+	}
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Check for invalid and/or duplicate attribute specification
+ */
+static dladm_status_t
+flow_attrlist_check(dladm_arg_list_t *attrlist)
+{
+	int		i, j;
+	boolean_t	isset[DLADM_MAX_FLOWATTRS];
+	boolean_t	matched;
+
+	for (j = 0; j < DLADM_MAX_FLOWATTRS; j++)
+		isset[j] = B_FALSE;
+
+	for (i = 0; i < attrlist->al_count; i++) {
+		matched = B_FALSE;
+		for (j = 0; j < DLADM_MAX_FLOWATTRS; j++) {
+			if (strcmp(attrlist->al_info[i].ai_name,
+			    attr_table[j].ad_name) == 0) {
+				if (isset[j])
+					return (DLADM_STATUS_FLOW_INCOMPATIBLE);
+				else
+					isset[j] = B_TRUE;
+				matched = B_TRUE;
+			}
+		}
+		/*
+		 * If the attribute did not match any of the attributes in
+		 * attr_table, it is an invalid attribute.
+		 */
+		if (!matched)
+			return (DLADM_STATUS_BADARG);
+	}
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert an attribute list to a flow_desc_t using the attribute ad_check()
+ * functions.
+ */
+dladm_status_t
+dladm_flow_attrlist_extract(dladm_arg_list_t *attrlist, flow_desc_t *flowdesc)
+{
+	dladm_status_t	status = DLADM_STATUS_BADARG;
+	int		i;
+
+	for (i = 0; i < attrlist->al_count; i++) {
+		dladm_arg_info_t	*aip = &attrlist->al_info[i];
+		int			j;
+
+		for (j = 0; j < DLADM_MAX_FLOWATTRS; j++) {
+			fattr_desc_t	*adp = &attr_table[j];
+
+			if (strcasecmp(aip->ai_name, adp->ad_name) != 0)
+				continue;
+
+			if ((aip->ai_val == NULL) || (*aip->ai_val == NULL))
+				return (DLADM_STATUS_BADARG);
+
+			if (adp->ad_check != NULL)
+				status = adp->ad_check(*aip->ai_val, flowdesc);
+			else
+				status = DLADM_STATUS_BADARG;
+
+			if (status != DLADM_STATUS_OK)
+				return (status);
+		}
+	}
+	return (status);
+}
+
+void
+dladm_free_attrs(dladm_arg_list_t *list)
+{
+	dladm_free_args(list);
+}
+
+dladm_status_t
+dladm_parse_flow_attrs(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+	if (dladm_parse_args(str, listp, novalues)
+	    != DLADM_STATUS_OK)
+		return (DLADM_STATUS_ATTR_PARSE_ERR);
+
+	if (flow_attrlist_check(*listp) != DLADM_STATUS_OK) {
+		dladm_free_attrs(*listp);
+		return (DLADM_STATUS_ATTR_PARSE_ERR);
+	}
+
+	return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+do_check_dsfield(char *str, flow_desc_t *fd)
+{
+	char	*mask_str, *endp = NULL;
+	uint_t	mask = 0xff, value;
+
+	if ((mask_str = strchr(str, ':')) != NULL) {
+		*mask_str++ = '\0';
+		errno = 0;
+		mask = strtoul(mask_str, &endp, 16);
+		if (errno != 0 || mask == 0 || mask > 0xff ||
+		    *endp != '\0')
+			return (DLADM_STATUS_INVALID_DSFMASK);
+	}
+	errno = 0;
+	endp = NULL;
+	value = strtoul(str, &endp, 16);
+	if (errno != 0 || value == 0 || value > 0xff || *endp != '\0')
+		return (DLADM_STATUS_INVALID_DSF);
+
+	fd->fd_dsfield = (uint8_t)value;
+	fd->fd_dsfield_mask = (uint8_t)mask;
+	fd->fd_mask |= FLOW_IP_DSFIELD;
+	return (DLADM_STATUS_OK);
+}
+
+char *
+dladm_proto2str(uint8_t protocol)
+{
+	if (protocol == IPPROTO_TCP)
+		return ("tcp");
+	if (protocol == IPPROTO_UDP)
+		return ("udp");
+	if (protocol == IPPROTO_SCTP)
+		return ("sctp");
+	if (protocol == IPPROTO_ICMPV6)
+		return ("icmpv6");
+	if (protocol == IPPROTO_ICMP)
+		return ("icmp");
+	return ("");
+}
+
+uint8_t
+dladm_str2proto(const char *protostr)
+{
+	if (strncasecmp(protostr, "tcp", 3) == 0)
+		return (IPPROTO_TCP);
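+	/*
+	 * The names below are matched with strncasecmp(), so the longer
+	 * "icmpv6" prefix must be tested before "icmp".
+	 */
+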
else if (strncasecmp(protostr, "udp", 3) == 0) + return (IPPROTO_UDP); + else if (strncasecmp(protostr, "sctp", 4) == 0) + return (IPPROTO_SCTP); + else if (strncasecmp(protostr, "icmpv6", 6) == 0) + return (IPPROTO_ICMPV6); + else if (strncasecmp(protostr, "icmp", 4) == 0) + return (IPPROTO_ICMP); + + return (0); +} + +void +dladm_flow_attr_ip2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len) +{ + flow_desc_t fdesc = attrp->fa_flow_desc; + struct in_addr ipaddr; + int prefix_len, prefix_max; + char *cp, abuf[INET6_ADDRSTRLEN]; + + if (fdesc.fd_mask & FLOW_IP_LOCAL) { + if (fdesc.fd_ipversion == IPV6_VERSION) { + (void) inet_ntop(AF_INET6, &fdesc.fd_local_addr, abuf, + INET6_ADDRSTRLEN); + cp = abuf; + prefix_max = IPV6_ABITS; + } else { + ipaddr.s_addr = fdesc.fd_local_addr._S6_un._S6_u32[3]; + cp = inet_ntoa(ipaddr); + prefix_max = IP_ABITS; + } + (void) dladm_mask2prefixlen(&fdesc.fd_local_netmask, + prefix_max, &prefix_len); + (void) snprintf(buf, buf_len, "LCL:%s/%d ", cp, prefix_len); + } else if (fdesc.fd_mask & FLOW_IP_REMOTE) { + if (fdesc.fd_ipversion == IPV6_VERSION) { + (void) inet_ntop(AF_INET6, &fdesc.fd_remote_addr, abuf, + INET6_ADDRSTRLEN); + cp = abuf; + prefix_max = IPV6_ABITS; + } else { + ipaddr.s_addr = fdesc.fd_remote_addr._S6_un._S6_u32[3]; + cp = inet_ntoa(ipaddr); + prefix_max = IP_ABITS; + } + (void) dladm_mask2prefixlen(&fdesc.fd_remote_netmask, + prefix_max, &prefix_len); + (void) snprintf(buf, buf_len, "RMT:%s/%d ", cp, prefix_len); + } else { + buf[0] = '\0'; + } +} + +void +dladm_flow_attr_proto2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len) +{ + flow_desc_t fdesc = attrp->fa_flow_desc; + + (void) snprintf(buf, buf_len, "%s", + dladm_proto2str(fdesc.fd_protocol)); +} + +void +dladm_flow_attr_port2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len) +{ + flow_desc_t fdesc = attrp->fa_flow_desc; + + if (fdesc.fd_mask & FLOW_ULP_PORT_LOCAL) { + (void) snprintf(buf, buf_len, "%d", + ntohs(fdesc.fd_local_port)); + } else { + buf[0] = '\0'; + } +} + +void +dladm_flow_attr_dsfield2str(dladm_flow_attr_t *attrp, char *buf, size_t buf_len) +{ + flow_desc_t fdesc = attrp->fa_flow_desc; + + if (fdesc.fd_mask & FLOW_IP_DSFIELD) { + (void) snprintf(buf, buf_len, "0x%x:0x%x", + fdesc.fd_dsfield, fdesc.fd_dsfield_mask); + } else { + buf[0] = '\0'; + } +} diff --git a/usr/src/lib/libdladm/common/flowprop.c b/usr/src/lib/libdladm/common/flowprop.c new file mode 100644 index 0000000000..a2125a9d33 --- /dev/null +++ b/usr/src/lib/libdladm/common/flowprop.c @@ -0,0 +1,611 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <stdlib.h> +#include <strings.h> +#include <errno.h> +#include <ctype.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/dld.h> +#include <fcntl.h> +#include <unistd.h> +#include <libdevinfo.h> +#include <libdladm_impl.h> +#include <libdlflow.h> +#include <libdlflow_impl.h> +#include <libintl.h> + +#include <dlfcn.h> +#include <link.h> + +/* + * XXX duplicate define + */ +#define DLADM_PROP_VAL_MAX 32 + +static dladm_status_t i_dladm_set_flowprop_db(const char *, const char *, + char **, uint_t); +static dladm_status_t i_dladm_get_flowprop_db(const char *, const char *, + char **, uint_t *); + +static fpd_getf_t do_get_maxbw; +static fpd_setf_t do_set_maxbw; +static fpd_checkf_t do_check_maxbw; + +static fpd_getf_t do_get_priority; +static fpd_setf_t do_set_priority; +static fpd_checkf_t do_check_priority; + +static fprop_desc_t prop_table[] = { + { "maxbw", { "", NULL }, NULL, 0, B_FALSE, + do_set_maxbw, NULL, + do_get_maxbw, do_check_maxbw}, + { "priority", { "", NULL }, NULL, 0, B_FALSE, + do_set_priority, NULL, + do_get_priority, do_check_priority} +}; + +#define DLADM_MAX_FLOWPROPS (sizeof (prop_table) / sizeof (fprop_desc_t)) + +static prop_table_t prop_tbl = { + prop_table, + DLADM_MAX_FLOWPROPS +}; + +static resource_prop_t rsrc_prop_table[] = { + {"maxbw", do_extract_maxbw}, + {"priority", do_extract_priority} +}; +#define DLADM_MAX_RSRC_PROP (sizeof (rsrc_prop_table) / \ + sizeof (resource_prop_t)) + +static dladm_status_t flow_proplist_check(dladm_arg_list_t *); + +dladm_status_t +dladm_set_flowprop(const char *flow, const char *prop_name, char **prop_val, + uint_t val_cnt, uint_t flags, char **errprop) +{ + dladm_status_t status = DLADM_STATUS_BADARG; + + if (flow == NULL || (prop_val == NULL && val_cnt > 0) || + (prop_val != NULL && val_cnt == 0) || flags == 0) + return (DLADM_STATUS_BADARG); + + if ((flags & DLADM_OPT_ACTIVE) != 0) { + status = i_dladm_set_prop_temp(flow, prop_name, prop_val, + val_cnt, flags, errprop, &prop_tbl); + if (status == DLADM_STATUS_TEMPONLY && + (flags & DLADM_OPT_PERSIST) != 0) + return (DLADM_STATUS_TEMPONLY); + if (status != DLADM_STATUS_OK) + return (status); + } + if ((flags & DLADM_OPT_PERSIST) != 0) { + if (i_dladm_is_prop_temponly(prop_name, errprop, &prop_tbl)) + return (DLADM_STATUS_TEMPONLY); + + status = i_dladm_set_flowprop_db(flow, prop_name, + prop_val, val_cnt); + } + return (status); +} + +dladm_status_t +dladm_walk_flowprop(int (*func)(void *, const char *), const char *flow, + void *arg) +{ + int i; + + if (flow == NULL || func == NULL) + return (DLADM_STATUS_BADARG); + + /* Then show data-flow properties if there are any */ + for (i = 0; i < DLADM_MAX_FLOWPROPS; i++) { + if (func(arg, prop_table[i].pd_name) != DLADM_WALK_CONTINUE) + break; + } + return (DLADM_STATUS_OK); +} + +dladm_status_t +dladm_get_flowprop(const char *flow, uint32_t type, + const char *prop_name, char **prop_val, uint_t *val_cntp) +{ + dladm_status_t status; + + if (flow == NULL || prop_name == NULL || prop_val == NULL || + val_cntp == NULL || *val_cntp == 0) + return (DLADM_STATUS_BADARG); + + if (type == DLADM_PROP_VAL_PERSISTENT) { + if (i_dladm_is_prop_temponly(prop_name, NULL, &prop_tbl)) + return (DLADM_STATUS_TEMPONLY); + return (i_dladm_get_flowprop_db(flow, prop_name, + prop_val, val_cntp)); + } + + status = i_dladm_get_prop_temp(flow, type, prop_name, + prop_val, val_cntp, &prop_tbl); + if (status != DLADM_STATUS_NOTFOUND) + return (status); + + return (DLADM_STATUS_BADARG); +} + +#define FLOWPROP_RW_DB(statep, 
writeop) \ + (i_dladm_rw_db("/etc/dladm/flowprop.conf", \ + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH, process_prop_db, \ + (statep), (writeop))) + +static dladm_status_t +i_dladm_set_flowprop_db(const char *flow, const char *prop_name, + char **prop_val, uint_t val_cnt) +{ + prop_db_state_t state; + + state.ls_op = process_prop_set; + state.ls_name = flow; + state.ls_propname = prop_name; + state.ls_propval = prop_val; + state.ls_valcntp = &val_cnt; + state.ls_initop = NULL; + + return (FLOWPROP_RW_DB(&state, B_TRUE)); +} + +static dladm_status_t +i_dladm_get_flowprop_db(const char *flow, const char *prop_name, + char **prop_val, uint_t *val_cntp) +{ + prop_db_state_t state; + + state.ls_op = process_prop_get; + state.ls_name = flow; + state.ls_propname = prop_name; + state.ls_propval = prop_val; + state.ls_valcntp = val_cntp; + state.ls_initop = NULL; + + return (FLOWPROP_RW_DB(&state, B_FALSE)); +} + +dladm_status_t +i_dladm_init_flowprop_db(void) +{ + prop_db_state_t state; + + state.ls_op = process_prop_init; + state.ls_name = NULL; + state.ls_propname = NULL; + state.ls_propval = NULL; + state.ls_valcntp = NULL; + state.ls_initop = dladm_set_flowprop; + + return (FLOWPROP_RW_DB(&state, B_FALSE)); +} + +#define MIN_INFO_SIZE (4 * 1024) + +dladm_status_t +dladm_flow_info(const char *flow, dladm_flow_attr_t *attr) +{ + dld_ioc_walkflow_t *ioc; + int bufsize, fd; + dld_flowinfo_t *flowinfo; + + if ((flow == NULL) || (attr == NULL)) + return (DLADM_STATUS_BADARG); + + if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) + return (dladm_errno2status(errno)); + + bufsize = MIN_INFO_SIZE; + if ((ioc = calloc(1, bufsize)) == NULL) { + (void) close(fd); + return (dladm_errno2status(errno)); + } + + (void) strlcpy(ioc->wf_name, flow, sizeof (ioc->wf_name)); + ioc->wf_len = bufsize - sizeof (*ioc); + + while (ioctl(fd, DLDIOC_WALKFLOW, ioc) < 0) { + if (errno == ENOSPC) { + bufsize *= 2; + ioc = realloc(ioc, bufsize); + if (ioc != NULL) { + (void) strlcpy(ioc->wf_name, flow, + MAXNAMELEN); + ioc->wf_len = bufsize - sizeof (*ioc); + continue; + } + } + free(ioc); + (void) close(fd); + return (dladm_errno2status(errno)); + } + + bzero(attr, sizeof (*attr)); + + flowinfo = (dld_flowinfo_t *)(void *)(ioc + 1); + + attr->fa_linkid = flowinfo->fi_linkid; + bcopy(&flowinfo->fi_flowname, &attr->fa_flowname, + sizeof (attr->fa_flowname)); + bcopy(&flowinfo->fi_flow_desc, &attr->fa_flow_desc, + sizeof (attr->fa_flow_desc)); + bcopy(&flowinfo->fi_resource_props, &attr->fa_resource_props, + sizeof (attr->fa_resource_props)); + + free(ioc); + (void) close(fd); + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_get_maxbw(const char *flow, char **prop_val, uint_t *val_cnt) +{ + mac_resource_props_t *mrp; + char buf[DLADM_STRSIZE]; + dladm_flow_attr_t fa; + dladm_status_t status; + + status = dladm_flow_info(flow, &fa); + if (status != DLADM_STATUS_OK) + return (status); + mrp = &(fa.fa_resource_props); + + *val_cnt = 1; + if (mrp->mrp_mask & MRP_MAXBW) { + (void) snprintf(prop_val[0], DLADM_STRSIZE, "%s", + dladm_bw2str(mrp->mrp_maxbw, buf)); + } else { + return (DLADM_STATUS_NOTSUP); + } + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_set_maxbw(const char *flow, val_desc_t *vdp, uint_t val_cnt) +{ + dld_ioc_modifyflow_t attr; + int fd; + mac_resource_props_t mrp; + void *val; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + bzero(&mrp, sizeof (mrp)); + if (vdp != NULL && (val = (void *)vdp->vd_val) != NULL) { + bcopy(val, &mrp.mrp_maxbw, sizeof (int64_t)); 
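+		/*
+		 * vd_val was allocated by the corresponding check routine
+		 * (do_check_maxbw()); it has been copied into mrp and can
+		 * be released now.
+		 */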
+ free(val); + } else { + mrp.mrp_maxbw = MRP_MAXBW_RESETVAL; + } + mrp.mrp_mask = MRP_MAXBW; + + bzero(&attr, sizeof (attr)); + (void) strlcpy(attr.mf_name, flow, sizeof (attr.mf_name)); + bcopy(&mrp, &attr.mf_resource_props, sizeof (mac_resource_props_t)); + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) { + return (dladm_errno2status(errno)); + } + + if (ioctl(fd, DLDIOC_MODIFYFLOW, &attr) < 0) { + (void) close(fd); + return (dladm_errno2status(errno)); + } + (void) close(fd); + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_check_maxbw(fprop_desc_t *pdp, char **prop_val, uint_t val_cnt, + val_desc_t **vdpp) +{ + uint64_t *maxbw; + val_desc_t *vdp = NULL; + dladm_status_t status = DLADM_STATUS_OK; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + maxbw = malloc(sizeof (uint64_t)); + if (maxbw == NULL) + return (DLADM_STATUS_NOMEM); + + status = dladm_str2bw(*prop_val, maxbw); + if (status != DLADM_STATUS_OK) { + free(maxbw); + return (status); + } + + if ((*maxbw < MRP_MAXBW_MINVAL) && (*maxbw != 0)) { + free(maxbw); + return (DLADM_STATUS_MINMAXBW); + } + + vdp = malloc(sizeof (val_desc_t)); + if (vdp == NULL) { + free(maxbw); + return (DLADM_STATUS_NOMEM); + } + + vdp->vd_val = (uintptr_t)maxbw; + *vdpp = vdp; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_get_priority(const char *flow, char **prop_val, uint_t *val_cnt) +{ + mac_resource_props_t *mrp; + char buf[DLADM_STRSIZE]; + dladm_flow_attr_t fa; + dladm_status_t status; + + bzero(&fa, sizeof (dladm_flow_attr_t)); + status = dladm_flow_info(flow, &fa); + if (status != DLADM_STATUS_OK) + return (status); + mrp = &(fa.fa_resource_props); + + *val_cnt = 1; + if (mrp->mrp_mask & MRP_PRIORITY) { + (void) snprintf(prop_val[0], DLADM_STRSIZE, "%s", + dladm_pri2str(mrp->mrp_priority, buf)); + } else { + return (DLADM_STATUS_NOTSUP); + } + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_set_priority(const char *flow, val_desc_t *vdp, uint_t val_cnt) +{ + dld_ioc_modifyflow_t attr; + int fd; + mac_resource_props_t mrp; + void *val; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + bzero(&mrp, sizeof (mrp)); + if (vdp != NULL && (val = (void *)vdp->vd_val) != NULL) { + bcopy(val, &mrp.mrp_priority, sizeof (mac_priority_level_t)); + free(val); + } else { + mrp.mrp_priority = MPL_RESET; + } + mrp.mrp_mask = MRP_PRIORITY; + + bzero(&attr, sizeof (attr)); + (void) strlcpy(attr.mf_name, flow, sizeof (attr.mf_name)); + bcopy(&mrp, &attr.mf_resource_props, sizeof (mac_resource_props_t)); + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) { + return (dladm_errno2status(errno)); + } + + if (ioctl(fd, DLDIOC_MODIFYFLOW, &attr) < 0) { + (void) close(fd); + return (dladm_errno2status(errno)); + } + (void) close(fd); + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_check_priority(fprop_desc_t *pdp, char **prop_val, uint_t val_cnt, + val_desc_t **vdpp) +{ + mac_priority_level_t *pri; + val_desc_t *vdp = NULL; + dladm_status_t status = DLADM_STATUS_OK; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + pri = malloc(sizeof (mac_priority_level_t)); + if (pri == NULL) + return (DLADM_STATUS_NOMEM); + + status = dladm_str2pri(*prop_val, pri); + if (status != DLADM_STATUS_OK) { + free(pri); + return (status); + } + + if (*pri == -1) { + free(pri); + return (DLADM_STATUS_BADVAL); + } + + vdp = malloc(sizeof (val_desc_t)); + if (vdp == NULL) { + free(pri); + return (DLADM_STATUS_NOMEM); + } + + vdp->vd_val = 
(uintptr_t)pri;
+	*vdpp = vdp;
+	return (DLADM_STATUS_OK);
+}
+
+static dladm_status_t
+flow_proplist_check(dladm_arg_list_t *proplist)
+{
+	int		i, j;
+	boolean_t	matched;
+
+	for (i = 0; i < proplist->al_count; i++) {
+		matched = B_FALSE;
+		for (j = 0; j < DLADM_MAX_FLOWPROPS; j++) {
+			if (strcmp(proplist->al_info[i].ai_name,
+			    prop_table[j].pd_name) == 0)
+				matched = B_TRUE;
+		}
+		if (!matched)
+			return (DLADM_STATUS_BADPROP);
+	}
+	return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_parse_flow_props(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+	dladm_status_t	status;
+
+	status = dladm_parse_args(str, listp, novalues);
+	if (status != DLADM_STATUS_OK)
+		return (status);
+
+	status = flow_proplist_check(*listp);
+	if (status != DLADM_STATUS_OK) {
+		dladm_free_props(*listp);
+		return (status);
+	}
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Retrieve the named property from a proplist, check the value and
+ * convert to a kernel structure.
+ */
+static dladm_status_t
+i_dladm_flow_proplist_extract_one(dladm_arg_list_t *proplist,
+    const char *name, void *val)
+{
+	dladm_status_t		status = DLADM_STATUS_OK;
+	dladm_arg_info_t	*aip = NULL;
+	int			i, j;
+
+	/* Find named property in proplist */
+	for (i = 0; i < proplist->al_count; i++) {
+		aip = &proplist->al_info[i];
+		if (strcasecmp(aip->ai_name, name) == 0)
+			break;
+	}
+
+	/* Property not in list */
+	if (i == proplist->al_count)
+		return (DLADM_STATUS_OK);
+
+	for (i = 0; i < DLADM_MAX_FLOWPROPS; i++) {
+		fprop_desc_t	*pdp = &prop_table[i];
+		val_desc_t	*vdp = NULL;
+
+		if (strcasecmp(aip->ai_name, pdp->pd_name) != 0)
+			continue;
+
+		if (aip->ai_val == NULL)
+			return (DLADM_STATUS_BADARG);
+
+		/* Check the property value; pd_check() allocates vdp */
+		if (pdp->pd_check != NULL) {
+			status = pdp->pd_check(pdp, aip->ai_val,
+			    aip->ai_count, &vdp);
+		} else {
+			status = DLADM_STATUS_BADARG;
+		}
+
+		if (status != DLADM_STATUS_OK)
+			return (status);
+
+		for (j = 0; j < DLADM_MAX_RSRC_PROP; j++) {
+			resource_prop_t	*rpp = &rsrc_prop_table[j];
+
+			if (strcasecmp(aip->ai_name, rpp->rp_name) != 0)
+				continue;
+
+			/* Extract kernel structure */
+			if (rpp->rp_extract != NULL) {
+				status = rpp->rp_extract(vdp, val,
+				    aip->ai_count);
+			} else {
+				status = DLADM_STATUS_BADARG;
+			}
+			break;
+		}
+
+		/* release the checked value allocated by pd_check() */
+		if (vdp != NULL)
+			free((void *)vdp->vd_val);
+		free(vdp);
+
+		if (status != DLADM_STATUS_OK)
+			return (status);
+
+		break;
+	}
+	return (status);
+}
+
+/*
+ * Extract properties from a proplist and convert to mac_resource_props_t.
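+ * Currently only the "maxbw" and "priority" flow properties are
+ * extracted (see rsrc_prop_table above).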
+ */ +dladm_status_t +dladm_flow_proplist_extract(dladm_arg_list_t *proplist, + mac_resource_props_t *mrp) +{ + dladm_status_t status = DLADM_STATUS_OK; + + status = i_dladm_flow_proplist_extract_one(proplist, "maxbw", mrp); + if (status != DLADM_STATUS_OK) + return (status); + status = i_dladm_flow_proplist_extract_one(proplist, "priority", mrp); + if (status != DLADM_STATUS_OK) + return (status); + return (status); +} + +dladm_status_t +i_dladm_set_flow_proplist_db(char *flow, dladm_arg_list_t *proplist) +{ + dladm_status_t status, ssave = DLADM_STATUS_OK; + dladm_arg_info_t ai; + int i; + + for (i = 0; i < proplist->al_count; i++) { + ai = proplist->al_info[i]; + status = i_dladm_set_flowprop_db(flow, ai.ai_name, + ai.ai_val, ai.ai_count); + if (status != DLADM_STATUS_OK) + ssave = status; + } + return (ssave); +} diff --git a/usr/src/lib/libdladm/common/libdladm.c b/usr/src/lib/libdladm/common/libdladm.c index fa588df066..cc6bf542f7 100644 --- a/usr/src/lib/libdladm/common/libdladm.c +++ b/usr/src/lib/libdladm/common/libdladm.c @@ -29,6 +29,7 @@ #include <fcntl.h> #include <strings.h> #include <dirent.h> +#include <stdlib.h> #include <sys/param.h> #include <sys/stat.h> #include <libdladm_impl.h> @@ -89,7 +90,7 @@ dladm_status2str(dladm_status_t status, char *buf) s = "I/O error"; break; case DLADM_STATUS_TEMPONLY: - s = "change cannot be persistent, specify -t please"; + s = "change cannot be persistent"; break; case DLADM_STATUS_TIMEDOUT: s = "operation timed out"; @@ -127,6 +128,117 @@ dladm_status2str(dladm_status_t status, char *buf) case DLADM_STATUS_NONOTIF: s = "link notification is not supported"; break; + case DLADM_STATUS_BADTIMEVAL: + s = "invalid time range"; + break; + case DLADM_STATUS_INVALIDMACADDR: + s = "invalid MAC address value"; + break; + case DLADM_STATUS_INVALIDMACADDRNIC: + s = "MAC address reserved for use by underlying data-link"; + break; + case DLADM_STATUS_INVALIDMACADDRINUSE: + s = "MAC address is already in use"; + break; + case DLADM_STATUS_MACFACTORYSLOTINVALID: + s = "invalid factory MAC address slot"; + break; + case DLADM_STATUS_MACFACTORYSLOTUSED: + s = "factory MAC address slot already used"; + break; + case DLADM_STATUS_MACFACTORYSLOTALLUSED: + s = "all factory MAC address slots are in use"; + break; + case DLADM_STATUS_MACFACTORYNOTSUP: + s = "factory MAC address slots not supported"; + break; + case DLADM_STATUS_INVALIDMACPREFIX: + s = "Invalid MAC address prefix value"; + break; + case DLADM_STATUS_INVALIDMACPREFIXLEN: + s = "Invalid MAC address prefix length"; + break; + case DLADM_STATUS_CPUMAX: + s = "non-existent processor ID"; + break; + case DLADM_STATUS_CPUERR: + s = "could not determine processor status"; + break; + case DLADM_STATUS_CPUNOTONLINE: + s = "processor not online"; + break; + case DLADM_STATUS_DB_NOTFOUND: + s = "database not found"; + break; + case DLADM_STATUS_DB_PARSE_ERR: + s = "database parse error"; + break; + case DLADM_STATUS_PROP_PARSE_ERR: + s = "property parse error"; + break; + case DLADM_STATUS_ATTR_PARSE_ERR: + s = "attribute parse error"; + break; + case DLADM_STATUS_FLOW_DB_ERR: + s = "flow database error"; + break; + case DLADM_STATUS_FLOW_DB_OPEN_ERR: + s = "flow database open error"; + break; + case DLADM_STATUS_FLOW_DB_PARSE_ERR: + s = "flow database parse error"; + break; + case DLADM_STATUS_FLOWPROP_DB_PARSE_ERR: + s = "flow property database parse error"; + break; + case DLADM_STATUS_FLOW_ADD_ERR: + s = "flow add error"; + break; + case DLADM_STATUS_FLOW_WALK_ERR: + s = "flow walk error"; + break; + 
case DLADM_STATUS_FLOW_IDENTICAL:
+		s = "a flow with identical attributes exists";
+		break;
+	case DLADM_STATUS_FLOW_INCOMPATIBLE:
+		s = "flow(s) with incompatible attributes exists";
+		break;
+	case DLADM_STATUS_FLOW_EXISTS:
+		s = "link still has flows";
+		break;
+	case DLADM_STATUS_PERSIST_FLOW_EXISTS:
+		s = "persistent flow with the same name exists";
+		break;
+	case DLADM_STATUS_INVALID_IP:
+		s = "invalid IP address";
+		break;
+	case DLADM_STATUS_INVALID_PREFIXLEN:
+		s = "invalid IP prefix length";
+		break;
+	case DLADM_STATUS_INVALID_PROTOCOL:
+		s = "invalid IP protocol";
+		break;
+	case DLADM_STATUS_INVALID_PORT:
+		s = "invalid port number";
+		break;
+	case DLADM_STATUS_INVALID_DSF:
+		s = "invalid dsfield";
+		break;
+	case DLADM_STATUS_INVALID_DSFMASK:
+		s = "invalid dsfield mask";
+		break;
+	case DLADM_STATUS_INVALID_MACMARGIN:
+		s = "MTU check failed, use lower MTU or -f option";
+		break;
+	case DLADM_STATUS_BADPROP:
+		s = "invalid property";
+		break;
+	case DLADM_STATUS_MINMAXBW:
+		s = "minimum value for maxbw is 1.2M";
+		break;
+	case DLADM_STATUS_NO_HWRINGS:
+		s = "request for hardware rings failed";
+		break;
	default:
		s = "<unknown error>";
		break;
@@ -169,11 +281,100 @@ dladm_errno2status(int err)
		return (DLADM_STATUS_LINKBUSY);
	case EAGAIN:
		return (DLADM_STATUS_TRYAGAIN);
+	case ENOTEMPTY:
+		return (DLADM_STATUS_FLOW_EXISTS);
+	case EOPNOTSUPP:
+		return (DLADM_STATUS_FLOW_INCOMPATIBLE);
+	case EALREADY:
+		return (DLADM_STATUS_FLOW_IDENTICAL);
	default:
		return (DLADM_STATUS_FAILED);
	}
}

+dladm_status_t
+dladm_str2bw(char *oarg, uint64_t *bw)
+{
+	char		*endp = NULL;
+	int64_t		n;
+	int		mult = 1;
+
+	errno = 0;
+	n = strtoull(oarg, &endp, 10);
+
+	if ((errno != 0) || (strlen(endp) > 1))
+		return (DLADM_STATUS_BADARG);
+
+	if (n < 0)
+		return (DLADM_STATUS_BADVAL);
+
+	switch (*endp) {
+	case 'k':
+	case 'K':
+		mult = 1000;
+		break;
+	case 'm':
+	case 'M':
+	case '\0':
+		mult = 1000000;
+		break;
+	case 'g':
+	case 'G':
+		mult = 1000000000;
+		break;
+	case '%':
+		/*
+		 * percentages not supported for now,
+		 * see RFE 6540675
+		 */
+		return (DLADM_STATUS_NOTSUP);
+	default:
+		return (DLADM_STATUS_BADVAL);
+	}
+
+	*bw = n * mult;
+
+	/* check for overflow */
+	if (*bw / mult != n)
+		return (DLADM_STATUS_BADARG);
+
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert bandwidth in bps to a string in Mbps.  For values of
+ * 1 Mbps (1000000 bps) and above with no fractional part, print a
+ * whole Mbps value.  For values with fractional Mbps (whole Kbps),
+ * print the bandwidth in a manner similar to a floating point format.
+ *
+ *        bps       string
+ *          0            0
+ *        100            0
+ *       2000        0.002
+ *     431000        0.431
+ *    1000000            1
+ *    1030000        1.030
+ *  100000000          100
+ */
+const char *
+dladm_bw2str(int64_t bw, char *buf)
+{
+	int kbps, mbps;
+
+	kbps = (bw%1000000)/1000;
+	mbps = bw/1000000;
+	if (kbps != 0) {
+		if (mbps == 0)
+			(void) snprintf(buf, DLADM_STRSIZE, "0.%03u", kbps);
+		else
+			(void) snprintf(buf, DLADM_STRSIZE, "%5u.%03u", mbps,
+			    kbps);
+	} else {
+		(void) snprintf(buf, DLADM_STRSIZE, "%5u", mbps);
+	}
+
+	return (buf);
+}
+
#define	LOCK_DB_PERMS	S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH

static int
@@ -241,6 +442,9 @@ dladm_class2str(datalink_class_t class, char *buf)
	case DATALINK_CLASS_VNIC:
		s = "vnic";
		break;
+	case DATALINK_CLASS_ETHERSTUB:
+		s = "etherstub";
+		break;
	default:
		s = "unknown";
		break;
@@ -491,3 +695,123 @@ dladm_valid_linkname(const char *link)

	return (B_TRUE);
}
+
+/*
+ * Convert priority string to a value.
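+ * Accepts exactly "low", "medium" or "high" (case-insensitively); any
+ * other token is rejected with DLADM_STATUS_BADVAL.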
+ */
+dladm_status_t
+dladm_str2pri(char *token, mac_priority_level_t *pri)
+{
+	if (strlen(token) == strlen("low") &&
+	    strncasecmp(token, "low", strlen("low")) == 0) {
+		*pri = MPL_LOW;
+	} else if (strlen(token) == strlen("medium") &&
+	    strncasecmp(token, "medium", strlen("medium")) == 0) {
+		*pri = MPL_MEDIUM;
+	} else if (strlen(token) == strlen("high") &&
+	    strncasecmp(token, "high", strlen("high")) == 0) {
+		*pri = MPL_HIGH;
+	} else {
+		return (DLADM_STATUS_BADVAL);
+	}
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Convert priority value to a string.
+ */
+const char *
+dladm_pri2str(mac_priority_level_t pri, char *buf)
+{
+	const char	*s;
+
+	switch (pri) {
+	case MPL_LOW:
+		s = "low";
+		break;
+	case MPL_MEDIUM:
+		s = "medium";
+		break;
+	case MPL_HIGH:
+		s = "high";
+		break;
+	default:
+		s = "--";
+		break;
+	}
+	(void) snprintf(buf, DLADM_STRSIZE, "%s", dgettext(TEXT_DOMAIN, s));
+	return (buf);
+}
+
+void
+dladm_free_args(dladm_arg_list_t *list)
+{
+	if (list != NULL) {
+		free(list->al_buf);
+		free(list);
+	}
+}
+
+dladm_status_t
+dladm_parse_args(char *str, dladm_arg_list_t **listp, boolean_t novalues)
+{
+	dladm_arg_list_t	*list;
+	dladm_arg_info_t	*aip;
+	char			*buf, *curr;
+	int			len, i;
+
+	list = malloc(sizeof (dladm_arg_list_t));
+	if (list == NULL)
+		return (dladm_errno2status(errno));
+
+	list->al_count = 0;
+	list->al_buf = buf = strdup(str);
+	if (buf == NULL) {
+		free(list);
+		return (dladm_errno2status(errno));
+	}
+
+	curr = buf;
+	len = strlen(buf);
+	aip = NULL;
+	for (i = 0; i < len; i++) {
+		char		c = buf[i];
+		boolean_t	match = (c == '=' || c == ',');
+
+		if (!match && i != len - 1)
+			continue;
+
+		if (match) {
+			buf[i] = '\0';
+			if (*curr == '\0')
+				goto fail;
+		}
+
+		if (aip != NULL && c != '=') {
+			if (aip->ai_count >= DLADM_MAX_ARG_VALS)
+				goto fail;
+
+			if (novalues)
+				goto fail;
+
+			aip->ai_val[aip->ai_count] = curr;
+			aip->ai_count++;
+		} else {
+			if (list->al_count >= DLADM_MAX_ARG_CNT)
+				goto fail;
+
+			aip = &list->al_info[list->al_count];
+			aip->ai_name = curr;
+			aip->ai_count = 0;
+			list->al_count++;
+			if (c == ',')
+				aip = NULL;
+		}
+		curr = buf + i + 1;
+	}
+
+	*listp = list;
+	return (DLADM_STATUS_OK);
+
+fail:
+	dladm_free_args(list);
+	return (DLADM_STATUS_FAILED);
+}
diff --git a/usr/src/lib/libdladm/common/libdladm.h b/usr/src/lib/libdladm/common/libdladm.h
index df69a54615..a76245d478 100644
--- a/usr/src/lib/libdladm/common/libdladm.h
+++ b/usr/src/lib/libdladm/common/libdladm.h
@@ -26,7 +26,7 @@
#ifndef _LIBDLADM_H
#define	_LIBDLADM_H

-#include <sys/dls.h>
+#include <sys/dls_mgmt.h>
#include <sys/dlpi.h>

/*
@@ -60,16 +60,28 @@ extern "C" {
 *
 * - DLADM_OPT_PREFIX:
 *    The function requests to generate a link name using the specified prefix.
+ *
+ * - DLADM_OPT_VLAN:
+ *    Signifies the VLAN creation code path.
+ *
+ * - DLADM_OPT_HWRINGS:
+ *    Requires a hardware group of rings when creating a vnic.
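+ *
+ * - DLADM_OPT_ANCHOR:
+ *    Presumably marks creation over an anchor link (e.g. an etherstub)
+ *    rather than a physical link; not otherwise documented here.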
*/ #define DLADM_OPT_ACTIVE 0x00000001 #define DLADM_OPT_PERSIST 0x00000002 #define DLADM_OPT_CREATE 0x00000004 #define DLADM_OPT_FORCE 0x00000008 #define DLADM_OPT_PREFIX 0x00000010 +#define DLADM_OPT_ANCHOR 0x00000020 +#define DLADM_OPT_VLAN 0x00000040 +#define DLADM_OPT_HWRINGS 0x00000080 #define DLADM_WALK_TERMINATE 0 #define DLADM_WALK_CONTINUE -1 +#define DLADM_MAX_ARG_CNT 32 +#define DLADM_MAX_ARG_VALS 32 + typedef enum { DLADM_STATUS_OK = 0, DLADM_STATUS_BADARG, @@ -99,7 +111,44 @@ typedef enum { DLADM_STATUS_VIDINVAL, DLADM_STATUS_NONOTIF, DLADM_STATUS_TRYAGAIN, - DLADM_STATUS_NOTDEFINED + DLADM_STATUS_BADTIMEVAL, + DLADM_STATUS_INVALIDMACADDR, + DLADM_STATUS_INVALIDMACADDRNIC, + DLADM_STATUS_INVALIDMACADDRINUSE, + DLADM_STATUS_MACFACTORYSLOTINVALID, + DLADM_STATUS_MACFACTORYSLOTUSED, + DLADM_STATUS_MACFACTORYSLOTALLUSED, + DLADM_STATUS_MACFACTORYNOTSUP, + DLADM_STATUS_INVALIDMACPREFIX, + DLADM_STATUS_INVALIDMACPREFIXLEN, + DLADM_STATUS_CPUMAX, + DLADM_STATUS_CPUERR, + DLADM_STATUS_CPUNOTONLINE, + DLADM_STATUS_DB_NOTFOUND, + DLADM_STATUS_DB_PARSE_ERR, + DLADM_STATUS_PROP_PARSE_ERR, + DLADM_STATUS_ATTR_PARSE_ERR, + DLADM_STATUS_FLOW_DB_ERR, + DLADM_STATUS_FLOW_DB_OPEN_ERR, + DLADM_STATUS_FLOW_DB_PARSE_ERR, + DLADM_STATUS_FLOWPROP_DB_PARSE_ERR, + DLADM_STATUS_FLOW_ADD_ERR, + DLADM_STATUS_FLOW_WALK_ERR, + DLADM_STATUS_FLOW_IDENTICAL, + DLADM_STATUS_FLOW_INCOMPATIBLE, + DLADM_STATUS_FLOW_EXISTS, + DLADM_STATUS_PERSIST_FLOW_EXISTS, + DLADM_STATUS_INVALID_IP, + DLADM_STATUS_INVALID_PREFIXLEN, + DLADM_STATUS_INVALID_PROTOCOL, + DLADM_STATUS_INVALID_PORT, + DLADM_STATUS_INVALID_DSF, + DLADM_STATUS_INVALID_DSFMASK, + DLADM_STATUS_INVALID_MACMARGIN, + DLADM_STATUS_NOTDEFINED, + DLADM_STATUS_BADPROP, + DLADM_STATUS_MINMAXBW, + DLADM_STATUS_NO_HWRINGS } dladm_status_t; typedef enum { @@ -111,11 +160,63 @@ typedef enum { typedef int dladm_conf_t; #define DLADM_INVALID_CONF 0 +typedef struct dladm_arg_info { + const char *ai_name; + char *ai_val[DLADM_MAX_ARG_VALS]; + uint_t ai_count; +} dladm_arg_info_t; + +typedef struct dladm_arg_list { + dladm_arg_info_t al_info[DLADM_MAX_ARG_CNT]; + uint_t al_count; + char *al_buf; +} dladm_arg_list_t; + +typedef enum { + DLADM_LOGTYPE_LINK = 1, + DLADM_LOGTYPE_FLOW +} dladm_logtype_t; + +typedef struct dladm_usage { + char du_name[MAXLINKNAMELEN]; + uint64_t du_duration; + uint64_t du_stime; + uint64_t du_etime; + uint64_t du_ipackets; + uint64_t du_rbytes; + uint64_t du_opackets; + uint64_t du_obytes; + uint64_t du_bandwidth; + boolean_t du_last; +} dladm_usage_t; + extern const char *dladm_status2str(dladm_status_t, char *); extern dladm_status_t dladm_set_rootdir(const char *); extern const char *dladm_class2str(datalink_class_t, char *); extern const char *dladm_media2str(uint32_t, char *); extern boolean_t dladm_valid_linkname(const char *); +extern dladm_status_t dladm_str2bw(char *, uint64_t *); +extern const char *dladm_bw2str(int64_t, char *); + +extern dladm_status_t dladm_parse_flow_props(char *, dladm_arg_list_t **, + boolean_t); +extern dladm_status_t dladm_parse_link_props(char *, dladm_arg_list_t **, + boolean_t); +extern void dladm_free_props(dladm_arg_list_t *); +extern dladm_status_t dladm_parse_flow_attrs(char *, dladm_arg_list_t **, + boolean_t); +extern void dladm_free_attrs(dladm_arg_list_t *); + +extern dladm_status_t dladm_start_usagelog(dladm_logtype_t, uint_t); +extern dladm_status_t dladm_stop_usagelog(dladm_logtype_t); +extern dladm_status_t dladm_walk_usage_res(int (*)(dladm_usage_t *, void *), + int, char *, char *, char *, 
char *, void *); +extern dladm_status_t dladm_walk_usage_time(int (*)(dladm_usage_t *, void *), + int, char *, char *, char *, void *); +extern dladm_status_t dladm_usage_summary(int (*)(dladm_usage_t *, void *), + int, char *, void *); +extern dladm_status_t dladm_usage_dates(int (*)(dladm_usage_t *, void *), + int, char *, char *, void *); #ifdef __cplusplus } diff --git a/usr/src/lib/libdladm/common/libdladm_impl.h b/usr/src/lib/libdladm/common/libdladm_impl.h index d4a5a52445..41f09b3a46 100644 --- a/usr/src/lib/libdladm/common/libdladm_impl.h +++ b/usr/src/lib/libdladm/common/libdladm_impl.h @@ -36,18 +36,17 @@ extern "C" { #define MAXLINELEN 1024 #define BUFLEN(lim, ptr) (((lim) > (ptr)) ? ((lim) - (ptr)) : 0) -typedef struct val_desc { - char *vd_name; - uintptr_t vd_val; -} val_desc_t; - -#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t)) - extern dladm_status_t dladm_errno2status(int); extern dladm_status_t i_dladm_rw_db(const char *, mode_t, dladm_status_t (*)(void *, FILE *, FILE *), void *, boolean_t); +extern const char *dladm_pri2str(mac_priority_level_t, char *); +extern dladm_status_t dladm_str2pri(char *, mac_priority_level_t *); +extern dladm_status_t dladm_parse_args(char *, dladm_arg_list_t **, + boolean_t); +extern void dladm_free_args(dladm_arg_list_t *); + /* * Link attributes persisted by dlmgmtd. */ @@ -65,11 +64,64 @@ extern dladm_status_t i_dladm_rw_db(const char *, mode_t, #define FPORTS "portnames" /* string */ #define FPOLICY "policy" /* uint64_t */ #define FFIXMACADDR "fix_macaddr" /* boolean_t */ -#define FMACADDR "macaddr" /* string */ #define FFORCE "force" /* boolean_t */ #define FLACPMODE "lacp_mode" /* uint64_t */ #define FLACPTIMER "lacp_timer" /* uint64_t */ +/* + * Set for VNICs only + */ +#define FMADDRTYPE "maddrtype" /* uint64_t */ +#define FMADDRLEN "maddrlen" /* uint64_t */ +#define FMADDRSLOT "maddrslot" /* uint64_t */ +#define FMADDRPREFIXLEN "maddrpreflen" /* uint64_t */ +#define FHWRINGS "hwrings" /* boolean_t */ + +/* + * Common fields + */ +#define FMACADDR "macaddr" /* string */ + +/* + * Data structures used for implementing temporary properties + */ + +typedef struct val_desc { + char *vd_name; + uintptr_t vd_val; +} val_desc_t; + +#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t)) + +extern dladm_status_t dladm_link_proplist_extract(dladm_arg_list_t *, + mac_resource_props_t *); + +extern dladm_status_t dladm_flow_proplist_extract(dladm_arg_list_t *, + mac_resource_props_t *); + +/* + * The prop extract() callback. + * + * rp_extract extracts the kernel structure from the val_desc_t created + * by the pd_check function. 
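+ * For example, do_extract_maxbw is expected to copy the bandwidth limit
+ * produced by do_check_maxbw into the mac_resource_props_t passed via
+ * the arg pointer.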
+ */ +typedef dladm_status_t rp_extractf_t(val_desc_t *propval, void *arg, + uint_t cnt); +extern rp_extractf_t do_extract_maxbw, do_extract_priority, + do_extract_cpus; + +typedef struct resource_prop_s { + /* + * resource property name + */ + char *rp_name; + + /* + * callback to extract kernel structure + */ + rp_extractf_t *rp_extract; +} resource_prop_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/libdladm/common/libdlaggr.c b/usr/src/lib/libdladm/common/libdlaggr.c index dba84441ea..5a155fcad9 100644 --- a/usr/src/lib/libdladm/common/libdlaggr.c +++ b/usr/src/lib/libdladm/common/libdlaggr.c @@ -37,6 +37,7 @@ #include <libintl.h> #include <net/if_types.h> #include <net/if_dl.h> +#include <sys/dld.h> #include <libdllink.h> #include <libdlvlan.h> #include <libdlaggr.h> @@ -1110,7 +1111,7 @@ dladm_aggr_create(const char *name, uint16_t key, uint32_t nports, for (i = 0; i < nports; i++) { if ((dladm_datalink_id2info(ports[i].lp_linkid, NULL, &class, &media, NULL, 0) != DLADM_STATUS_OK) || - (class != DATALINK_CLASS_PHYS) && (media != DL_ETHER)) { + !((class == DATALINK_CLASS_PHYS) && (media == DL_ETHER))) { return (DLADM_STATUS_BADARG); } } diff --git a/usr/src/lib/libdladm/common/libdlflow.c b/usr/src/lib/libdladm/common/libdlflow.c new file mode 100644 index 0000000000..3ec77705a7 --- /dev/null +++ b/usr/src/lib/libdladm/common/libdlflow.c @@ -0,0 +1,903 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ethernet.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stropts.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <strings.h>
+#include <libintl.h>
+#include <netdb.h>
+#include <net/if_types.h>
+#include <net/if_dl.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <libdlflow.h>
+#include <libdlflow_impl.h>
+#include <libdladm_impl.h>
+
+/* minimum buffer size for DLDIOCWALKFLOW */
+#define	MIN_INFO_SIZE	(4 * 1024)
+
+#define	DLADM_FLOW_DB		"/etc/dladm/flowadm.conf"
+#define	DLADM_FLOW_DB_TMP	"/etc/dladm/flowadm.conf.new"
+#define	DLADM_FLOW_DB_LOCK	"/tmp/flowadm.conf.lock"
+
+#define	DLADM_FLOW_DB_PERMS	S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH
+#define	DLADM_FLOW_DB_OWNER	UID_DLADM
+#define	DLADM_FLOW_DB_GROUP	GID_SYS
+
+#define	BLANK_LINE(s)	((s[0] == '\0') || (s[0] == '#') || (s[0] == '\n'))
+#define	MAXLINELEN	1024
+#define	MAXPATHLEN	1024
+
+#define	V4_PART_OF_V6(v6)	((v6)._S6_un._S6_u32[3])
+
+/* database file parameters */
+static const char *BW_LIMIT = "bw_limit";
+static const char *PRIORITY = "priority";
+static const char *LOCAL_IP_ADDR = "local_ip";
+static const char *REMOTE_IP_ADDR = "remote_ip";
+static const char *TRANSPORT = "transport";
+static const char *LOCAL_PORT = "local_port";
+static const char *DSFIELD = "dsfield";
+
+/*
+ * Open and lock the flowadm configuration file lock.  The lock is
+ * acquired as a reader (F_RDLCK) or writer (F_WRLCK).
+ */
+static int
+i_dladm_flow_lock_db(short type)
+{
+	int	lock_fd;
+	struct	flock lock;
+
+	if ((lock_fd = open(DLADM_FLOW_DB_LOCK, O_RDWR | O_CREAT | O_TRUNC,
+	    DLADM_FLOW_DB_PERMS)) < 0)
+		return (-1);
+
+	lock.l_type = type;
+	lock.l_whence = SEEK_SET;
+	lock.l_start = 0;
+	lock.l_len = 0;
+
+	if (fcntl(lock_fd, F_SETLKW, &lock) < 0) {
+		(void) close(lock_fd);
+		(void) unlink(DLADM_FLOW_DB_LOCK);
+		return (-1);
+	}
+	return (lock_fd);
+}
+
+/*
+ * Unlock and close the specified file.
+ */
+static void
+i_dladm_flow_unlock_db(int fd)
+{
+	struct flock lock;
+
+	if (fd < 0)
+		return;
+
+	lock.l_type = F_UNLCK;
+	lock.l_whence = SEEK_SET;
+	lock.l_start = 0;
+	lock.l_len = 0;
+
+	(void) fcntl(fd, F_SETLKW, &lock);
+	(void) close(fd);
+	(void) unlink(DLADM_FLOW_DB_LOCK);
+}
+
+/*
+ * Parse one line of the flowadm DB.
+ * Returns DLADM_STATUS_OK on success or a dladm_status_t error on failure.
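+ *
+ * Each line has the tab-separated form written by i_dladm_flow_fput_grp()
+ * below:
+ *
+ *	<flowname>	linkid=<id>	[bw_limit=<bps>] [priority=<level>]
+ *	[dsfield=<val:mask>] [local_ip=<addr[/len]>] [remote_ip=<addr[/len]>]
+ *	[transport=<proto>] [local_port=<port>]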
+ */ +dladm_status_t +dladm_flow_parse_db(char *line, dld_flowinfo_t *attr) +{ + char *token; + char *value, *name = NULL; + char *endp = NULL; + char *lasts = NULL; + dladm_status_t status = DLADM_STATUS_FLOW_DB_PARSE_ERR; + + bzero(attr, sizeof (*attr)); + + /* flow name */ + if ((token = strtok_r(line, " \t", &lasts)) == NULL) + goto done; + + if (strlcpy(attr->fi_flowname, token, MAXNAMELEN) >= MAXNAMELEN) + goto done; + + /* resource control and flow descriptor parameters */ + while ((token = strtok_r(NULL, " \t", &lasts)) != NULL) { + if ((name = strdup(token)) == NULL) + goto done; + + (void) strtok(name, "="); + value = strtok(NULL, "="); + if (value == NULL) + goto done; + + if (strcmp(name, "linkid") == 0) { + if ((attr->fi_linkid = + (uint32_t)strtol(value, &endp, 10)) == + DATALINK_INVALID_LINKID) + goto done; + + } else if (strcmp(name, BW_LIMIT) == 0) { + attr->fi_resource_props.mrp_mask |= + MRP_MAXBW; + attr->fi_resource_props.mrp_maxbw = + (uint64_t)strtol(value, &endp, 0); + + } else if (strcmp(name, PRIORITY) == 0) { + attr->fi_resource_props.mrp_mask |= MRP_PRIORITY; + status = dladm_str2pri(value, + &attr->fi_resource_props.mrp_priority); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (strcmp(name, DSFIELD) == 0) { + status = do_check_dsfield(value, + &attr->fi_flow_desc); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (strcmp(name, LOCAL_IP_ADDR) == 0) { + status = do_check_ip_addr(value, B_TRUE, + &attr->fi_flow_desc); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (strcmp(name, REMOTE_IP_ADDR) == 0) { + status = do_check_ip_addr(value, B_FALSE, + &attr->fi_flow_desc); + if (status != DLADM_STATUS_OK) + goto done; + + } else if (strcmp(name, TRANSPORT) == 0) { + attr->fi_flow_desc.fd_mask |= FLOW_IP_PROTOCOL; + attr->fi_flow_desc.fd_protocol = + (uint8_t)strtol(value, &endp, 0); + + } else if (strcmp(name, LOCAL_PORT) == 0) { + attr->fi_flow_desc.fd_mask |= FLOW_ULP_PORT_LOCAL; + attr->fi_flow_desc.fd_local_port = + (uint16_t)strtol(value, &endp, 10); + attr->fi_flow_desc.fd_local_port = + htons(attr->fi_flow_desc.fd_local_port); + } + free(name); + name = NULL; + } + if (attr->fi_linkid != DATALINK_INVALID_LINKID) + status = DLADM_STATUS_OK; +done: + free(name); + return (status); +} + +#define FPRINTF_ERR(fcall) if ((fcall) < 0) return (-1); + +/* + * Write the attribute of a group to the specified file. Returns 0 on + * success, -1 on failure. + */ +static int +i_dladm_flow_fput_grp(FILE *fp, dld_flowinfo_t *attr) +{ + + FPRINTF_ERR(fprintf(fp, "%s\tlinkid=%d\t", + attr->fi_flowname, attr->fi_linkid)); + + /* flow policy */ + if (attr->fi_resource_props.mrp_mask & MRP_MAXBW) + FPRINTF_ERR(fprintf(fp, "%s=%" PRIu64 "\t", BW_LIMIT, + attr->fi_resource_props.mrp_maxbw)); + + if (attr->fi_resource_props.mrp_mask & MRP_PRIORITY) + FPRINTF_ERR(fprintf(fp, "%s=%d\t", PRIORITY, + attr->fi_resource_props.mrp_priority)); + + /* flow descriptor */ + if (attr->fi_flow_desc.fd_mask & FLOW_IP_DSFIELD) + FPRINTF_ERR(fprintf(fp, "%s=%x:%x\t", DSFIELD, + attr->fi_flow_desc.fd_dsfield, + attr->fi_flow_desc.fd_dsfield_mask)); + + if (attr->fi_flow_desc.fd_mask & FLOW_IP_LOCAL) { + char abuf[INET6_ADDRSTRLEN], *ap; + struct in_addr ipaddr; + int prefix_len, prefix_max; + + if (attr->fi_flow_desc.fd_ipversion != 6) { + ipaddr.s_addr = + attr->fi_flow_desc. 
+ fd_local_addr._S6_un._S6_u32[3]; + + ap = inet_ntoa(ipaddr); + prefix_max = IP_ABITS; + } else { + (void) inet_ntop(AF_INET6, + &attr->fi_flow_desc.fd_local_addr, + abuf, INET6_ADDRSTRLEN); + + ap = abuf; + prefix_max = IPV6_ABITS; + } + (void) dladm_mask2prefixlen( + &attr->fi_flow_desc.fd_local_netmask, prefix_max, + &prefix_len); + + FPRINTF_ERR(fprintf(fp, "%s=%s/%d\t", LOCAL_IP_ADDR, + ap, prefix_len)); + } + if (attr->fi_flow_desc.fd_mask & FLOW_IP_REMOTE) { + char abuf[INET6_ADDRSTRLEN], *ap; + struct in_addr ipaddr; + int prefix_len, prefix_max; + + if (attr->fi_flow_desc.fd_ipversion != 6) { + ipaddr.s_addr = + attr->fi_flow_desc. + fd_remote_addr._S6_un._S6_u32[3]; + + ap = inet_ntoa(ipaddr); + prefix_max = IP_ABITS; + } else { + (void) inet_ntop(AF_INET6, + &(attr->fi_flow_desc.fd_remote_addr), + abuf, INET6_ADDRSTRLEN); + + ap = abuf; + prefix_max = IPV6_ABITS; + } + (void) dladm_mask2prefixlen( + &attr->fi_flow_desc.fd_remote_netmask, prefix_max, + &prefix_len); + + FPRINTF_ERR(fprintf(fp, "%s=%s/%d\t", REMOTE_IP_ADDR, + ap, prefix_len)); + } + if (attr->fi_flow_desc.fd_mask & FLOW_IP_PROTOCOL) + FPRINTF_ERR(fprintf(fp, "%s=%d\t", TRANSPORT, + attr->fi_flow_desc.fd_protocol)); + + if (attr->fi_flow_desc.fd_mask & FLOW_ULP_PORT_LOCAL) + FPRINTF_ERR(fprintf(fp, "%s=%d\t", LOCAL_PORT, + ntohs(attr->fi_flow_desc.fd_local_port))); + + FPRINTF_ERR(fprintf(fp, "\n")); + + return (0); + +} + +static dladm_status_t +i_dladm_flow_walk_rw_db(int (*fn)(void *, dld_flowinfo_t *), + void *arg, + const char *root) +{ + FILE *fp, *nfp; + int nfd, fn_rc, lock_fd; + char line[MAXLINELEN]; + dld_flowinfo_t attr; + char *db_file, *tmp_db_file; + char db_file_buf[MAXPATHLEN]; + char tmp_db_file_buf[MAXPATHLEN]; + dladm_status_t status = DLADM_STATUS_FLOW_DB_ERR; + + if (root == NULL) { + db_file = DLADM_FLOW_DB; + tmp_db_file = DLADM_FLOW_DB_TMP; + } else { + (void) snprintf(db_file_buf, MAXPATHLEN, "%s%s", root, + DLADM_FLOW_DB); + (void) snprintf(tmp_db_file_buf, MAXPATHLEN, "%s%s", root, + DLADM_FLOW_DB_TMP); + db_file = db_file_buf; + tmp_db_file = tmp_db_file_buf; + } + + if ((lock_fd = i_dladm_flow_lock_db(F_WRLCK)) < 0) + return (DLADM_STATUS_FLOW_DB_ERR); + + if ((fp = fopen(db_file, "r")) == NULL) { + i_dladm_flow_unlock_db(lock_fd); + return (DLADM_STATUS_FLOW_DB_OPEN_ERR); + } + + if ((nfd = open(tmp_db_file, O_WRONLY|O_CREAT|O_TRUNC, + DLADM_FLOW_DB_PERMS)) == -1) { + (void) fclose(fp); + i_dladm_flow_unlock_db(lock_fd); + return (DLADM_STATUS_FLOW_DB_OPEN_ERR); + } + + if ((nfp = fdopen(nfd, "w")) == NULL) { + (void) close(nfd); + (void) fclose(fp); + (void) unlink(tmp_db_file); + i_dladm_flow_unlock_db(lock_fd); + return (DLADM_STATUS_FLOW_DB_OPEN_ERR); + } + + while (fgets(line, MAXLINELEN, fp) != NULL) { + + /* skip comments */ + if (BLANK_LINE(line)) { + if (fputs(line, nfp) == EOF) + goto failed; + continue; + } + (void) strtok(line, " \n"); + + if ((status = dladm_flow_parse_db(line, &attr)) != + DLADM_STATUS_OK) + goto failed; + + fn_rc = fn(arg, &attr); + + switch (fn_rc) { + case -1: + /* failure, stop walking */ + goto failed; + case 0: + /* + * Success, write group attributes, which could + * have been modified by fn(). 
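+			 * (A callback return of 1 skips the entry instead,
+			 * which is how deletions are applied to the
+			 * rewritten file.)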
+			 */
+			if (i_dladm_flow_fput_grp(nfp, &attr) != 0)
+				goto failed;
+			break;
+		case 1:
+			/* skip current group */
+			break;
+		}
+	}
+	if (fchmod(nfd, DLADM_FLOW_DB_PERMS) == -1)
+		goto failed;
+
+	if (fchown(nfd, DLADM_FLOW_DB_OWNER, DLADM_FLOW_DB_GROUP) == -1)
+		goto failed;
+
+	if (fflush(nfp) == EOF)
+		goto failed;
+
+	(void) fclose(fp);
+	(void) fclose(nfp);
+
+	if (rename(tmp_db_file, db_file) == -1) {
+		(void) unlink(tmp_db_file);
+		i_dladm_flow_unlock_db(lock_fd);
+		return (DLADM_STATUS_FLOW_DB_ERR);
+	}
+	i_dladm_flow_unlock_db(lock_fd);
+	return (DLADM_STATUS_OK);
+
+failed:
+	(void) fclose(fp);
+	(void) fclose(nfp);
+	(void) unlink(tmp_db_file);
+	i_dladm_flow_unlock_db(lock_fd);
+
+	return (status);
+}
+
+/*
+ * Remove existing flow from DB.
+ */
+
+typedef struct remove_db_state {
+	dld_flowinfo_t	rs_newattr;
+	dld_flowinfo_t	rs_oldattr;
+	boolean_t	rs_found;
+} remove_db_state_t;
+
+static int
+i_dladm_flow_remove_db_fn(void *arg, dld_flowinfo_t *grp)
+{
+	remove_db_state_t	*state = (remove_db_state_t *)arg;
+	dld_flowinfo_t		*attr = &state->rs_newattr;
+
+	if ((strcmp(grp->fi_flowname, attr->fi_flowname)) != 0)
+		return (0);
+	else {
+		bcopy(grp, &state->rs_oldattr,
+		    sizeof (dld_flowinfo_t));
+		state->rs_found = B_TRUE;
+		return (1);
+	}
+}
+
+/* ARGSUSED */
+static int
+i_dladm_flow_remove_db(remove_db_state_t *state, const char *root)
+{
+	if (i_dladm_flow_walk_rw_db(i_dladm_flow_remove_db_fn, state, root)
+	    != 0)
+		return (-1);
+
+	if (!state->rs_found) {
+		errno = ENOENT;
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Create a flow in the DB.
+ */
+
+typedef struct modify_db_state {
+	dld_flowinfo_t	ms_newattr;
+	dld_flowinfo_t	ms_oldattr;
+	boolean_t	ms_found;
+} modify_db_state_t;
+
+static dladm_status_t
+i_dladm_flow_create_db(dld_flowinfo_t *attr, const char *root)
+{
+	FILE	*fp;
+	char	line[MAXLINELEN];
+	char	*db_file;
+	char	db_file_buf[MAXPATHLEN];
+	int	lock_fd;
+	dladm_status_t	status = DLADM_STATUS_OK;
+
+	if (root == NULL) {
+		db_file = DLADM_FLOW_DB;
+	} else {
+		(void) snprintf(db_file_buf, MAXPATHLEN, "%s%s", root,
+		    DLADM_FLOW_DB);
+		db_file = db_file_buf;
+	}
+
+	if ((lock_fd = i_dladm_flow_lock_db(F_WRLCK)) < 0)
+		return (DLADM_STATUS_FLOW_DB_ERR);
+
+	if ((fp = fopen(db_file, "r+")) == NULL &&
+	    (fp = fopen(db_file, "w")) == NULL) {
+		i_dladm_flow_unlock_db(lock_fd);
+		return (DLADM_STATUS_FLOW_DB_OPEN_ERR);
+	}
+
+	/* look for an existing group with the same flowname */
+	while (fgets(line, MAXLINELEN, fp) != NULL) {
+		char	*holder, *lasts;
+
+		/* skip comments */
+		if (BLANK_LINE(line))
+			continue;
+
+		/* ignore corrupted lines */
+		holder = strtok_r(line, " \t", &lasts);
+		if (holder == NULL)
+			continue;
+
+		/* flow id */
+		if (strcmp(holder, attr->fi_flowname) == 0) {
+			/* a group with this flow id already exists */
+			status = DLADM_STATUS_PERSIST_FLOW_EXISTS;
+			goto failed;
+		}
+	}
+	/*
+	 * If we get here, we've verified that no existing group with
+	 * the same flow id already exists.  It is now time to add the
+	 * new group to the DB.
+ */ + if (i_dladm_flow_fput_grp(fp, attr) != 0) + status = DLADM_STATUS_FLOW_DB_PARSE_ERR; + +failed: + (void) fclose(fp); + i_dladm_flow_unlock_db(lock_fd); + return (status); +} + +static dladm_status_t +i_dladm_flow_add(char *flowname, datalink_id_t linkid, flow_desc_t *flowdesc, + mac_resource_props_t *mrp) +{ + dld_ioc_addflow_t attr; + int fd; + + /* create flow */ + bzero(&attr, sizeof (attr)); + bcopy(flowdesc, &attr.af_flow_desc, sizeof (flow_desc_t)); + if (mrp != NULL) { + bcopy(mrp, &attr.af_resource_props, + sizeof (mac_resource_props_t)); + } + + (void) strlcpy(attr.af_name, flowname, sizeof (attr.af_name)); + attr.af_linkid = linkid; + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) + return (dladm_errno2status(errno)); + + if (ioctl(fd, DLDIOC_ADDFLOW, &attr) < 0) { + (void) close(fd); + return (dladm_errno2status(errno)); + } + + (void) close(fd); + + return (DLADM_STATUS_OK); +} + +static dladm_status_t +i_dladm_flow_remove(char *flowname) +{ + dld_ioc_removeflow_t attr; + int fd; + dladm_status_t status = DLADM_STATUS_OK; + + (void) strlcpy(attr.rf_name, flowname, + sizeof (attr.rf_name)); + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) + return (dladm_errno2status(errno)); + + if (ioctl(fd, DLDIOC_REMOVEFLOW, &attr) < 0) + status = dladm_errno2status(errno); + + (void) close(fd); + + return (status); +} + + +/* ARGSUSED */ +dladm_status_t +dladm_flow_add(datalink_id_t linkid, dladm_arg_list_t *attrlist, + dladm_arg_list_t *proplist, char *flowname, boolean_t tempop, + const char *root) +{ + dld_flowinfo_t db_attr; + flow_desc_t flowdesc; + mac_resource_props_t mrp; + dladm_status_t status; + + /* Extract flow attributes from attrlist */ + bzero(&flowdesc, sizeof (flow_desc_t)); + if (attrlist != NULL && (status = dladm_flow_attrlist_extract(attrlist, + &flowdesc)) != DLADM_STATUS_OK) { + return (status); + } + + /* Extract resource_ctl and cpu_list from proplist */ + bzero(&mrp, sizeof (mac_resource_props_t)); + if (proplist != NULL && (status = dladm_flow_proplist_extract(proplist, + &mrp)) != DLADM_STATUS_OK) { + return (status); + } + + /* Add flow in kernel */ + status = i_dladm_flow_add(flowname, linkid, &flowdesc, &mrp); + if (status != DLADM_STATUS_OK) + return (status); + + /* Add flow to DB */ + if (!tempop) { + bzero(&db_attr, sizeof (db_attr)); + bcopy(&flowdesc, &db_attr.fi_flow_desc, sizeof (flow_desc_t)); + (void) strlcpy(db_attr.fi_flowname, flowname, + sizeof (db_attr.fi_flowname)); + db_attr.fi_linkid = linkid; + + if ((status = i_dladm_flow_create_db(&db_attr, root)) != + DLADM_STATUS_OK) { + (void) i_dladm_flow_remove(flowname); + return (status); + } + /* set flow properties */ + if (proplist != NULL) { + status = i_dladm_set_flow_proplist_db(flowname, + proplist); + if (status != DLADM_STATUS_OK) { + (void) i_dladm_flow_remove(flowname); + return (status); + } + } + } + return (status); +} + +/* + * Remove a flow. 
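+ * For example, dladm_flow_remove("flow1", B_FALSE, NULL) removes both
+ * the active flow and its persistent configuration from the default
+ * DB; with tempop set to B_TRUE only the active flow is removed.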
+ */
+/* ARGSUSED */
+dladm_status_t
+dladm_flow_remove(char *flowname, boolean_t tempop,
+    const char *root)
+{
+	remove_db_state_t	state;
+	dladm_status_t		status = DLADM_STATUS_OK;
+	dladm_status_t		s = DLADM_STATUS_OK;
+
+	/* remove flow */
+	status = i_dladm_flow_remove(flowname);
+	if ((status != DLADM_STATUS_OK) &&
+	    (tempop || status != DLADM_STATUS_NOTFOUND))
+		goto done;
+
+	/* remove flow from DB */
+	if (!tempop) {
+		bzero(&state, sizeof (state));
+		(void) strlcpy(state.rs_newattr.fi_flowname, flowname,
+		    sizeof (state.rs_newattr.fi_flowname));
+		state.rs_found = B_FALSE;
+
+		/* flow DB */
+		if (i_dladm_flow_remove_db(&state, root) < 0) {
+			s = dladm_errno2status(errno);
+			goto done;
+		}
+
+		/* flow prop DB */
+		s = dladm_set_flowprop(flowname, NULL, NULL, 0,
+		    DLADM_OPT_PERSIST, NULL);
+	}
+
+done:
+	if (!tempop) {
+		if (s == DLADM_STATUS_OK) {
+			if (status == DLADM_STATUS_NOTFOUND)
+				status = s;
+		} else {
+			if (s != DLADM_STATUS_NOTFOUND)
+				status = s;
+		}
+	}
+	return (status);
+}
+
+/*
+ * Get an existing flow in the DB.
+ */
+
+typedef struct get_db_state {
+	int		(*gs_fn)(dladm_flow_attr_t *, void *);
+	void		*gs_arg;
+	datalink_id_t	gs_linkid;
+} get_db_state_t;
+
+/*
+ * For each flow which matches the linkid, copy all flow information
+ * to a new dladm_flow_attr_t structure and call the provided
+ * function. This is used to display persistent flows from
+ * the database.
+ */
+
+static int
+i_dladm_flow_get_db_fn(void *arg, dld_flowinfo_t *grp)
+{
+	get_db_state_t		*state = (get_db_state_t *)arg;
+	dladm_flow_attr_t	attr;
+
+	if (grp->fi_linkid == state->gs_linkid) {
+		attr.fa_linkid = state->gs_linkid;
+		bcopy(grp->fi_flowname, &attr.fa_flowname,
+		    sizeof (attr.fa_flowname));
+		bcopy(&grp->fi_flow_desc, &attr.fa_flow_desc,
+		    sizeof (attr.fa_flow_desc));
+		bcopy(&grp->fi_resource_props, &attr.fa_resource_props,
+		    sizeof (attr.fa_resource_props));
+		(void) state->gs_fn(&attr, state->gs_arg);
+	}
+	return (0);
+}
+
+/*
+ * Walk through the flows defined on the system and for each flow
+ * invoke <fn>(<arg>, <flow>);
+ * Currently used for show-flow.
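+ * A caller sketch (hypothetical callback name):
+ *
+ *	static int
+ *	show_one(dladm_flow_attr_t *attr, void *arg)
+ *	{
+ *		(void) printf("%s\n", attr->fa_flowname);
+ *		return (DLADM_WALK_CONTINUE);
+ *	}
+ *
+ *	(void) dladm_walk_flow(show_one, linkid, NULL, B_FALSE);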
+ */
+/* ARGSUSED */
+dladm_status_t
+dladm_walk_flow(int (*fn)(dladm_flow_attr_t *, void *),
+    datalink_id_t linkid, void *arg, boolean_t persist)
+{
+	dld_flowinfo_t		*flow;
+	int			i, bufsize, fd;
+	dld_ioc_walkflow_t	*ioc = NULL;
+	dladm_flow_attr_t	attr;
+	dladm_status_t		status = DLADM_STATUS_OK;
+
+	if (fn == NULL)
+		return (DLADM_STATUS_BADARG);
+
+	if (persist) {
+		get_db_state_t state;
+
+		bzero(&state, sizeof (state));
+
+		state.gs_linkid = linkid;
+		state.gs_fn = fn;
+		state.gs_arg = arg;
+		status = i_dladm_flow_walk_rw_db(i_dladm_flow_get_db_fn,
+		    &state, NULL);
+		if (status != DLADM_STATUS_OK)
+			return (status);
+		/*
+		 * Return here: the ioctl path below (and its fd) is
+		 * not used for a persistent walk.
+		 */
+		return (DLADM_STATUS_OK);
+	} else {
+		if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0)
+			return (dladm_errno2status(errno));
+
+		bufsize = MIN_INFO_SIZE;
+		if ((ioc = calloc(1, bufsize)) == NULL) {
+			status = dladm_errno2status(errno);
+			(void) close(fd);
+			return (status);
+		}
+
+		ioc->wf_linkid = linkid;
+		ioc->wf_len = bufsize - sizeof (*ioc);
+
+		while (ioctl(fd, DLDIOC_WALKFLOW, ioc) < 0) {
+			if (errno == ENOSPC) {
+				bufsize *= 2;
+				ioc = realloc(ioc, bufsize);
+				if (ioc != NULL) {
+					ioc->wf_linkid = linkid;
+					ioc->wf_len = bufsize - sizeof (*ioc);
+					continue;
+				}
+			}
+			goto bail;
+		}
+
+		flow = (dld_flowinfo_t *)(void *)(ioc + 1);
+		for (i = 0; i < ioc->wf_nflows; i++, flow++) {
+			bzero(&attr, sizeof (attr));
+
+			attr.fa_linkid = flow->fi_linkid;
+			bcopy(&flow->fi_flowname, &attr.fa_flowname,
+			    sizeof (attr.fa_flowname));
+			bcopy(&flow->fi_flow_desc, &attr.fa_flow_desc,
+			    sizeof (attr.fa_flow_desc));
+			bcopy(&flow->fi_resource_props, &attr.fa_resource_props,
+			    sizeof (attr.fa_resource_props));
+
+			if (fn(&attr, arg) == DLADM_WALK_TERMINATE)
+				break;
+		}
+	}
+
+bail:
+	free(ioc);
+	(void) close(fd);
+	return (status);
+}
+
+dladm_status_t
+dladm_flow_init(void)
+{
+	flow_desc_t		flowdesc;
+	datalink_id_t		linkid;
+	dladm_status_t		s, status = DLADM_STATUS_OK;
+	char			name[MAXNAMELEN];
+	char			line[MAXLINELEN];
+	dld_flowinfo_t		attr;
+	FILE			*fp;
+
+	if ((fp = fopen(DLADM_FLOW_DB, "r")) == NULL)
+		return (DLADM_STATUS_DB_NOTFOUND);
+
+	while (fgets(line, MAXLINELEN, fp) != NULL) {
+		/* skip comments */
+		if (BLANK_LINE(line))
+			continue;
+
+		(void) strtok(line, " \n");
+
+		s = dladm_flow_parse_db(line, &attr);
+		if (s != DLADM_STATUS_OK) {
+			status = s;
+			continue;
+		}
+		bzero(&flowdesc, sizeof (flowdesc));
+		bcopy(&attr.fi_flow_desc, &flowdesc, sizeof (flow_desc_t));
+		(void) strlcpy(name, attr.fi_flowname,
+		    sizeof (name));
+		linkid = attr.fi_linkid;
+
+		s = i_dladm_flow_add(name, linkid, &flowdesc, NULL);
+		if (s != DLADM_STATUS_OK)
+			status = s;
+	}
+	s = i_dladm_init_flowprop_db();
+	if (s != DLADM_STATUS_OK)
+		status = s;
+
+	(void) fclose(fp);
+	return (status);
+}
+
+dladm_status_t
+dladm_prefixlen2mask(int prefixlen, int maxlen, uchar_t *mask)
+{
+	if (prefixlen < 0 || prefixlen > maxlen)
+		return (DLADM_STATUS_BADARG);
+
+	while (prefixlen > 0) {
+		if (prefixlen >= 8) {
+			*mask++ = 0xFF;
+			prefixlen -= 8;
+			continue;
+		}
+		*mask |= 1 << (8 - prefixlen);
+		prefixlen--;
+	}
+	return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_mask2prefixlen(in6_addr_t *mask, int plen, int *prefixlen)
+{
+	int	bits;
+	int	i, end;
+
+	switch (plen) {
+	case IP_ABITS:
+		end = 3;
+		break;
+	case IPV6_ABITS:
+		end = 0;
+		break;
+	default:
+		return (DLADM_STATUS_BADARG);
+	}
+
+	for (i = 3; i >= end; i--) {
+		if (mask->_S6_un._S6_u32[i] == 0) {
+			plen -= 32;
+			continue;
+		}
+		bits = ffs(ntohl(mask->_S6_un._S6_u32[i])) - 1;
+		if (bits == 0)
+			break;
+		plen -= bits;
+	}
+	*prefixlen = plen;
+	return (DLADM_STATUS_OK);
+}
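A round-trip sketch for the two prefix helpers above (assumed usage: the
caller must zero the mask buffer first, since dladm_prefixlen2mask() only
ORs bits into place, and for IPv4 the address occupies _S6_u32[3], matching
the convention used earlier in this file):

	in6_addr_t mask;
	int plen;

	bzero(&mask, sizeof (mask));
	(void) dladm_prefixlen2mask(24, IP_ABITS,
	    (uchar_t *)&mask._S6_un._S6_u32[3]);
	(void) dladm_mask2prefixlen(&mask, IP_ABITS, &plen);
	/* plen is now 24 */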
diff --git a/usr/src/lib/libdladm/common/libdlflow.h b/usr/src/lib/libdladm/common/libdlflow.h
new file mode 100644
index 0000000000..d35631ba4b
--- /dev/null
+++ b/usr/src/lib/libdladm/common/libdlflow.h
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBDLFLOW_H
+#define	_LIBDLFLOW_H
+
+/*
+ * This file includes structures, macros and routines used by general
+ * flow administration.
+ */
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <sys/mac_flow.h>
+#include <sys/dld.h>
+#include <sys/param.h>
+#include <sys/mac.h>
+#include <libdladm.h>
+#include <libdladm_impl.h>
+
+#ifdef	__cplusplus
extern "C" {
+#endif
+
+typedef struct dladm_flow_attr {
+	datalink_id_t		fa_linkid;
+	char			fa_flowname[MAXNAMELEN];
+	flow_desc_t		fa_flow_desc;
+	mac_resource_props_t	fa_resource_props;
+	uint64_t		fa_mask;
+	int			fa_nattr;
+} dladm_flow_attr_t;
+
+extern dladm_status_t	dladm_flow_add(datalink_id_t, dladm_arg_list_t *,
+			    dladm_arg_list_t *, char *, boolean_t,
+			    const char *);
+extern dladm_status_t	dladm_flow_remove(char *, boolean_t, const char *);
+extern dladm_status_t	dladm_flow_init(void);
+
+extern dladm_status_t	dladm_flow_parse_db(char *, dld_flowinfo_t *);
+extern dladm_status_t	dladm_walk_flow(int (*)(dladm_flow_attr_t *,
+			    void *), datalink_id_t, void *, boolean_t);
+extern dladm_status_t	dladm_flow_info(const char *, dladm_flow_attr_t *);
+
+extern dladm_status_t	dladm_set_flowprop(const char *, const char *,
+			    char **, uint_t, uint_t, char **);
+extern dladm_status_t	dladm_get_flowprop(const char *, uint32_t,
+			    const char *, char **, uint_t *);
+extern dladm_status_t	dladm_walk_flowprop(int (*)(void *, const char *),
+			    const char *, void *);
+
+extern void		dladm_flow_attr_mask(uint64_t, dladm_flow_attr_t *);
+extern dladm_status_t	dladm_flow_attr_check(dladm_arg_list_t *);
+extern dladm_status_t	dladm_prefixlen2mask(int, int, uchar_t *);
+extern dladm_status_t	dladm_mask2prefixlen(in6_addr_t *, int, int *);
+extern char		*dladm_proto2str(uint8_t);
+extern uint8_t		dladm_str2proto(const char *);
+
+extern void		dladm_flow_attr_ip2str(dladm_flow_attr_t *,
+			    char *, size_t);
+extern void		dladm_flow_attr_proto2str(dladm_flow_attr_t *,
+			    char *, size_t);
+extern void		dladm_flow_attr_port2str(dladm_flow_attr_t *,
+			    char *, size_t);
+extern void		dladm_flow_attr_dsfield2str(dladm_flow_attr_t *,
+			    char *, size_t);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LIBDLFLOW_H */
diff --git a/usr/src/lib/libdladm/common/libdlflow_impl.h b/usr/src/lib/libdladm/common/libdlflow_impl.h
new file mode 100644
index 0000000000..09b6d55bc1
--- /dev/null
+++ 
b/usr/src/lib/libdladm/common/libdlflow_impl.h @@ -0,0 +1,138 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBDLFLOW_IMPL_H +#define _LIBDLFLOW_IMPL_H + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/mac.h> +#include <libdladm.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct fprop_desc; +struct fattr_desc; + +typedef dladm_status_t fpd_getf_t(const char *, char **, uint_t *); +typedef dladm_status_t fpd_setf_t(const char *, val_desc_t *, uint_t); +typedef dladm_status_t fpd_checkf_t(struct fprop_desc *, char **, + uint_t, val_desc_t **); + +typedef struct fprop_desc { + char *pd_name; + val_desc_t pd_defval; + val_desc_t *pd_modval; + uint_t pd_nmodval; + boolean_t pd_temponly; + fpd_setf_t *pd_set; + fpd_getf_t *pd_getmod; + fpd_getf_t *pd_get; + fpd_checkf_t *pd_check; +} fprop_desc_t; + +typedef struct prop_table { + fprop_desc_t *pt_table; + uint_t pt_size; +} prop_table_t; + +typedef enum { + DLADM_PROP_VAL_CURRENT = 1, + DLADM_PROP_VAL_DEFAULT, + DLADM_PROP_VAL_MODIFIABLE, + DLADM_PROP_VAL_PERSISTENT +} prop_type_t; + +typedef dladm_status_t fad_checkf_t(char *, flow_desc_t *); + +extern dladm_status_t do_check_ip_addr(char *, boolean_t, flow_desc_t *); +extern dladm_status_t do_check_dsfield(char *, flow_desc_t *); + +typedef struct fattr_desc { + const char *ad_name; + fad_checkf_t *ad_check; +} fattr_desc_t; + +extern dladm_status_t i_dladm_get_prop_temp(const char *, prop_type_t, + const char *, char **, uint_t *, prop_table_t *); +extern dladm_status_t i_dladm_set_prop_temp(const char *, const char *, + char **, uint_t, uint_t, char **, prop_table_t *); +extern boolean_t i_dladm_is_prop_temponly(const char *prop_name, + char **, prop_table_t *); +/* + * Data structures used for implementing persistent properties + */ +typedef struct prop_val { + const char *lv_name; + struct prop_val *lv_nextval; +} prop_val_t; + +typedef struct prop_db_info { + const char *li_name; + struct prop_db_info *li_nextprop; + struct prop_val *li_val; +} prop_db_info_t; + +typedef struct prop_db_state prop_db_state_t; + +typedef boolean_t (*prop_db_op_t)(prop_db_state_t *, + char *, prop_db_info_t *, dladm_status_t *); + +typedef dladm_status_t (*prop_db_initop_t)(const char *, const char *, + char **, uint_t, uint_t, char **); + +struct prop_db_state { + prop_db_op_t ls_op; + const char *ls_name; + const char *ls_propname; + char **ls_propval; + uint_t *ls_valcntp; + prop_db_initop_t ls_initop; +}; + +extern boolean_t process_prop_set(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp); +extern boolean_t process_prop_get(prop_db_state_t *lsp, 
char *buf, + prop_db_info_t *listp, dladm_status_t *statusp); +extern boolean_t process_prop_init(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp); +extern dladm_status_t process_prop_db(void *arg, FILE *fp, FILE *nfp); + +extern dladm_status_t i_dladm_init_flowprop_db(void); +extern dladm_status_t i_dladm_set_flow_proplist_db(char *, + dladm_arg_list_t *); +extern dladm_status_t i_dladm_flow_check_restriction(datalink_id_t, + flow_desc_t *, mac_resource_props_t *, boolean_t); + +extern dladm_status_t dladm_flow_attrlist_extract(dladm_arg_list_t *, + flow_desc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBDLFLOW_IMPL_H */ diff --git a/usr/src/lib/libdladm/common/libdllink.c b/usr/src/lib/libdladm/common/libdllink.c index 8deed6fe76..5698409442 100644 --- a/usr/src/lib/libdladm/common/libdllink.c +++ b/usr/src/lib/libdladm/common/libdllink.c @@ -62,6 +62,50 @@ i_dladm_info(int fd, const datalink_id_t linkid, dladm_attr_t *dap) return (DLADM_STATUS_OK); } +static dladm_status_t +dladm_usagelog(dladm_logtype_t type, dld_ioc_usagelog_t *log_info) +{ + int fd; + + fd = open(DLD_CONTROL_DEV, O_RDWR); + if (fd < 0) + return (DLADM_STATUS_IOERR); + + if (type == DLADM_LOGTYPE_FLOW) + log_info->ul_type = MAC_LOGTYPE_FLOW; + else + log_info->ul_type = MAC_LOGTYPE_LINK; + + if (ioctl(fd, DLDIOC_USAGELOG, log_info) < 0) { + (void) close(fd); + return (DLADM_STATUS_IOERR); + } + (void) close(fd); + return (DLADM_STATUS_OK); +} + +dladm_status_t +dladm_start_usagelog(dladm_logtype_t type, uint_t interval) +{ + dld_ioc_usagelog_t log_info; + + log_info.ul_onoff = B_TRUE; + log_info.ul_interval = interval; + + return (dladm_usagelog(type, &log_info)); +} + +dladm_status_t +dladm_stop_usagelog(dladm_logtype_t type) +{ + dld_ioc_usagelog_t log_info; + + log_info.ul_onoff = B_FALSE; + log_info.ul_interval = 0; + + return (dladm_usagelog(type, &log_info)); +} + struct i_dladm_walk_arg { dladm_walkcb_t *fn; void *arg; @@ -96,6 +140,112 @@ dladm_walk(dladm_walkcb_t *fn, void *arg, datalink_class_t class, class, dmedia, flags)); } +#define MAXGRPPERLINK 64 + +int +dladm_walk_hwgrp(datalink_id_t linkid, void *arg, + boolean_t (*fn)(void *, dladm_hwgrp_attr_t *)) +{ + int fd, bufsize, ret; + int nhwgrp = MAXGRPPERLINK; + dld_ioc_hwgrpget_t *iomp = NULL; + + if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) + return (-1); + + bufsize = sizeof (dld_ioc_hwgrpget_t) + + nhwgrp * sizeof (dld_hwgrpinfo_t); + + if ((iomp = (dld_ioc_hwgrpget_t *)calloc(1, bufsize)) == NULL) + return (-1); + + iomp->dih_size = nhwgrp * sizeof (dld_hwgrpinfo_t); + iomp->dih_linkid = linkid; + + ret = ioctl(fd, DLDIOC_GETHWGRP, iomp); + if (ret == 0) { + int i; + dld_hwgrpinfo_t *dhip; + dladm_hwgrp_attr_t attr; + + dhip = (dld_hwgrpinfo_t *)(iomp + 1); + for (i = 0; i < iomp->dih_n_groups; i++) { + bzero(&attr, sizeof (attr)); + + (void) strlcpy(attr.hg_link_name, + dhip->dhi_link_name, sizeof (attr.hg_link_name)); + attr.hg_grp_num = dhip->dhi_grp_num; + attr.hg_grp_type = dhip->dhi_grp_type; + attr.hg_n_rings = dhip->dhi_n_rings; + attr.hg_n_clnts = dhip->dhi_n_clnts; + (void) strlcpy(attr.hg_client_names, + dhip->dhi_clnts, sizeof (attr.hg_client_names)); + + if (!(*fn)(arg, &attr)) + break; + dhip++; + } + } + free(iomp); + (void) close(fd); + return (ret); +} + +/* + * Invoke the specified callback for each MAC address entry defined on + * the specified device. 
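+ * A callback sketch (hypothetical) that counts the in-use slots;
+ * returning B_FALSE from the callback terminates the walk early:
+ *
+ *	static boolean_t
+ *	count_used(void *arg, dladm_macaddr_attr_t *attr)
+ *	{
+ *		if (attr->ma_flags & DLADM_MACADDR_USED)
+ *			(*(uint_t *)arg)++;
+ *		return (B_TRUE);
+ *	}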
+ */ +int +dladm_walk_macaddr(datalink_id_t linkid, void *arg, + boolean_t (*fn)(void *, dladm_macaddr_attr_t *)) +{ + int fd, bufsize, ret; + int nmacaddr = 1024; + dld_ioc_macaddrget_t *iomp = NULL; + + if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) + return (-1); + + bufsize = sizeof (dld_ioc_macaddrget_t) + + nmacaddr * sizeof (dld_macaddrinfo_t); + + if ((iomp = (dld_ioc_macaddrget_t *)calloc(1, bufsize)) == NULL) + return (-1); + + iomp->dig_size = nmacaddr * sizeof (dld_macaddrinfo_t); + iomp->dig_linkid = linkid; + + ret = ioctl(fd, DLDIOC_MACADDRGET, iomp); + if (ret == 0) { + int i; + dld_macaddrinfo_t *dmip; + dladm_macaddr_attr_t attr; + + dmip = (dld_macaddrinfo_t *)(iomp + 1); + for (i = 0; i < iomp->dig_count; i++) { + bzero(&attr, sizeof (attr)); + + attr.ma_slot = dmip->dmi_slot; + attr.ma_flags = 0; + if (dmip->dmi_flags & DLDIOCMACADDR_USED) + attr.ma_flags |= DLADM_MACADDR_USED; + bcopy(dmip->dmi_addr, attr.ma_addr, + dmip->dmi_addrlen); + attr.ma_addrlen = dmip->dmi_addrlen; + (void) strlcpy(attr.ma_client_name, + dmip->dmi_client_name, MAXNAMELEN); + attr.ma_client_linkid = dmip->dma_client_linkid; + + if (!(*fn)(arg, &attr)) + break; + dmip++; + } + } + free(iomp); + (void) close(fd); + return (ret); +} + /* * These routines are used by administration tools such as dladm(1M) to * iterate through the list of MAC interfaces @@ -253,84 +403,22 @@ dladm_linkduplex2str(link_duplex_t duplex, char *buf) /* * Set zoneid of a given link. Note that this function takes a link name * argument instead of a linkid, because a data-link (and its linkid) could - * be created implicitly as the result of this function. For example, a VLAN - * could be created if a VLAN PPA hack name is assigned to an exclusive - * non-global zone. + * be created implicitly as the result of this function. */ dladm_status_t dladm_setzid(const char *dlname, char *zone_name) { datalink_id_t linkid; - char *val; - char **prop_val; - char link[MAXLINKNAMELEN]; - uint_t ppa; - char dev[DLPI_LINKNAME_MAX]; - int valsize; dladm_status_t status = DLADM_STATUS_OK; - char *prop_name = "zone"; - boolean_t needfree = B_FALSE; - char delim = ':'; /* If the link does not exist, it is a ppa-hacked vlan. */ status = dladm_name2info(dlname, &linkid, NULL, NULL, NULL); - switch (status) { - case DLADM_STATUS_NOTFOUND: - if (strlen(dlname) > MAXLINKNAMELEN) - return (DLADM_STATUS_BADVAL); - - if (strlen(zone_name) > ZONENAME_MAX) - return (DLADM_STATUS_BADVAL); - - status = dladm_parselink(dlname, dev, &ppa); - if (status != DLADM_STATUS_OK) - return (status); - - ppa = (uint_t)DLS_PPA2INST(ppa); - (void) snprintf(link, sizeof (link), "%s%d", dev, ppa); - - status = dladm_name2info(link, &linkid, NULL, NULL, NULL); - if (status != DLADM_STATUS_OK) - return (status); - - /* - * Since the link does not exist as yet, we've to pass the - * link name too as part of data, so that the kernel can - * create the link. Hence, we're packing the zone_name and - * the link name into val. - */ - valsize = ZONENAME_MAX + MAXLINKNAMELEN + 1; - val = malloc(valsize); - if (val == NULL) - return (DLADM_STATUS_NOMEM); - needfree = B_TRUE; - - (void) snprintf(val, valsize, "%s%c%s", zone_name, - delim, dlname); - - break; - case DLADM_STATUS_OK: - /* - * The link exists, so only the zone_name is being passed as - * val. 
We could also pass zone_name + linkname like in the - * previous case just to maintain consistency, but other calls - * like set_linkprop() in dladm.c [which is called when we run - * 'dladm set-linkprop -p zone <linkname>' at the command line] - * pass in the value entered at the command line [which is zone - * name] as val. - */ - val = zone_name; - break; - default: - return (DLADM_STATUS_FAILED); - } + if (status != DLADM_STATUS_OK) + return (status); - prop_val = &val; - status = dladm_set_linkprop(linkid, prop_name, prop_val, 1, + status = dladm_set_linkprop(linkid, "zone", &zone_name, 1, DLADM_OPT_ACTIVE); - if (needfree) - free(val); return (status); } @@ -958,86 +1046,6 @@ done: } dladm_status_t -dladm_get_single_mac_stat(datalink_id_t linkid, const char *name, uint8_t type, - void *val) -{ - char module[DLPI_LINKNAME_MAX]; - uint_t instance; - char link[DLPI_LINKNAME_MAX]; - dladm_status_t status; - uint32_t flags, media; - kstat_ctl_t *kcp; - kstat_t *ksp; - dladm_phys_attr_t dpap; - - if ((status = dladm_datalink_id2info(linkid, &flags, NULL, &media, - link, DLPI_LINKNAME_MAX)) != DLADM_STATUS_OK) - return (status); - - if (media != DL_ETHER) - return (DLADM_STATUS_LINKINVAL); - - status = dladm_phys_info(linkid, &dpap, DLADM_OPT_PERSIST); - - if (status != DLADM_STATUS_OK) - return (status); - - status = dladm_parselink(dpap.dp_dev, module, &instance); - - if (status != DLADM_STATUS_OK) - return (status); - - if ((kcp = kstat_open()) == NULL) - return (dladm_errno2status(errno)); - - /* - * The kstat query could fail if the underlying MAC - * driver was already detached. - */ - if ((ksp = kstat_lookup(kcp, module, instance, "mac")) == NULL && - (ksp = kstat_lookup(kcp, module, instance, NULL)) == NULL) - goto bail; - - if (kstat_read(kcp, ksp, NULL) == -1) - goto bail; - - if (dladm_kstat_value(ksp, name, type, val) < 0) - goto bail; - - (void) kstat_close(kcp); - return (DLADM_STATUS_OK); -bail: - (void) kstat_close(kcp); - return (dladm_errno2status(errno)); - -} - -int -dladm_kstat_value(kstat_t *ksp, const char *name, uint8_t type, void *buf) -{ - kstat_named_t *knp; - - if ((knp = kstat_data_lookup(ksp, (char *)name)) == NULL) - return (-1); - - if (knp->data_type != type) - return (-1); - - switch (type) { - case KSTAT_DATA_UINT64: - *(uint64_t *)buf = knp->value.ui64; - break; - case KSTAT_DATA_UINT32: - *(uint32_t *)buf = knp->value.ui32; - break; - default: - return (-1); - } - - return (0); -} - -dladm_status_t dladm_parselink(const char *dev, char *provider, uint_t *ppa) { ifspec_t ifsp; diff --git a/usr/src/lib/libdladm/common/libdllink.h b/usr/src/lib/libdladm/common/libdllink.h index ea51087a83..29d078470c 100644 --- a/usr/src/lib/libdladm/common/libdllink.h +++ b/usr/src/lib/libdladm/common/libdllink.h @@ -31,17 +31,19 @@ * link administration (i.e. not limited to one specific type of link). 
*/ +#include <stdio.h> #include <sys/types.h> #include <sys/param.h> #include <libdladm.h> -#include <kstat.h> +#include <libdladm_impl.h> +#include <sys/mac_flow.h> #ifdef __cplusplus extern "C" { #endif typedef struct dladm_attr { - uint_t da_max_sdu; + uint_t da_max_sdu; } dladm_attr_t; typedef struct dladm_phys_attr { @@ -86,6 +88,32 @@ typedef int dladm_secobj_class_t; typedef int (dladm_walkcb_t)(const char *, void *); +/* possible flags for ma_flags below */ +#define DLADM_MACADDR_USED 0x1 + +typedef enum { + DLADM_HWGRP_TYPE_RX = 0x1, + DLADM_HWGRP_TYPE_TX +} dladm_hwgrp_type_t; + +typedef struct dladm_hwgrp_attr { + char hg_link_name[MAXLINKNAMELEN]; + uint_t hg_grp_num; + dladm_hwgrp_type_t hg_grp_type; + uint_t hg_n_rings; + uint_t hg_n_clnts; + char hg_client_names[MAXCLIENTNAMELEN]; +} dladm_hwgrp_attr_t; + +typedef struct dladm_macaddr_attr { + uint_t ma_slot; + uint_t ma_flags; + uchar_t ma_addr[MAXMACADDRLEN]; + uint_t ma_addrlen; + char ma_client_name[MAXNAMELEN]; + datalink_id_t ma_client_linkid; +} dladm_macaddr_attr_t; + extern dladm_status_t dladm_walk(dladm_walkcb_t *, void *, datalink_class_t, datalink_media_t, uint32_t); extern dladm_status_t dladm_mac_walk(dladm_walkcb_t *, void *); @@ -148,12 +176,19 @@ extern dladm_status_t dladm_phys_delete(datalink_id_t); extern dladm_status_t dladm_phys_info(datalink_id_t, dladm_phys_attr_t *, uint32_t); -extern dladm_status_t dladm_get_single_mac_stat(datalink_id_t, const char *, - uint8_t, void *); -extern int dladm_kstat_value(kstat_t *, const char *, uint8_t, - void *); extern dladm_status_t dladm_parselink(const char *, char *, uint_t *); +extern int dladm_walk_macaddr(datalink_id_t, void *, + boolean_t (*)(void *, dladm_macaddr_attr_t *)); +extern int dladm_walk_hwgrp(datalink_id_t, void *, + boolean_t (*)(void *, dladm_hwgrp_attr_t *)); + +extern dladm_status_t dladm_link_get_proplist(datalink_id_t, + dladm_arg_list_t **); + +extern dladm_status_t i_dladm_set_link_proplist_db(char *, + dladm_arg_list_t *); + #ifdef __cplusplus } #endif diff --git a/usr/src/lib/libdladm/common/libdlstat.c b/usr/src/lib/libdladm/common/libdlstat.c new file mode 100644 index 0000000000..1990d27c67 --- /dev/null +++ b/usr/src/lib/libdladm/common/libdlstat.c @@ -0,0 +1,684 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <err.h> +#include <errno.h> +#include <kstat.h> +#include <unistd.h> +#include <signal.h> +#include <sys/dld.h> + +#include <libdllink.h> +#include <libdlflow.h> +#include <libdlstat.h> + +/* + * x86 <sys/regs> ERR conflicts with <curses.h> ERR. 
+ * Include curses.h last.
+ */
+#if defined(ERR)
+#undef ERR
+#endif
+#include <curses.h>
+
+struct flowlist {
+	char		flowname[MAXNAMELEN];
+	datalink_id_t	linkid;
+	uint64_t	ifspeed;
+	boolean_t	first;
+	boolean_t	display;
+	pktsum_t	prevstats;
+	pktsum_t	diffstats;
+};
+
+static	int	maxx, maxy, redraw = 0;
+static	volatile uint_t handle_resize = 0, handle_break = 0;
+
+pktsum_t	totalstats;
+struct flowlist	*stattable = NULL;
+static int	statentry = -1, maxstatentries = 0;
+
+#define	STATGROWSIZE	16
+
+
+/*
+ * Search for flowlist entry in stattable which matches
+ * the flowname and linkid. If no match is found, use
+ * next available slot. If no slots are available,
+ * reallocate table with more slots.
+ *
+ * Return: *flowlist of matching flow
+ *         NULL if realloc fails
+ */
+
+static struct flowlist *
+findstat(const char *flowname, datalink_id_t linkid)
+{
+	int match = 0;
+	struct flowlist *flist;
+
+	/* Look for match in the stattable */
+	for (match = 0, flist = stattable;
+	    match <= statentry;
+	    match++, flist++) {
+
+		if (flist == NULL)
+			break;
+		/* match the flowname */
+		if (flowname != NULL) {
+			if (strncmp(flowname, flist->flowname, MAXNAMELEN)
+			    == 0)
+				return (flist);
+		/* match the linkid */
+		} else {
+			if (linkid == flist->linkid)
+				return (flist);
+		}
+	}
+
+	/*
+	 * No match found in the table. Store statistics in the next slot.
+	 * If necessary, make room for this entry.
+	 */
+	statentry++;
+	if ((maxstatentries == 0) || (maxstatentries == statentry)) {
+		maxstatentries += STATGROWSIZE;
+		stattable = realloc(stattable,
+		    maxstatentries * sizeof (struct flowlist));
+		if (stattable == NULL) {
+			perror("realloc");
+			return (struct flowlist *)(NULL);
+		}
+	}
+	flist = &stattable[statentry];
+	bzero(flist, sizeof (struct flowlist));
+	flist->first = B_TRUE;
+
+	if (flowname != NULL)
+		(void) strncpy(flist->flowname, flowname, MAXNAMELEN);
+	flist->linkid = linkid;
+	return (flist);
+}
+
+static void
+print_flow_stats(struct flowlist *flist)
+{
+	struct flowlist *fcurr;
+	double ikbs, okbs;
+	double ipks, opks;
+	double dlt;
+	int fcount;
+	static boolean_t first = B_TRUE;
+
+	if (first) {
+		first = B_FALSE;
+		(void) printw("please wait...\n");
+		return;
+	}
+
+	for (fcount = 0, fcurr = flist;
+	    fcount <= statentry;
+	    fcount++, fcurr++) {
+		if (fcurr->flowname[0] != '\0' && fcurr->display) {
+			char linkname[MAXNAMELEN];
+
+			(void) dladm_datalink_id2info(fcurr->linkid, NULL, NULL,
+			    NULL, linkname, sizeof (linkname));
+			dlt = (double)fcurr->diffstats.snaptime/(double)NANOSEC;
+			ikbs = fcurr->diffstats.rbytes * 8 / dlt / 1024;
+			okbs = fcurr->diffstats.obytes * 8 / dlt / 1024;
+			ipks = fcurr->diffstats.ipackets / dlt;
+			opks = fcurr->diffstats.opackets / dlt;
+			(void) printw("%-15.15s", fcurr->flowname);
+			(void) printw("%-10.10s", linkname);
+			(void) printw("%9.2f %9.2f %9.2f %9.2f ",
+			    ikbs, okbs, ipks, opks);
+			(void) printw("\n");
+		}
+	}
+}
+
+/*ARGSUSED*/
+static int
+flow_kstats(dladm_flow_attr_t *attr, void *arg)
+{
+	kstat_ctl_t	*kcp = (kstat_ctl_t *)arg;
+	kstat_t		*ksp;
+	struct flowlist	*flist;
+	pktsum_t	currstats, *prevstats, *diffstats;
+
+	flist = findstat(attr->fa_flowname, attr->fa_linkid);
+	if (flist != NULL) {
+		prevstats = &flist->prevstats;
+		diffstats = &flist->diffstats;
+	} else {
+		return (DLADM_STATUS_FAILED);
+	}
+
+	/* lookup kstat entry */
+	ksp = dladm_kstat_lookup(kcp, NULL, -1, attr->fa_flowname, "flow");
+
+	if (ksp == NULL)
+		return (DLADM_WALK_TERMINATE);
+	else
+		flist->display = B_TRUE;
+
+	dladm_get_stats(kcp, ksp, &currstats);
+	if
(flist->ifspeed == 0) + (void) dladm_kstat_value(ksp, "ifspeed", KSTAT_DATA_UINT64, + &flist->ifspeed); + + if (flist->first) + flist->first = B_FALSE; + else { + dladm_stats_diff(diffstats, &currstats, prevstats); + dladm_stats_total(&totalstats, diffstats, &totalstats); + } + + bcopy(&currstats, prevstats, sizeof (pktsum_t)); + return (DLADM_WALK_CONTINUE); +} + +static void +print_link_stats(struct flowlist *flist) +{ + struct flowlist *fcurr; + double ikbs, okbs; + double ipks, opks; + double util; + double dlt; + int fcount; + static boolean_t first = B_TRUE; + + if (first) { + first = B_FALSE; + (void) printw("please wait...\n"); + return; + } + + for (fcount = 0, fcurr = flist; + fcount <= statentry; + fcount++, fcurr++) { + if ((fcurr->linkid != DATALINK_INVALID_LINKID) && + fcurr->display) { + char linkname[MAXNAMELEN]; + + (void) dladm_datalink_id2info(fcurr->linkid, NULL, NULL, + NULL, linkname, sizeof (linkname)); + dlt = (double)fcurr->diffstats.snaptime/(double)NANOSEC; + ikbs = (double)fcurr->diffstats.rbytes * 8 / dlt / 1024; + okbs = (double)fcurr->diffstats.obytes * 8 / dlt / 1024; + ipks = (double)fcurr->diffstats.ipackets / dlt; + opks = (double)fcurr->diffstats.opackets / dlt; + (void) printw("%-10.10s", linkname); + (void) printw("%9.2f %9.2f %9.2f %9.2f ", + ikbs, okbs, ipks, opks); + if (fcurr->ifspeed != 0) + util = ((ikbs + okbs) * 1024) * + 100/ fcurr->ifspeed; + else + util = (double)0; + (void) attron(A_BOLD); + (void) printw(" %6.2f", util); + (void) attroff(A_BOLD); + (void) printw("\n"); + } + } +} + +/* + * This function is called through the dladm_walk_datalink_id() walker and + * calls the dladm_walk_flow() walker. + */ + +/*ARGSUSED*/ +static int +link_flowstats(datalink_id_t linkid, void *arg) +{ + return (dladm_walk_flow(flow_kstats, linkid, arg, B_FALSE)); +} + +/*ARGSUSED*/ +static int +link_kstats(datalink_id_t linkid, void *arg) +{ + kstat_ctl_t *kcp = (kstat_ctl_t *)arg; + struct flowlist *flist; + pktsum_t currstats, *prevstats, *diffstats; + kstat_t *ksp; + char linkname[MAXNAMELEN]; + + /* find the flist entry */ + flist = findstat(NULL, linkid); + if (flist != NULL) { + prevstats = &flist->prevstats; + diffstats = &flist->diffstats; + } else { + return (DLADM_WALK_CONTINUE); + } + + /* lookup kstat entry */ + (void) dladm_datalink_id2info(linkid, NULL, NULL, NULL, linkname, + sizeof (linkname)); + + if (linkname == NULL) { + warn("no linkname for linkid"); + return (DLADM_WALK_TERMINATE); + } + + ksp = dladm_kstat_lookup(kcp, NULL, -1, linkname, "net"); + + if (ksp == NULL) + return (DLADM_WALK_TERMINATE); + else + flist->display = B_TRUE; + + /* read packet and byte stats */ + dladm_get_stats(kcp, ksp, &currstats); + + if (flist->ifspeed == 0) + (void) dladm_kstat_value(ksp, "ifspeed", KSTAT_DATA_UINT64, + &flist->ifspeed); + + if (flist->first == B_TRUE) + flist->first = B_FALSE; + else + dladm_stats_diff(diffstats, &currstats, prevstats); + + bcopy(&currstats, prevstats, sizeof (*prevstats)); + + return (DLADM_WALK_CONTINUE); +} + +/*ARGSUSED*/ +static void +sig_break(int s) +{ + handle_break = 1; +} + +/*ARGSUSED*/ +static void +sig_resize(int s) +{ + handle_resize = 1; +} + +static void +curses_init() +{ + maxx = maxx; /* lint */ + maxy = maxy; /* lint */ + + /* Install signal handlers */ + (void) signal(SIGINT, sig_break); + (void) signal(SIGQUIT, sig_break); + (void) signal(SIGTERM, sig_break); + (void) signal(SIGWINCH, sig_resize); + + /* Initialize ncurses */ + (void) initscr(); + (void) cbreak(); + (void) noecho(); + (void) 
curs_set(0);
+	timeout(0);
+	getmaxyx(stdscr, maxy, maxx);
+}
+
+static void
+curses_fin()
+{
+	(void) printw("\n");
+	(void) curs_set(1);
+	(void) nocbreak();
+	(void) endwin();
+
+	free(stattable);
+}
+
+static void
+stat_report(kstat_ctl_t *kcp, datalink_id_t linkid, const char *flowname,
+    int opt)
+{
+
+	double dlt, ikbs, okbs, ipks, opks;
+
+	struct flowlist *fstable = stattable;
+
+	if ((opt != LINK_REPORT) && (opt != FLOW_REPORT))
+		return;
+
+	/* Handle window resizes */
+	if (handle_resize) {
+		(void) endwin();
+		(void) initscr();
+		(void) cbreak();
+		(void) noecho();
+		(void) curs_set(0);
+		timeout(0);
+		getmaxyx(stdscr, maxy, maxx);
+		redraw = 1;
+		handle_resize = 0;
+	}
+
+	/* Print title */
+	(void) erase();
+	(void) attron(A_BOLD);
+	(void) move(0, 0);
+	if (opt == FLOW_REPORT)
+		(void) printw("%-15.15s", "Flow");
+	(void) printw("%-10.10s", "Link");
+	(void) printw("%9.9s %9.9s %9.9s %9.9s ",
+	    "iKb/s", "oKb/s", "iPk/s", "oPk/s");
+	if (opt == LINK_REPORT)
+		(void) printw(" %6.6s", "%Util");
+	(void) printw("\n");
+	(void) attroff(A_BOLD);
+
+	(void) move(2, 0);
+
+	/* Print stats for each link or flow */
+	bzero(&totalstats, sizeof (totalstats));
+	if (opt == LINK_REPORT) {
+		/* Display all links */
+		if (linkid == DATALINK_ALL_LINKID) {
+			(void) dladm_walk_datalink_id(link_kstats,
+			    (void *)kcp, DATALINK_CLASS_ALL,
+			    DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+		/* Display 1 link */
+		} else {
+			(void) link_kstats(linkid, kcp);
+		}
+		print_link_stats(fstable);
+
+	} else if (opt == FLOW_REPORT) {
+		/* Display 1 flow */
+		if (flowname != NULL) {
+			dladm_flow_attr_t fattr;
+			if (dladm_flow_info(flowname, &fattr) !=
+			    DLADM_STATUS_OK)
+				return;
+			(void) flow_kstats(&fattr, kcp);
+		/* Display all flows on all links */
+		} else if (linkid == DATALINK_ALL_LINKID) {
+			(void) dladm_walk_datalink_id(link_flowstats,
+			    (void *)kcp, DATALINK_CLASS_ALL,
+			    DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE);
+		/* Display all flows on a link */
+		} else if (linkid != DATALINK_INVALID_LINKID) {
+			(void) dladm_walk_flow(flow_kstats, linkid, kcp,
+			    B_FALSE);
+		}
+		print_flow_stats(fstable);
+
+		/* Print totals */
+		(void) attron(A_BOLD);
+		dlt = (double)totalstats.snaptime / (double)NANOSEC;
+		ikbs = totalstats.rbytes * 8 / dlt / 1024;
+		okbs = totalstats.obytes * 8 / dlt / 1024;
+		ipks = totalstats.ipackets / dlt;
+		opks = totalstats.opackets / dlt;
+		(void) printw("\n%-25.25s", "Totals");
+		(void) printw("%9.2f %9.2f %9.2f %9.2f ",
+		    ikbs, okbs, ipks, opks);
+		(void) attroff(A_BOLD);
+	}
+
+	if (redraw)
+		(void) clearok(stdscr, 1);
+
+	if (refresh() == ERR)
+		return;
+
+	if (redraw) {
+		(void) clearok(stdscr, 0);
+		redraw = 0;
+	}
+}
+
+/* Exported functions */
+
+/*
+ * Continuously display link or flow statistics using a libcurses
+ * based display.
+ */
+
+void
+dladm_continuous(datalink_id_t linkid, const char *flowname, int interval,
+    int opt)
+{
+	kstat_ctl_t *kcp;
+
+	if ((kcp = kstat_open()) == NULL) {
+		warn("kstat open operation failed");
+		return;
+	}
+
+	curses_init();
+
+	for (;;) {
+
+		if (handle_break)
+			break;
+
+		stat_report(kcp, linkid, flowname, opt);
+
+		(void) sleep(max(1, interval));
+	}
+
+	(void) curses_fin();
+	(void) kstat_close(kcp);
+}
+
+/*
+ * dladm_kstat_lookup() is a modified version of kstat_lookup which
+ * adds the class as a selector.
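+ * Each of module, instance, name and class may be wildcarded with
+ * NULL (or -1 for instance), so a typical per-link query is
+ * (sketch, with an illustrative link name):
+ *
+ *	ksp = dladm_kstat_lookup(kcp, NULL, -1, "bge0", "net");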
+ */
+
+kstat_t *
+dladm_kstat_lookup(kstat_ctl_t *kcp, const char *module, int instance,
+    const char *name, const char *class)
+{
+	kstat_t	*ksp = NULL;
+
+	for (ksp = kcp->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
+		if ((module == NULL || strcmp(ksp->ks_module, module) == 0) &&
+		    (instance == -1 || ksp->ks_instance == instance) &&
+		    (name == NULL || strcmp(ksp->ks_name, name) == 0) &&
+		    (class == NULL || strcmp(ksp->ks_class, class) == 0))
+			return (ksp);
+	}
+
+	errno = ENOENT;
+	return (NULL);
+}
+
+/*
+ * dladm_get_stats() populates the supplied pktsum_t structure with
+ * the input and output packet and byte kstats from the kstat_t
+ * found with dladm_kstat_lookup.
+ */
+void
+dladm_get_stats(kstat_ctl_t *kcp, kstat_t *ksp, pktsum_t *stats)
+{
+
+	if (kstat_read(kcp, ksp, NULL) == -1)
+		return;
+
+	stats->snaptime = gethrtime();
+
+	if (dladm_kstat_value(ksp, "ipackets64", KSTAT_DATA_UINT64,
+	    &stats->ipackets) < 0) {
+		if (dladm_kstat_value(ksp, "ipackets", KSTAT_DATA_UINT64,
+		    &stats->ipackets) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "opackets64", KSTAT_DATA_UINT64,
+	    &stats->opackets) < 0) {
+		if (dladm_kstat_value(ksp, "opackets", KSTAT_DATA_UINT64,
+		    &stats->opackets) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "rbytes64", KSTAT_DATA_UINT64,
+	    &stats->rbytes) < 0) {
+		if (dladm_kstat_value(ksp, "rbytes", KSTAT_DATA_UINT64,
+		    &stats->rbytes) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "obytes64", KSTAT_DATA_UINT64,
+	    &stats->obytes) < 0) {
+		if (dladm_kstat_value(ksp, "obytes", KSTAT_DATA_UINT64,
+		    &stats->obytes) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "ierrors", KSTAT_DATA_UINT32,
+	    &stats->ierrors) < 0) {
+		if (dladm_kstat_value(ksp, "ierrors", KSTAT_DATA_UINT64,
+		    &stats->ierrors) < 0)
+			return;
+	}
+
+	if (dladm_kstat_value(ksp, "oerrors", KSTAT_DATA_UINT32,
+	    &stats->oerrors) < 0) {
+		if (dladm_kstat_value(ksp, "oerrors", KSTAT_DATA_UINT64,
+		    &stats->oerrors) < 0)
+			return;
+	}
+}
+
+int
+dladm_kstat_value(kstat_t *ksp, const char *name, uint8_t type, void *buf)
+{
+	kstat_named_t	*knp;
+
+	if ((knp = kstat_data_lookup(ksp, (char *)name)) == NULL)
+		return (-1);
+
+	if (knp->data_type != type)
+		return (-1);
+
+	switch (type) {
+	case KSTAT_DATA_UINT64:
+		*(uint64_t *)buf = knp->value.ui64;
+		break;
+	case KSTAT_DATA_UINT32:
+		*(uint32_t *)buf = knp->value.ui32;
+		break;
+	default:
+		return (-1);
+	}
+
+	return (0);
+}
+
+dladm_status_t
+dladm_get_single_mac_stat(datalink_id_t linkid, const char *name, uint8_t type,
+    void *val)
+{
+	kstat_ctl_t	*kcp;
+	char		module[DLPI_LINKNAME_MAX];
+	uint_t		instance;
+	char		link[DLPI_LINKNAME_MAX];
+	dladm_status_t	status;
+	uint32_t	flags, media;
+	kstat_t		*ksp;
+	dladm_phys_attr_t dpap;
+
+	if ((kcp = kstat_open()) == NULL) {
+		warn("kstat_open operation failed");
+		return (dladm_errno2status(errno));
+	}
+
+	if ((status = dladm_datalink_id2info(linkid, &flags, NULL, &media,
+	    link, DLPI_LINKNAME_MAX)) != DLADM_STATUS_OK) {
+		(void) kstat_close(kcp);
+		return (status);
+	}
+
+	if (media != DL_ETHER) {
+		(void) kstat_close(kcp);
+		return (DLADM_STATUS_LINKINVAL);
+	}
+
+	status = dladm_phys_info(linkid, &dpap, DLADM_OPT_PERSIST);
+
+	if (status != DLADM_STATUS_OK) {
+		(void) kstat_close(kcp);
+		return (status);
+	}
+
+	status = dladm_parselink(dpap.dp_dev, module, &instance);
+
+	if (status != DLADM_STATUS_OK) {
+		(void) kstat_close(kcp);
+		return (status);
+	}
+
+	/*
+	 * The kstat query could fail if the underlying MAC
+	 * driver was already detached.
+ */ + if ((ksp = kstat_lookup(kcp, module, instance, "mac")) == NULL && + (ksp = kstat_lookup(kcp, module, instance, NULL)) == NULL) + goto bail; + + if (kstat_read(kcp, ksp, NULL) == -1) + goto bail; + + if (dladm_kstat_value(ksp, name, type, val) < 0) + goto bail; + + (void) kstat_close(kcp); + return (DLADM_STATUS_OK); + +bail: + (void) kstat_close(kcp); + return (dladm_errno2status(errno)); +} + +/* Compute sum of 2 pktsums (s1 = s2 + s3) */ +void +dladm_stats_total(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3) +{ + s1->rbytes = s2->rbytes + s3->rbytes; + s1->ipackets = s2->ipackets + s3->ipackets; + s1->ierrors = s2->ierrors + s3->ierrors; + s1->obytes = s2->obytes + s3->obytes; + s1->opackets = s2->opackets + s3->opackets; + s1->oerrors = s2->oerrors + s3->oerrors; + s1->snaptime = s2->snaptime; +} + +/* Compute differences between 2 pktsums (s1 = s2 - s3) */ +void +dladm_stats_diff(pktsum_t *s1, pktsum_t *s2, pktsum_t *s3) +{ + s1->rbytes = s2->rbytes - s3->rbytes; + s1->ipackets = s2->ipackets - s3->ipackets; + s1->ierrors = s2->ierrors - s3->ierrors; + s1->obytes = s2->obytes - s3->obytes; + s1->opackets = s2->opackets - s3->opackets; + s1->oerrors = s2->oerrors - s3->oerrors; + s1->snaptime = s2->snaptime - s3->snaptime; +} diff --git a/usr/src/lib/libdladm/common/libdlstat.h b/usr/src/lib/libdladm/common/libdlstat.h new file mode 100644 index 0000000000..a142275268 --- /dev/null +++ b/usr/src/lib/libdladm/common/libdlstat.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBDLSTAT_H +#define _LIBDLSTAT_H + +/* + * This file includes structures, macros and common routines shared by all + * data-link administration, and routines which are used to retrieve and + * display statistics. 
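+ * The statistics helpers compose as follows in a typical polling
+ * loop (sketch): take a snapshot, diff it against the previous
+ * snapshot, then accumulate the delta into a running total:
+ *
+ *	dladm_get_stats(kcp, ksp, &curr);
+ *	dladm_stats_diff(&delta, &curr, &prev);		(delta = curr - prev)
+ *	dladm_stats_total(&total, &total, &delta);	(total += delta)
+ *	prev = curr;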
+ */ + +#include <kstat.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define LINK_REPORT 1 +#define FLOW_REPORT 2 + +typedef struct pktsum_s { + hrtime_t snaptime; + uint64_t ipackets; + uint64_t opackets; + uint64_t rbytes; + uint64_t obytes; + uint64_t ierrors; + uint64_t oerrors; +} pktsum_t; + +extern void dladm_continuous(datalink_id_t, const char *, int, int); + +extern kstat_t *dladm_kstat_lookup(kstat_ctl_t *, const char *, int, + const char *, const char *); +extern void dladm_get_stats(kstat_ctl_t *, kstat_t *, pktsum_t *); +extern int dladm_kstat_value(kstat_t *, const char *, uint8_t, + void *); +extern dladm_status_t dladm_get_single_mac_stat(datalink_id_t, const char *, + uint8_t, void *); + +extern void dladm_stats_total(pktsum_t *, pktsum_t *, pktsum_t *); +extern void dladm_stats_diff(pktsum_t *, pktsum_t *, pktsum_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBDLSTAT_H */ diff --git a/usr/src/lib/libdladm/common/libdlvlan.c b/usr/src/lib/libdladm/common/libdlvlan.c index f6d855db72..1dc04bf4eb 100644 --- a/usr/src/lib/libdladm/common/libdlvlan.c +++ b/usr/src/lib/libdladm/common/libdlvlan.c @@ -23,16 +23,8 @@ * Use is subject to license terms. */ -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include <errno.h> -#include <assert.h> -#include <sys/dld.h> -#include <libdladm_impl.h> -#include <libdllink.h> #include <libdlvlan.h> +#include <libdlvnic.h> /* * VLAN Administration Library. @@ -44,106 +36,19 @@ /* * Returns the current attributes of the specified VLAN. */ -static dladm_status_t -i_dladm_vlan_info_active(datalink_id_t vlanid, dladm_vlan_attr_t *dvap) -{ - int fd; - dld_ioc_vlan_attr_t div; - dladm_status_t status = DLADM_STATUS_OK; - - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) - return (dladm_errno2status(errno)); - - div.div_vlanid = vlanid; - - if (ioctl(fd, DLDIOC_VLAN_ATTR, &div) < 0) - status = dladm_errno2status(errno); - - dvap->dv_vid = div.div_vid; - dvap->dv_linkid = div.div_linkid; - dvap->dv_force = div.div_force; - dvap->dv_implicit = div.div_implicit; -done: - (void) close(fd); - return (status); -} - -/* - * Returns the persistent attributes of the specified VLAN. 
- */ -static dladm_status_t -i_dladm_vlan_info_persist(datalink_id_t vlanid, dladm_vlan_attr_t *dvap) -{ - dladm_conf_t conf = DLADM_INVALID_CONF; - dladm_status_t status; - uint64_t u64; - - if ((status = dladm_read_conf(vlanid, &conf)) != DLADM_STATUS_OK) - return (status); - - status = dladm_get_conf_field(conf, FLINKOVER, &u64, sizeof (u64)); - if (status != DLADM_STATUS_OK) - goto done; - dvap->dv_linkid = (datalink_id_t)u64; - - status = dladm_get_conf_field(conf, FFORCE, &dvap->dv_force, - sizeof (boolean_t)); - if (status != DLADM_STATUS_OK) - goto done; - - dvap->dv_implicit = B_FALSE; - - status = dladm_get_conf_field(conf, FVLANID, &u64, sizeof (u64)); - if (status != DLADM_STATUS_OK) - goto done; - dvap->dv_vid = (uint16_t)u64; - -done: - dladm_destroy_conf(conf); - return (status); -} - dladm_status_t dladm_vlan_info(datalink_id_t vlanid, dladm_vlan_attr_t *dvap, uint32_t flags) { - assert(flags == DLADM_OPT_ACTIVE || flags == DLADM_OPT_PERSIST); - if (flags == DLADM_OPT_ACTIVE) - return (i_dladm_vlan_info_active(vlanid, dvap)); - else - return (i_dladm_vlan_info_persist(vlanid, dvap)); -} - -static dladm_status_t -dladm_persist_vlan_conf(const char *vlan, datalink_id_t vlanid, - boolean_t force, datalink_id_t linkid, uint16_t vid) -{ - dladm_conf_t conf = DLADM_INVALID_CONF; - dladm_status_t status; - uint64_t u64; + dladm_status_t status; + dladm_vnic_attr_t attr, *vnic = &attr; - if ((status = dladm_create_conf(vlan, vlanid, DATALINK_CLASS_VLAN, - DL_ETHER, &conf)) != DLADM_STATUS_OK) { + if ((status = dladm_vnic_info(vlanid, vnic, flags)) != + DLADM_STATUS_OK) return (status); - } - u64 = linkid; - status = dladm_set_conf_field(conf, FLINKOVER, DLADM_TYPE_UINT64, &u64); - if (status != DLADM_STATUS_OK) - goto done; - - status = dladm_set_conf_field(conf, FFORCE, DLADM_TYPE_BOOLEAN, &force); - if (status != DLADM_STATUS_OK) - goto done; - - u64 = vid; - status = dladm_set_conf_field(conf, FVLANID, DLADM_TYPE_UINT64, &u64); - if (status != DLADM_STATUS_OK) - goto done; - - status = dladm_write_conf(conf); - -done: - dladm_destroy_conf(conf); + dvap->dv_vid = vnic->va_vid; + dvap->dv_linkid = vnic->va_link_id; + dvap->dv_force = vnic->va_force; return (status); } @@ -152,63 +57,11 @@ done: */ dladm_status_t dladm_vlan_create(const char *vlan, datalink_id_t linkid, uint16_t vid, - uint32_t flags) + dladm_arg_list_t *proplist, uint32_t flags, datalink_id_t *vlan_id_out) { - dld_ioc_create_vlan_t dic; - int fd; - datalink_id_t vlanid = DATALINK_INVALID_LINKID; - uint_t media; - datalink_class_t class; - dladm_status_t status; - - if (vid < 1 || vid > 4094) - return (DLADM_STATUS_VIDINVAL); - - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) - return (dladm_errno2status(errno)); - - status = dladm_datalink_id2info(linkid, NULL, &class, &media, NULL, 0); - if (status != DLADM_STATUS_OK || media != DL_ETHER || - class == DATALINK_CLASS_VLAN) { - return (DLADM_STATUS_BADARG); - } - - status = dladm_create_datalink_id(vlan, DATALINK_CLASS_VLAN, DL_ETHER, - flags & (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST), &vlanid); - if (status != DLADM_STATUS_OK) - goto fail; - - if (flags & DLADM_OPT_PERSIST) { - status = dladm_persist_vlan_conf(vlan, vlanid, - (flags & DLADM_OPT_FORCE) != 0, linkid, vid); - if (status != DLADM_STATUS_OK) - goto fail; - } - - if (flags & DLADM_OPT_ACTIVE) { - dic.dic_vlanid = vlanid; - dic.dic_linkid = linkid; - dic.dic_vid = vid; - dic.dic_force = (flags & DLADM_OPT_FORCE) != 0; - - if (ioctl(fd, DLDIOC_CREATE_VLAN, &dic) < 0) { - status = dladm_errno2status(errno); - 
if (flags & DLADM_OPT_PERSIST) - (void) dladm_remove_conf(vlanid); - goto fail; - } - } - - (void) close(fd); - return (DLADM_STATUS_OK); - -fail: - if (vlanid != DATALINK_INVALID_LINKID) { - (void) dladm_destroy_datalink_id(vlanid, - flags & (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST)); - } - (void) close(fd); - return (status); + return (dladm_vnic_create(vlan, linkid, VNIC_MAC_ADDR_TYPE_PRIMARY, + NULL, 0, NULL, 0, vid, vlan_id_out, proplist, + flags | DLADM_OPT_VLAN)); } /* @@ -217,124 +70,11 @@ fail: dladm_status_t dladm_vlan_delete(datalink_id_t vlanid, uint32_t flags) { - dld_ioc_delete_vlan_t did; - int fd; - datalink_class_t class; - dladm_status_t status = DLADM_STATUS_OK; - - if ((dladm_datalink_id2info(vlanid, NULL, &class, NULL, NULL, 0) != - DLADM_STATUS_OK) || (class != DATALINK_CLASS_VLAN)) { - return (DLADM_STATUS_BADARG); - } - - if (flags & DLADM_OPT_ACTIVE) { - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) - return (dladm_errno2status(errno)); - - did.did_linkid = vlanid; - if ((ioctl(fd, DLDIOC_DELETE_VLAN, &did) < 0) && - ((errno != ENOENT) || !(flags & DLADM_OPT_PERSIST))) { - (void) close(fd); - return (dladm_errno2status(errno)); - } - (void) close(fd); - - /* - * Delete active linkprop before this active link is deleted. - */ - (void) dladm_set_linkprop(vlanid, NULL, NULL, 0, - DLADM_OPT_ACTIVE); - } - - (void) dladm_destroy_datalink_id(vlanid, - flags & (DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST)); - - if (flags & DLADM_OPT_PERSIST) - (void) dladm_remove_conf(vlanid); - - return (status); -} - -/* - * Callback used by dladm_vlan_up() - */ -static int -i_dladm_vlan_up(datalink_id_t vlanid, void *arg) -{ - dladm_vlan_attr_t dva; - dld_ioc_create_vlan_t dic; - dladm_status_t *statusp = arg; - uint32_t flags; - int fd; - dladm_status_t status; - - status = dladm_vlan_info(vlanid, &dva, DLADM_OPT_PERSIST); - if (status != DLADM_STATUS_OK) - goto done; - - /* - * Validate (and delete) the link associated with this VLAN, see if - * the specific hardware has been removed during system shutdown. - */ - if ((status = dladm_datalink_id2info(dva.dv_linkid, &flags, NULL, - NULL, NULL, 0)) != DLADM_STATUS_OK) { - goto done; - } - - if (!(flags & DLADM_OPT_ACTIVE)) { - status = DLADM_STATUS_BADARG; - goto done; - } - - dic.dic_linkid = dva.dv_linkid; - dic.dic_force = dva.dv_force; - dic.dic_vid = dva.dv_vid; - - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) { - status = dladm_errno2status(errno); - goto done; - } - - dic.dic_vlanid = vlanid; - if (ioctl(fd, DLDIOC_CREATE_VLAN, &dic) < 0) { - status = dladm_errno2status(errno); - goto done; - } - - if ((status = dladm_up_datalink_id(vlanid)) != DLADM_STATUS_OK) { - dld_ioc_delete_vlan_t did; - - did.did_linkid = vlanid; - (void) ioctl(fd, DLDIOC_DELETE_VLAN, &did); - } else { - /* - * Reset the active linkprop of this specific link. - */ - (void) dladm_init_linkprop(vlanid, B_FALSE); - } - - (void) close(fd); -done: - *statusp = status; - return (DLADM_WALK_CONTINUE); + return (dladm_vnic_delete(vlanid, flags | DLADM_OPT_VLAN)); } -/* - * Bring up one VLAN, or all persistent VLANs. In the latter case, the - * walk may terminate early if bringup of a VLAN fails. 
- */ dladm_status_t dladm_vlan_up(datalink_id_t linkid) { - dladm_status_t status; - - if (linkid == DATALINK_ALL_LINKID) { - (void) dladm_walk_datalink_id(i_dladm_vlan_up, &status, - DATALINK_CLASS_VLAN, DATALINK_ANY_MEDIATYPE, - DLADM_OPT_PERSIST); - return (DLADM_STATUS_OK); - } else { - (void) i_dladm_vlan_up(linkid, &status); - return (status); - } + return (dladm_vnic_up(linkid, DLADM_OPT_VLAN)); } diff --git a/usr/src/lib/libdladm/common/libdlvlan.h b/usr/src/lib/libdladm/common/libdlvlan.h index 7a305443df..91f6ee8671 100644 --- a/usr/src/lib/libdladm/common/libdlvlan.h +++ b/usr/src/lib/libdladm/common/libdlvlan.h @@ -26,8 +26,6 @@ #ifndef _LIBDLVLAN_H #define _LIBDLVLAN_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file includes structures, macros and routines used by VLAN link * administration. @@ -43,13 +41,13 @@ typedef struct dladm_vlan_attr { uint16_t dv_vid; datalink_id_t dv_linkid; boolean_t dv_force; - boolean_t dv_implicit; } dladm_vlan_attr_t; extern dladm_status_t dladm_vlan_info(datalink_id_t, dladm_vlan_attr_t *, uint32_t); extern dladm_status_t dladm_vlan_create(const char *, datalink_id_t, - uint16_t, uint32_t); + uint16_t, dladm_arg_list_t *, uint32_t, + datalink_id_t *); extern dladm_status_t dladm_vlan_delete(datalink_id_t, uint32_t); extern dladm_status_t dladm_vlan_up(datalink_id_t); diff --git a/usr/src/lib/libdladm/common/libdlvnic.c b/usr/src/lib/libdladm/common/libdlvnic.c index ac97372785..dfa58bcac5 100644 --- a/usr/src/lib/libdladm/common/libdlvnic.c +++ b/usr/src/lib/libdladm/common/libdlvnic.c @@ -36,6 +36,7 @@ #include <libintl.h> #include <net/if_types.h> #include <net/if_dl.h> +#include <sys/dld.h> #include <libdladm_impl.h> #include <libdllink.h> #include <libdlvnic.h> @@ -44,137 +45,258 @@ * VNIC administration library. */ -/* Limits on buffer size for VNIC_IOC_INFO request */ -#define MIN_INFO_SIZE (4*1024) -#define MAX_INFO_SIZE (128*1024) - -/* configuration database entry */ -typedef struct dladm_vnic_attr_db { - datalink_id_t vt_vnic_id; - datalink_id_t vt_link_id; - vnic_mac_addr_type_t vt_mac_addr_type; - uint_t vt_mac_len; - uchar_t vt_mac_addr[MAXMACADDRLEN]; -} dladm_vnic_attr_db_t; - -typedef struct dladm_vnic_modify_attr { - vnic_mac_addr_type_t vm_mac_addr_type; - int vm_mac_len; - uchar_t vm_mac_addr[MAXMACADDRLEN]; -} dladm_vnic_modify_attr_t; +/* + * Default random MAC address prefix (locally administered). + */ +static char dladm_vnic_def_prefix[] = {0x02, 0x08, 0x20}; + +static dladm_status_t dladm_vnic_persist_conf(const char *name, + dladm_vnic_attr_t *, datalink_class_t); +static const char *dladm_vnic_macaddr2str(const uchar_t *, char *); +static dladm_status_t dladm_vnic_str2macaddr(const char *, uchar_t *); /* - * Send a create command to the VNIC driver. + * Convert a diagnostic returned by the kernel into a dladm_status_t. 
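+ * For example, VNIC_IOC_DIAG_MACADDR_INUSE maps to
+ * DLADM_STATUS_INVALIDMACADDRINUSE; a diagnostic with no mapping
+ * below falls through to DLADM_STATUS_OK.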
*/ static dladm_status_t -i_dladm_vnic_create_sys(int fd, dladm_vnic_attr_db_t *attr) +dladm_vnic_diag2status(vnic_ioc_diag_t ioc_diag) { - vnic_ioc_create_t ioc; - - ioc.vc_vnic_id = attr->vt_vnic_id; - ioc.vc_link_id = attr->vt_link_id; - ioc.vc_mac_addr_type = attr->vt_mac_addr_type; - ioc.vc_mac_len = attr->vt_mac_len; - bcopy(attr->vt_mac_addr, ioc.vc_mac_addr, attr->vt_mac_len); - - if (ioctl(fd, VNIC_IOC_CREATE, &ioc) < 0) - return (dladm_errno2status(errno)); - + switch (ioc_diag) { + case VNIC_IOC_DIAG_MACADDR_INVALID: + return (DLADM_STATUS_INVALIDMACADDR); + case VNIC_IOC_DIAG_MACADDRLEN_INVALID: + return (DLADM_STATUS_INVALIDMACADDRLEN); + case VNIC_IOC_DIAG_MACADDR_NIC: + return (DLADM_STATUS_INVALIDMACADDRNIC); + case VNIC_IOC_DIAG_MACADDR_INUSE: + return (DLADM_STATUS_INVALIDMACADDRINUSE); + case VNIC_IOC_DIAG_MACFACTORYSLOTINVALID: + return (DLADM_STATUS_MACFACTORYSLOTINVALID); + case VNIC_IOC_DIAG_MACFACTORYSLOTUSED: + return (DLADM_STATUS_MACFACTORYSLOTUSED); + case VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED: + return (DLADM_STATUS_MACFACTORYSLOTALLUSED); + case VNIC_IOC_DIAG_MACFACTORYNOTSUP: + return (DLADM_STATUS_MACFACTORYNOTSUP); + case VNIC_IOC_DIAG_MACPREFIX_INVALID: + return (DLADM_STATUS_INVALIDMACPREFIX); + case VNIC_IOC_DIAG_MACPREFIXLEN_INVALID: + return (DLADM_STATUS_INVALIDMACPREFIXLEN); + case VNIC_IOC_DIAG_MACMARGIN_INVALID: + return (DLADM_STATUS_INVALID_MACMARGIN); + case VNIC_IOC_DIAG_NO_HWRINGS: + return (DLADM_STATUS_NO_HWRINGS); + } return (DLADM_STATUS_OK); } /* - * Send a modify command to the VNIC driver. + * Send a create command to the VNIC driver. */ -static dladm_status_t -i_dladm_vnic_modify_sys(datalink_id_t vnic_id, uint32_t modify_mask, - dladm_vnic_modify_attr_t *attr) +dladm_status_t +i_dladm_vnic_create_sys(dladm_vnic_attr_t *attr) { + int rc, fd; + vnic_ioc_create_t ioc; dladm_status_t status = DLADM_STATUS_OK; - int fd; - vnic_ioc_modify_t ioc; - - ioc.vm_vnic_id = vnic_id; - ioc.vm_modify_mask = 0; - if (modify_mask & DLADM_VNIC_MODIFY_ADDR) - ioc.vm_modify_mask |= VNIC_IOC_MODIFY_ADDR; - - ioc.vm_mac_addr_type = attr->vm_mac_addr_type; - ioc.vm_mac_len = attr->vm_mac_len; - bcopy(attr->vm_mac_addr, ioc.vm_mac_addr, MAXMACADDRLEN); + bzero(&ioc, sizeof (ioc)); + ioc.vc_vnic_id = attr->va_vnic_id; + ioc.vc_link_id = attr->va_link_id; + ioc.vc_mac_addr_type = attr->va_mac_addr_type; + ioc.vc_mac_len = attr->va_mac_len; + ioc.vc_mac_slot = attr->va_mac_slot; + ioc.vc_mac_prefix_len = attr->va_mac_prefix_len; + ioc.vc_vid = attr->va_vid; + ioc.vc_flags = attr->va_force ? VNIC_IOC_CREATE_FORCE : 0; + ioc.vc_flags |= attr->va_hwrings ? 
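/*
 * Editor's sketch, not part of this patch: the error-handling pattern the
 * new VNIC ioctl wrappers share.  A failed ioctl yields a generic status
 * via errno, which the kernel-supplied diagnostic, when set, refines
 * through dladm_vnic_diag2status() above.  The helper name is
 * hypothetical; the calls are the ones used in this file.
 *
 *	static dladm_status_t
 *	vnic_ioc_status(int rc, vnic_ioc_diag_t diag)
 *	{
 *		if (rc >= 0)
 *			return (DLADM_STATUS_OK);
 *		if (diag != VNIC_IOC_DIAG_NONE)
 *			return (dladm_vnic_diag2status(diag));
 *		return (dladm_errno2status(errno));
 *	}
 */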
VNIC_IOC_CREATE_REQ_HWRINGS : 0; + + if (attr->va_mac_len > 0 || ioc.vc_mac_prefix_len > 0) + bcopy(attr->va_mac_addr, ioc.vc_mac_addr, MAXMACADDRLEN); + bcopy(&attr->va_resource_props, &ioc.vc_resource_props, + sizeof (mac_resource_props_t)); + if (attr->va_link_id == DATALINK_INVALID_LINKID) + ioc.vc_flags |= VNIC_IOC_CREATE_ANCHOR; if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) return (dladm_errno2status(errno)); - if (ioctl(fd, VNIC_IOC_MODIFY, &ioc) < 0) + rc = ioctl(fd, VNIC_IOC_CREATE, &ioc); + if (rc < 0) status = dladm_errno2status(errno); (void) close(fd); + if (status != DLADM_STATUS_OK) { + if (ioc.vc_diag != VNIC_IOC_DIAG_NONE) + status = dladm_vnic_diag2status(ioc.vc_diag); + } + if (status != DLADM_STATUS_OK) + return (status); + + attr->va_mac_addr_type = ioc.vc_mac_addr_type; + switch (ioc.vc_mac_addr_type) { + case VNIC_MAC_ADDR_TYPE_FACTORY: + attr->va_mac_slot = ioc.vc_mac_slot; + break; + case VNIC_MAC_ADDR_TYPE_RANDOM: + bcopy(ioc.vc_mac_addr, attr->va_mac_addr, MAXMACADDRLEN); + attr->va_mac_len = ioc.vc_mac_len; + break; + } return (status); } /* * Get the configuration information of the given VNIC. */ -dladm_status_t -dladm_vnic_info(datalink_id_t vnic_id, dladm_vnic_attr_sys_t *attrp, - uint32_t flags) +static dladm_status_t +i_dladm_vnic_info_active(datalink_id_t linkid, dladm_vnic_attr_t *attrp) { - vnic_ioc_info_t *ioc; - vnic_ioc_info_vnic_t *vnic; - int bufsize, fd; + vnic_ioc_info_t ioc; + vnic_info_t *vnic; + int rc, fd; dladm_status_t status = DLADM_STATUS_OK; - /* for now, only temporary creations are supported */ - if (flags & DLADM_OPT_PERSIST) - return (dladm_errno2status(ENOTSUP)); - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) == -1) return (dladm_errno2status(errno)); - bufsize = sizeof (vnic_ioc_info_t) + sizeof (vnic_ioc_info_vnic_t); - ioc = (vnic_ioc_info_t *)calloc(1, bufsize); - if (ioc == NULL) { - (void) close(fd); - return (dladm_errno2status(ENOMEM)); - } + bzero(&ioc, sizeof (ioc)); + vnic = &ioc.vi_info; + vnic->vn_vnic_id = linkid; - ioc->vi_vnic_id = vnic_id; - ioc->vi_size = bufsize - sizeof (vnic_ioc_info_t); - if (ioctl(fd, VNIC_IOC_INFO, ioc) != 0) { + rc = ioctl(fd, VNIC_IOC_INFO, &ioc); + if (rc != 0) { status = dladm_errno2status(errno); goto bail; } - vnic = (vnic_ioc_info_vnic_t *)(ioc + 1); - attrp->va_vnic_id = vnic->vn_vnic_id; attrp->va_link_id = vnic->vn_link_id; attrp->va_mac_addr_type = vnic->vn_mac_addr_type; - bcopy(vnic->vn_mac_addr, attrp->va_mac_addr, ETHERADDRL); + bcopy(vnic->vn_mac_addr, attrp->va_mac_addr, MAXMACADDRLEN); attrp->va_mac_len = vnic->vn_mac_len; + attrp->va_mac_slot = vnic->vn_mac_slot; + attrp->va_mac_prefix_len = vnic->vn_mac_prefix_len; + attrp->va_vid = vnic->vn_vid; + attrp->va_force = vnic->vn_force; bail: - free(ioc); (void) close(fd); return (status); } +static dladm_status_t +i_dladm_vnic_info_persist(datalink_id_t linkid, dladm_vnic_attr_t *attrp) +{ + dladm_conf_t conf; + dladm_status_t status; + char macstr[ETHERADDRL * 3]; + uint64_t u64; + datalink_class_t class; + + attrp->va_vnic_id = linkid; + if ((status = dladm_read_conf(linkid, &conf)) != DLADM_STATUS_OK) + return (status); + + status = dladm_get_conf_field(conf, FLINKOVER, &u64, sizeof (u64)); + attrp->va_link_id = ((status == DLADM_STATUS_OK) ? 
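/*
 * Editor's sketch, not part of this patch: reading a VNIC's running
 * attributes through dladm_vnic_info(), which dispatches to
 * i_dladm_vnic_info_active() above for DLADM_OPT_ACTIVE.  The link name
 * "vnic0" is illustrative; error handling is elided.
 *
 *	datalink_id_t linkid;
 *	dladm_vnic_attr_t attr;
 *
 *	if (dladm_name2info("vnic0", &linkid, NULL, NULL, NULL) ==
 *	    DLADM_STATUS_OK &&
 *	    dladm_vnic_info(linkid, &attr, DLADM_OPT_ACTIVE) ==
 *	    DLADM_STATUS_OK)
 *		(void) printf("over link %u, vid %u\n",
 *		    attr.va_link_id, attr.va_vid);
 */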
+ (datalink_id_t)u64 : DATALINK_INVALID_LINKID); + + status = dladm_get_conf_field(conf, FHWRINGS, &attrp->va_hwrings, + sizeof (boolean_t)); + + if (status != DLADM_STATUS_OK && status != DLADM_STATUS_NOTFOUND) + goto done; + if (status == DLADM_STATUS_NOTFOUND) + attrp->va_hwrings = B_FALSE; + + if ((status = dladm_datalink_id2info(linkid, NULL, &class, + NULL, NULL, 0)) != DLADM_STATUS_OK) + goto done; + + if (class == DATALINK_CLASS_VLAN) { + if (attrp->va_link_id == DATALINK_INVALID_LINKID) { + status = DLADM_STATUS_BADARG; + goto done; + } + attrp->va_mac_addr_type = VNIC_MAC_ADDR_TYPE_PRIMARY; + attrp->va_mac_len = 0; + } else { + status = dladm_get_conf_field(conf, FMADDRTYPE, &u64, + sizeof (u64)); + if (status != DLADM_STATUS_OK) + goto done; + + attrp->va_mac_addr_type = (vnic_mac_addr_type_t)u64; + + status = dladm_get_conf_field(conf, FMADDRLEN, &u64, + sizeof (u64)); + attrp->va_mac_len = ((status == DLADM_STATUS_OK) ? + (uint_t)u64 : ETHERADDRL); + + status = dladm_get_conf_field(conf, FMADDRSLOT, &u64, + sizeof (u64)); + attrp->va_mac_slot = ((status == DLADM_STATUS_OK) ? + (int)u64 : -1); + + status = dladm_get_conf_field(conf, FMADDRPREFIXLEN, &u64, + sizeof (u64)); + attrp->va_mac_prefix_len = ((status == DLADM_STATUS_OK) ? + (uint_t)u64 : sizeof (dladm_vnic_def_prefix)); + + status = dladm_get_conf_field(conf, FMACADDR, macstr, + sizeof (macstr)); + if (status != DLADM_STATUS_OK) + goto done; + + status = dladm_vnic_str2macaddr(macstr, attrp->va_mac_addr); + if (status != DLADM_STATUS_OK) + goto done; + } + + status = dladm_get_conf_field(conf, FVLANID, &u64, sizeof (u64)); + attrp->va_vid = ((status == DLADM_STATUS_OK) ? (uint16_t)u64 : 0); + + + status = DLADM_STATUS_OK; +done: + dladm_destroy_conf(conf); + return (status); +} + +dladm_status_t +dladm_vnic_info(datalink_id_t linkid, dladm_vnic_attr_t *attrp, + uint32_t flags) +{ + if (flags == DLADM_OPT_ACTIVE) + return (i_dladm_vnic_info_active(linkid, attrp)); + else if (flags == DLADM_OPT_PERSIST) + return (i_dladm_vnic_info_persist(linkid, attrp)); + else + return (DLADM_STATUS_BADARG); +} + /* * Remove a VNIC from the kernel. 
*/ -static dladm_status_t -i_dladm_vnic_delete_sys(int fd, dladm_vnic_attr_sys_t *attr) +dladm_status_t +i_dladm_vnic_delete_sys(datalink_id_t linkid) { vnic_ioc_delete_t ioc; + dladm_status_t status = DLADM_STATUS_OK; + int rc, fd; - ioc.vd_vnic_id = attr->va_vnic_id; - - if (ioctl(fd, VNIC_IOC_DELETE, &ioc) < 0) + if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) return (dladm_errno2status(errno)); - return (DLADM_STATUS_OK); + ioc.vd_vnic_id = linkid; + + rc = ioctl(fd, VNIC_IOC_DELETE, &ioc); + if (rc < 0) + status = dladm_errno2status(errno); + + (void) close(fd); + return (status); } /* @@ -182,20 +304,32 @@ i_dladm_vnic_delete_sys(int fd, dladm_vnic_attr_sys_t *attr) */ typedef struct dladm_vnic_addr_type_s { - char *va_str; - vnic_mac_addr_type_t va_type; + const char *va_str; + vnic_mac_addr_type_t va_type; } dladm_vnic_addr_type_t; static dladm_vnic_addr_type_t addr_types[] = { {"fixed", VNIC_MAC_ADDR_TYPE_FIXED}, + {"random", VNIC_MAC_ADDR_TYPE_RANDOM}, + {"factory", VNIC_MAC_ADDR_TYPE_FACTORY}, + {"auto", VNIC_MAC_ADDR_TYPE_AUTO}, + {"fixed", VNIC_MAC_ADDR_TYPE_PRIMARY} }; #define NADDR_TYPES (sizeof (addr_types) / sizeof (dladm_vnic_addr_type_t)) -/* - * Return DLADM_STATUS_OK if a matching type was found, - * DLADM_STATUS_BADARG otherwise - */ +static const char * +dladm_vnic_macaddrtype2str(vnic_mac_addr_type_t type) +{ + int i; + + for (i = 0; i < NADDR_TYPES; i++) { + if (type == addr_types[i].va_type) + return (addr_types[i].va_str); + } + return (NULL); +} + dladm_status_t dladm_vnic_str2macaddrtype(const char *str, vnic_mac_addr_type_t *val) { @@ -209,136 +343,397 @@ dladm_vnic_str2macaddrtype(const char *str, vnic_mac_addr_type_t *val) return (DLADM_STATUS_OK); } } - return (DLADM_STATUS_BADARG); } + + /* - * Create a new VNIC. Update the configuration file and bring it up. + * Create a new VNIC / VLAN. Update the configuration file and bring it up. */ dladm_status_t dladm_vnic_create(const char *vnic, datalink_id_t linkid, vnic_mac_addr_type_t mac_addr_type, uchar_t *mac_addr, int mac_len, - datalink_id_t *vnic_id_out, uint32_t flags) + int *mac_slot, uint_t mac_prefix_len, uint16_t vid, + datalink_id_t *vnic_id_out, dladm_arg_list_t *proplist, uint32_t flags) { - dladm_vnic_attr_db_t attr; - int i, fd; + dladm_vnic_attr_t attr; datalink_id_t vnic_id; datalink_class_t class; - uint32_t media; - char *name = (char *)vnic; + uint32_t media = DL_ETHER; + char name[MAXLINKNAMELEN]; + uchar_t tmp_addr[MAXMACADDRLEN]; dladm_status_t status; + boolean_t is_vlan; + boolean_t is_etherstub; + int i; /* * Sanity test arguments. */ - if (flags & DLADM_OPT_PERSIST) - return (dladm_errno2status(ENOTSUP)); + if ((flags & DLADM_OPT_ACTIVE) == 0) + return (DLADM_STATUS_NOTSUP); + + is_vlan = ((flags & DLADM_OPT_VLAN) != 0); + if (is_vlan && ((vid < 1 || vid > 4094))) + return (DLADM_STATUS_VIDINVAL); + + is_etherstub = (linkid == DATALINK_INVALID_LINKID); if (mac_len > MAXMACADDRLEN) return (DLADM_STATUS_INVALIDMACADDRLEN); - for (i = 0; i < NADDR_TYPES; i++) { - if (mac_addr_type == addr_types[i].va_type) - break; - } - if (i == NADDR_TYPES) + if (!dladm_vnic_macaddrtype2str(mac_addr_type)) return (DLADM_STATUS_INVALIDMACADDRTYPE); - if ((status = dladm_datalink_id2info(linkid, NULL, &class, &media, - NULL, 0)) != DLADM_STATUS_OK) { - return (status); + /* + * If a random address might be generated, but no prefix + * was specified by the caller, use the default MAC address + * prefix. 
+ */ + if ((mac_addr_type == VNIC_MAC_ADDR_TYPE_RANDOM || + mac_addr_type == VNIC_MAC_ADDR_TYPE_AUTO) && + mac_prefix_len == 0) { + mac_prefix_len = sizeof (dladm_vnic_def_prefix); + mac_addr = tmp_addr; + bcopy(dladm_vnic_def_prefix, mac_addr, mac_prefix_len); } - if (class == DATALINK_CLASS_VNIC) - return (DLADM_STATUS_BADARG); + if ((flags & DLADM_OPT_ANCHOR) == 0) { + if ((status = dladm_datalink_id2info(linkid, NULL, &class, + &media, NULL, 0)) != DLADM_STATUS_OK) + return (status); + + if (class == DATALINK_CLASS_VNIC || + class == DATALINK_CLASS_VLAN) + return (DLADM_STATUS_BADARG); + } else { + /* it's an anchor VNIC */ + if (linkid != DATALINK_INVALID_LINKID || vid != 0) + return (DLADM_STATUS_BADARG); + } if (vnic == NULL) { flags |= DLADM_OPT_PREFIX; - name = "vnic"; + (void) strlcpy(name, "vnic", sizeof (name)); + } else { + (void) strlcpy(name, vnic, sizeof (name)); } - if ((status = dladm_create_datalink_id(name, DATALINK_CLASS_VNIC, - media, flags, &vnic_id)) != DLADM_STATUS_OK) { + class = is_vlan ? DATALINK_CLASS_VLAN : + (is_etherstub ? DATALINK_CLASS_ETHERSTUB : DATALINK_CLASS_VNIC); + if ((status = dladm_create_datalink_id(name, class, + media, flags, &vnic_id)) != DLADM_STATUS_OK) return (status); + + if ((flags & DLADM_OPT_PREFIX) != 0) { + (void) snprintf(name + 4, sizeof (name), "%llu", vnic_id); + flags &= ~DLADM_OPT_PREFIX; } bzero(&attr, sizeof (attr)); - attr.vt_vnic_id = vnic_id; - attr.vt_link_id = linkid; - attr.vt_mac_addr_type = mac_addr_type; - attr.vt_mac_len = mac_len; - bcopy(mac_addr, attr.vt_mac_addr, mac_len); - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) { - status = dladm_errno2status(errno); + /* Extract resource_ctl and cpu_list from proplist */ + if (proplist != NULL) { + status = dladm_link_proplist_extract(proplist, + &attr.va_resource_props); + if (status != DLADM_STATUS_OK) + goto done; + } + + attr.va_vnic_id = vnic_id; + attr.va_link_id = linkid; + attr.va_mac_addr_type = mac_addr_type; + attr.va_mac_len = mac_len; + if (mac_slot != NULL) + attr.va_mac_slot = *mac_slot; + if (mac_len > 0) + bcopy(mac_addr, attr.va_mac_addr, mac_len); + else if (mac_prefix_len > 0) + bcopy(mac_addr, attr.va_mac_addr, mac_prefix_len); + attr.va_mac_prefix_len = mac_prefix_len; + attr.va_vid = vid; + attr.va_force = (flags & DLADM_OPT_FORCE) != 0; + attr.va_hwrings = (flags & DLADM_OPT_HWRINGS) != 0; + + status = i_dladm_vnic_create_sys(&attr); + if (status != DLADM_STATUS_OK) + goto done; + + /* Save vnic configuration and its properties */ + if (!(flags & DLADM_OPT_PERSIST)) + goto done; + + status = dladm_vnic_persist_conf(name, &attr, class); + if (status != DLADM_STATUS_OK) { + (void) i_dladm_vnic_delete_sys(vnic_id); goto done; } - status = i_dladm_vnic_create_sys(fd, &attr); - (void) close(fd); + if (proplist != NULL) { + for (i = 0; i < proplist->al_count; i++) { + dladm_arg_info_t *aip = &proplist->al_info[i]; + + status = dladm_set_linkprop(vnic_id, aip->ai_name, + aip->ai_val, aip->ai_count, DLADM_OPT_PERSIST); + if (status != DLADM_STATUS_OK) + break; + } + + if (status != DLADM_STATUS_OK) { + (void) dladm_remove_conf(vnic_id); + (void) i_dladm_vnic_delete_sys(vnic_id); + } + } done: if (status != DLADM_STATUS_OK) { - (void) dladm_destroy_datalink_id(vnic_id, - flags & ~DLADM_OPT_PREFIX); + (void) dladm_destroy_datalink_id(vnic_id, flags); } else { - *vnic_id_out = vnic_id; + if (vnic_id_out != NULL) + *vnic_id_out = vnic_id; + if (mac_slot != NULL) + *mac_slot = attr.va_mac_slot; } - return (status); } /* - * Modify the properties of a VNIC. 
+ * Delete a VNIC / VLAN. */ dladm_status_t -dladm_vnic_modify(datalink_id_t vnic_id, uint32_t modify_mask, - vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr, - uint32_t flags) +dladm_vnic_delete(datalink_id_t linkid, uint32_t flags) { - dladm_vnic_modify_attr_t new_attr; + dladm_status_t status; + datalink_class_t class; - /* for now, only temporary creations are supported */ - if (flags & DLADM_OPT_PERSIST) - return (dladm_errno2status(ENOTSUP)); + if (flags == 0) + return (DLADM_STATUS_BADARG); - bzero(&new_attr, sizeof (new_attr)); + if ((dladm_datalink_id2info(linkid, NULL, &class, NULL, NULL, 0) != + DLADM_STATUS_OK)) + return (DLADM_STATUS_BADARG); - if (modify_mask & DLADM_VNIC_MODIFY_ADDR) { - new_attr.vm_mac_addr_type = mac_addr_type; - new_attr.vm_mac_len = mac_len; - bcopy(mac_addr, new_attr.vm_mac_addr, MAXMACADDRLEN); + if ((flags & DLADM_OPT_VLAN) != 0) { + if (class != DATALINK_CLASS_VLAN) + return (DLADM_STATUS_BADARG); + } else { + if (class != DATALINK_CLASS_VNIC && + class != DATALINK_CLASS_ETHERSTUB) + return (DLADM_STATUS_BADARG); } - /* update the properties of the existing VNIC */ - return (i_dladm_vnic_modify_sys(vnic_id, modify_mask, &new_attr)); + if ((flags & DLADM_OPT_ACTIVE) != 0) { + status = i_dladm_vnic_delete_sys(linkid); + if (status == DLADM_STATUS_OK) { + (void) dladm_set_linkprop(linkid, NULL, NULL, 0, + DLADM_OPT_ACTIVE); + (void) dladm_destroy_datalink_id(linkid, + DLADM_OPT_ACTIVE); + } else if (status != DLADM_STATUS_NOTFOUND || + !(flags & DLADM_OPT_PERSIST)) { + return (status); + } + } + if ((flags & DLADM_OPT_PERSIST) != 0) { + (void) dladm_destroy_datalink_id(linkid, DLADM_OPT_PERSIST); + (void) dladm_remove_conf(linkid); + } + return (DLADM_STATUS_OK); } -/* - * Delete a VNIC. - */ -dladm_status_t -dladm_vnic_delete(datalink_id_t vnic_id, uint32_t flags) +static const char * +dladm_vnic_macaddr2str(const uchar_t *mac, char *buf) { - dladm_status_t status; - dladm_vnic_attr_sys_t sys_attr; - int fd; + static char unknown_mac[] = {0, 0, 0, 0, 0, 0}; - /* for now, only temporary deletes are supported */ - if (flags & DLADM_OPT_PERSIST) - return (dladm_errno2status(ENOTSUP)); + if (buf == NULL) + return (NULL); - if ((fd = open(DLD_CONTROL_DEV, O_RDWR)) < 0) - return (dladm_errno2status(errno)); + if (bcmp(unknown_mac, mac, ETHERADDRL) == 0) + (void) strlcpy(buf, "unknown", DLADM_STRSIZE); + else + return (_link_ntoa(mac, buf, ETHERADDRL, IFT_OTHER)); - sys_attr.va_vnic_id = vnic_id; - status = i_dladm_vnic_delete_sys(fd, &sys_attr); - (void) close(fd); + return (buf); +} - if (status != DLADM_STATUS_OK) +static dladm_status_t +dladm_vnic_str2macaddr(const char *str, uchar_t *buf) +{ + int len = 0; + uchar_t *b = _link_aton(str, &len); + + if (b == NULL || len >= MAXMACADDRLEN) + return (DLADM_STATUS_BADARG); + + bcopy(b, buf, len); + free(b); + return (DLADM_STATUS_OK); +} + + +static dladm_status_t +dladm_vnic_persist_conf(const char *name, dladm_vnic_attr_t *attrp, + datalink_class_t class) +{ + dladm_conf_t conf = DLADM_INVALID_CONF; + dladm_status_t status; + char macstr[ETHERADDRL * 3]; + uint64_t u64; + + if ((status = dladm_create_conf(name, attrp->va_vnic_id, + class, DL_ETHER, &conf)) != DLADM_STATUS_OK) return (status); - (void) dladm_destroy_datalink_id(vnic_id, flags); + if (attrp->va_link_id != DATALINK_INVALID_LINKID) { + u64 = attrp->va_link_id; + status = dladm_set_conf_field(conf, FLINKOVER, + DLADM_TYPE_UINT64, &u64); + if (status != DLADM_STATUS_OK) + goto done; + } + + if (class != DATALINK_CLASS_VLAN) { 
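/*
 * Editor's sketch, not part of this patch: a caller tearing a VNIC down
 * both in the kernel and in the persistent store, the flag combination
 * dladm_vnic_delete() above accepts.  The wrapper name is hypothetical.
 *
 *	static dladm_status_t
 *	remove_vnic(datalink_id_t linkid)
 *	{
 *		return (dladm_vnic_delete(linkid,
 *		    DLADM_OPT_ACTIVE | DLADM_OPT_PERSIST));
 *	}
 */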
+		u64 = attrp->va_mac_addr_type;
+		status = dladm_set_conf_field(conf, FMADDRTYPE,
+		    DLADM_TYPE_UINT64, &u64);
+		if (status != DLADM_STATUS_OK)
+			goto done;
+
+		if (attrp->va_mac_len != ETHERADDRL) {
+			u64 = attrp->va_mac_len;
+			status = dladm_set_conf_field(conf, FMADDRLEN,
+			    DLADM_TYPE_UINT64, &u64);
+			if (status != DLADM_STATUS_OK)
+				goto done;
+		}
+	}
+
+	if (attrp->va_hwrings) {
+		boolean_t hwrings = attrp->va_hwrings;
+		status = dladm_set_conf_field(conf, FHWRINGS,
+		    DLADM_TYPE_BOOLEAN, &hwrings);
+		if (status != DLADM_STATUS_OK)
+			goto done;
+	}
+
+	if (class != DATALINK_CLASS_VLAN) {
+		if (attrp->va_mac_slot != -1) {
+			u64 = attrp->va_mac_slot;
+			status = dladm_set_conf_field(conf, FMADDRSLOT,
+			    DLADM_TYPE_UINT64, &u64);
+			if (status != DLADM_STATUS_OK)
+				goto done;
+		}
+
+		if (attrp->va_mac_prefix_len !=
+		    sizeof (dladm_vnic_def_prefix)) {
+			u64 = attrp->va_mac_prefix_len;
+			status = dladm_set_conf_field(conf, FMADDRPREFIXLEN,
+			    DLADM_TYPE_UINT64, &u64);
+			if (status != DLADM_STATUS_OK)
+				goto done;
+		}
+
+		(void) dladm_vnic_macaddr2str(attrp->va_mac_addr, macstr);
+		status = dladm_set_conf_field(conf, FMACADDR, DLADM_TYPE_STR,
+		    macstr);
+		if (status != DLADM_STATUS_OK)
+			goto done;
+	}
+
+	if (attrp->va_vid != 0) {
+		u64 = attrp->va_vid;
+		status = dladm_set_conf_field(conf, FVLANID,
+		    DLADM_TYPE_UINT64, &u64);
+		if (status != DLADM_STATUS_OK)
+			goto done;
+	}
+
+	/*
+	 * Commit the link configuration.
+	 */
+	status = dladm_write_conf(conf);
+
+done:
+	dladm_destroy_conf(conf);
+	return (status);
+}
+
+typedef struct dladm_vnic_up_arg_s {
+	uint32_t flags;
+	dladm_status_t status;
+} dladm_vnic_up_arg_t;
+
+#define	DLADM_VNIC_UP_FIRST_WALK	0x1
+#define	DLADM_VNIC_UP_SECOND_WALK	0x2
+
+static int
+i_dladm_vnic_up(datalink_id_t linkid, void *arg)
+{
+	dladm_status_t *statusp = &(((dladm_vnic_up_arg_t *)arg)->status);
+	dladm_vnic_attr_t attr;
+	dladm_status_t status;
+	dladm_arg_list_t *proplist;
+	uint32_t flags = ((dladm_vnic_up_arg_t *)arg)->flags;
+
+	bzero(&attr, sizeof (attr));
+
+	status = dladm_vnic_info(linkid, &attr, DLADM_OPT_PERSIST);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	/*
+	 * Create the VNICs that requested a hardware ring group in the
+	 * first walk, and the VNICs that did not in the second walk.
+	 */
+	if ((flags == DLADM_VNIC_UP_FIRST_WALK && !attr.va_hwrings) ||
+	    (flags == DLADM_VNIC_UP_SECOND_WALK && attr.va_hwrings))
+		goto done;
+
+	/* Get all properties for this VNIC */
+	status = dladm_link_get_proplist(linkid, &proplist);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	if (proplist != NULL) {
+		status = dladm_link_proplist_extract(proplist,
+		    &attr.va_resource_props);
+	}
+
+	status = i_dladm_vnic_create_sys(&attr);
+	if (status != DLADM_STATUS_OK)
+		goto done;
+
+	if ((status = dladm_up_datalink_id(linkid)) != DLADM_STATUS_OK) {
+		(void) i_dladm_vnic_delete_sys(linkid);
+		goto done;
+	}
+done:
+	*statusp = status;
+	return (DLADM_WALK_CONTINUE);
+}
+
+dladm_status_t
+dladm_vnic_up(datalink_id_t linkid, uint32_t flags)
+{
+	dladm_vnic_up_arg_t vnic_arg;
+	datalink_class_t class;
+
+	class = ((flags & DLADM_OPT_VLAN) != 0) ?
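/*
 * Editor's note, not part of this patch: dladm_vnic_up() walks the
 * persistent configuration twice -- first creating the VNICs that
 * reserved hardware rings, then the rest -- so that ring reservations
 * are honored before best-effort VNICs consume the remaining rings.
 * A boot-time caller brings everything up with:
 *
 *	(void) dladm_vnic_up(DATALINK_ALL_LINKID, 0);
 */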
DATALINK_CLASS_VLAN : + (DATALINK_CLASS_VNIC | DATALINK_CLASS_ETHERSTUB); + + if (linkid == DATALINK_ALL_LINKID) { + vnic_arg.flags = DLADM_VNIC_UP_FIRST_WALK; + (void) dladm_walk_datalink_id(i_dladm_vnic_up, &vnic_arg, + class, DATALINK_ANY_MEDIATYPE, DLADM_OPT_PERSIST); + vnic_arg.flags = DLADM_VNIC_UP_SECOND_WALK; + (void) dladm_walk_datalink_id(i_dladm_vnic_up, &vnic_arg, + class, DATALINK_ANY_MEDIATYPE, DLADM_OPT_PERSIST); + return (DLADM_STATUS_OK); + } else { + (void) i_dladm_vnic_up(linkid, &vnic_arg); + return (vnic_arg.status); + } +} diff --git a/usr/src/lib/libdladm/common/libdlvnic.h b/usr/src/lib/libdladm/common/libdlvnic.h index 79b4b01ba2..77f78130be 100644 --- a/usr/src/lib/libdladm/common/libdlvnic.h +++ b/usr/src/lib/libdladm/common/libdlvnic.h @@ -26,39 +26,43 @@ #ifndef _LIBDLVNIC_H #define _LIBDLVNIC_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <netinet/in.h> #include <libdladm.h> +#include <libdladm_impl.h> +#include <sys/mac_flow.h> #include <sys/vnic.h> #ifdef __cplusplus extern "C" { #endif -typedef struct dladm_vnic_attr_sys { +typedef struct dladm_vnic_attr { datalink_id_t va_vnic_id; datalink_id_t va_link_id; vnic_mac_addr_type_t va_mac_addr_type; - uchar_t va_mac_addr[ETHERADDRL]; uint_t va_mac_len; -} dladm_vnic_attr_sys_t; + uchar_t va_mac_addr[MAXMACADDRLEN]; + int va_mac_slot; + uint_t va_mac_prefix_len; + uint16_t va_vid; + boolean_t va_force; + boolean_t va_hwrings; + mac_resource_props_t va_resource_props; +} dladm_vnic_attr_t; -/* - * Modification flags for dladm_vnic_modify(). - */ -#define DLADM_VNIC_MODIFY_ADDR 0x01 +extern dladm_status_t dladm_vnic_create(const char *, datalink_id_t, + vnic_mac_addr_type_t, uchar_t *, int, int *, + uint_t, uint16_t, datalink_id_t *, + dladm_arg_list_t *, uint32_t); + +extern dladm_status_t dladm_vnic_delete(datalink_id_t, uint32_t); +extern dladm_status_t dladm_vnic_info(datalink_id_t, dladm_vnic_attr_t *, + uint32_t); -extern dladm_status_t dladm_vnic_create(const char *, datalink_id_t, - vnic_mac_addr_type_t, uchar_t *, int, uint_t *, uint32_t); -extern dladm_status_t dladm_vnic_modify(datalink_id_t, uint32_t, - vnic_mac_addr_type_t, uint_t, uchar_t *, uint32_t); -extern dladm_status_t dladm_vnic_delete(datalink_id_t, uint32_t); -extern dladm_status_t dladm_vnic_info(datalink_id_t, dladm_vnic_attr_sys_t *, - uint32_t); -extern dladm_status_t dladm_vnic_str2macaddrtype(const char *, - vnic_mac_addr_type_t *); +extern dladm_status_t dladm_vnic_up(datalink_id_t, uint32_t); +extern dladm_status_t dladm_vnic_str2macaddrtype(const char *, + vnic_mac_addr_type_t *); #ifdef __cplusplus } diff --git a/usr/src/lib/libdladm/common/linkprop.c b/usr/src/lib/libdladm/common/linkprop.c index 8a570c70ef..2d58b585f8 100644 --- a/usr/src/lib/libdladm/common/linkprop.c +++ b/usr/src/lib/libdladm/common/linkprop.c @@ -41,30 +41,34 @@ #include <libdlwlan_impl.h> #include <libdlwlan.h> #include <libdlvlan.h> +#include <libdlvnic.h> +#include <libintl.h> #include <dlfcn.h> #include <link.h> #include <inet/wifi_ioctl.h> #include <libdladm.h> +#include <libdlstat.h> #include <sys/param.h> +#include <sys/debug.h> +#include <sys/dld.h> +#include <sys/mac_flow.h> #include <inttypes.h> #include <sys/ethernet.h> #include <net/wpa.h> #include <sys/sysmacros.h> -#define PERM_READ_ONLY "r-" -#define PERM_READ_WRITE "rw" - /* * The linkprop get() callback. - * - pd: pointer to the struct prop_desc + * - pd: pointer to the prop_desc_t * - propstrp: a property string array to keep the returned property. 
* Caller allocated. * - cntp: number of returned properties. * Caller also uses it to indicate how many it expects. */ struct prop_desc; +typedef struct prop_desc prop_desc_t; -typedef dladm_status_t pd_getf_t(struct prop_desc *pd, +typedef dladm_status_t pd_getf_t(prop_desc_t *pdp, datalink_id_t, char **propstp, uint_t *cntp, datalink_media_t, uint_t, uint_t *); @@ -79,10 +83,9 @@ typedef dladm_status_t pd_getf_t(struct prop_desc *pd, * of ioctl buffers etc. pd_set() may call another common routine (used * by all other pd_sets) which invokes the ioctl. */ -typedef dladm_status_t pd_setf_t(struct prop_desc *, datalink_id_t, - val_desc_t *propval, uint_t cnt, uint_t flags, - datalink_media_t); - +typedef dladm_status_t pd_setf_t(prop_desc_t *, datalink_id_t, + val_desc_t *propval, uint_t cnt, uint_t flags, + datalink_media_t); /* * The linkprop check() callback. @@ -98,9 +101,8 @@ typedef dladm_status_t pd_setf_t(struct prop_desc *, datalink_id_t, * with either a val_desc_t found on the pd_modval list or something * generated on the fly. */ -typedef dladm_status_t pd_checkf_t(struct prop_desc *pd, - datalink_id_t, char **propstrp, - uint_t cnt, val_desc_t *propval, +typedef dladm_status_t pd_checkf_t(prop_desc_t *pdp, datalink_id_t, + char **propstrp, uint_t cnt, val_desc_t *propval, datalink_media_t); typedef struct link_attr_s { @@ -110,39 +112,45 @@ typedef struct link_attr_s { } link_attr_t; static dld_ioc_macprop_t *i_dladm_buf_alloc_by_name(size_t, datalink_id_t, - const char *, uint_t, dladm_status_t *); + const char *, uint_t, dladm_status_t *); static dld_ioc_macprop_t *i_dladm_buf_alloc_by_id(size_t, datalink_id_t, - mac_prop_id_t, uint_t, - dladm_status_t *); + mac_prop_id_t, uint_t, dladm_status_t *); +static dld_ioc_macprop_t *i_dladm_get_public_prop(datalink_id_t, char *, uint_t, + dladm_status_t *, uint_t *); + static dladm_status_t i_dladm_set_prop(datalink_id_t, const char *, char **, uint_t, uint_t); static dladm_status_t i_dladm_get_prop(datalink_id_t, const char *, char **, uint_t *, dladm_prop_type_t, uint_t); static link_attr_t *dladm_name2prop(const char *); static link_attr_t *dladm_id2prop(mac_prop_id_t); -static dld_ioc_macprop_t *i_dladm_get_public_prop(datalink_id_t, char *, uint_t, - dladm_status_t *); + static pd_getf_t do_get_zone, do_get_autopush, do_get_rate_mod, do_get_rate_prop, do_get_channel_prop, do_get_powermode_prop, do_get_radio_prop, i_dladm_duplex_get, i_dladm_status_get, i_dladm_binary_get, i_dladm_uint32_get, - i_dladm_flowctl_get; + i_dladm_flowctl_get, dld_maxbw_get, dld_cpus_get, + dld_priority_get; + static pd_setf_t do_set_zone, do_set_rate_prop, do_set_powermode_prop, do_set_radio_prop, - i_dladm_set_public_prop; + i_dladm_set_public_prop, do_set_res, do_set_cpus; + static pd_checkf_t do_check_zone, do_check_autopush, do_check_rate, - i_dladm_defmtu_check; + i_dladm_defmtu_check, do_check_maxbw, do_check_cpus, + do_check_priority; -static dladm_status_t i_dladm_speed_get(struct prop_desc *, datalink_id_t, - char **, uint_t *, uint_t); +static dladm_status_t i_dladm_speed_get(prop_desc_t *, datalink_id_t, + char **, uint_t *, uint_t, uint_t *); static dladm_status_t i_dladm_wlan_get_legacy_ioctl(datalink_id_t, void *, uint_t, uint_t); static dladm_status_t i_dladm_wlan_set_legacy_ioctl(datalink_id_t, void *, uint_t, uint_t); static dladm_status_t i_dladm_macprop(void *, boolean_t); +static const char *dladm_perm2str(uint_t, char *); -typedef struct prop_desc { +struct prop_desc { /* * link property name */ @@ -202,7 +210,7 @@ typedef struct 
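/*
 * Editor's sketch, not part of this patch: the shape of a check callback.
 * A set request flows check -> set; check parses the user's strings into
 * a val_desc_t, allocating when the descriptor carries PD_CHECK_ALLOC so
 * the framework knows to free it.  The property and function name are
 * hypothetical.
 *
 *	static dladm_status_t
 *	do_check_example(prop_desc_t *pdp, datalink_id_t linkid,
 *	    char **prop_val, uint_t val_cnt, val_desc_t *vdp,
 *	    datalink_media_t media)
 *	{
 *		uint32_t *vp;
 *
 *		if (val_cnt != 1)
 *			return (DLADM_STATUS_BADVALCNT);
 *		if ((vp = malloc(sizeof (uint32_t))) == NULL)
 *			return (DLADM_STATUS_NOMEM);
 *		*vp = (uint32_t)strtoul(prop_val[0], NULL, 10);
 *		vdp->vd_val = (uintptr_t)vp;
 *		return (DLADM_STATUS_OK);
 *	}
 */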
prop_desc { * indicate link media type this property applies to. */ datalink_media_t pd_dmedia; -} prop_desc_t; +}; #define MAC_PROP_BUFSIZE(v) sizeof (dld_ioc_macprop_t) + (v) - 1 @@ -303,7 +311,14 @@ static link_attr_t link_attr[] = { { MAC_PROP_WL_MLME, sizeof (wl_mlme_t), "mlme"}, + { MAC_PROP_MAXBW, sizeof (mac_resource_props_t), "maxbw"}, + + { MAC_PROP_PRIO, sizeof (mac_resource_props_t), "priority"}, + + { MAC_PROP_BIND_CPU, sizeof (mac_resource_props_t), "cpus"}, + { MAC_PROP_PRIVATE, 0, "driver-private"} + }; static val_desc_t link_duplex_vals[] = { @@ -324,8 +339,11 @@ static val_desc_t link_flow_vals[] = { { "rx", LINK_FLOWCTRL_RX }, { "bi", LINK_FLOWCTRL_BI } }; - -#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t)) +static val_desc_t link_priority_vals[] = { + { "low", MPL_LOW }, + { "medium", MPL_MEDIUM }, + { "high", MPL_HIGH } +}; static val_desc_t dladm_wlan_radio_vals[] = { { "on", DLADM_WLAN_RADIO_ON }, @@ -338,8 +356,10 @@ static val_desc_t dladm_wlan_powermode_vals[] = { { "max", DLADM_WLAN_PM_MAX } }; -static prop_desc_t prop_table[] = { +#define VALCNT(vals) (sizeof ((vals)) / sizeof (val_desc_t)) +#define RESET_VAL ((uintptr_t)-1) +static prop_desc_t prop_table[] = { { "channel", { NULL, 0 }, NULL, 0, NULL, NULL, do_get_channel_prop, NULL, 0, @@ -372,12 +392,12 @@ static prop_desc_t prop_table[] = { do_get_zone, do_check_zone, PD_TEMPONLY|PD_CHECK_ALLOC, DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, - { "duplex", { "", 0 }, + { "duplex", { "", 0 }, link_duplex_vals, VALCNT(link_duplex_vals), NULL, NULL, i_dladm_duplex_get, NULL, 0, DATALINK_CLASS_PHYS, DL_ETHER }, - { "state", { "up", LINK_STATE_UP }, + { "state", { "up", LINK_STATE_UP }, link_status_vals, VALCNT(link_status_vals), NULL, NULL, i_dladm_status_get, NULL, 0, DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, @@ -455,12 +475,34 @@ static prop_desc_t prop_table[] = { { "en_10hdx_cap", { "", 0 }, link_01_vals, VALCNT(link_01_vals), i_dladm_set_public_prop, NULL, i_dladm_binary_get, NULL, - 0, DATALINK_CLASS_PHYS, DL_ETHER } + 0, DATALINK_CLASS_PHYS, DL_ETHER }, + + { "maxbw", { "--", RESET_VAL }, NULL, 0, + do_set_res, NULL, + dld_maxbw_get, do_check_maxbw, PD_CHECK_ALLOC, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, + { "cpus", { "--", RESET_VAL }, NULL, 0, + do_set_cpus, NULL, + dld_cpus_get, do_check_cpus, 0, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, + + { "priority", { "high", RESET_VAL }, + link_priority_vals, VALCNT(link_priority_vals), do_set_res, NULL, + dld_priority_get, do_check_priority, PD_CHECK_ALLOC, + DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE }, }; #define DLADM_MAX_PROPS (sizeof (prop_table) / sizeof (prop_desc_t)) +static resource_prop_t rsrc_prop_table[] = { + {"maxbw", do_extract_maxbw}, + {"priority", do_extract_priority}, + {"cpus", do_extract_cpus} +}; +#define DLADM_MAX_RSRC_PROP (sizeof (rsrc_prop_table) / \ + sizeof (resource_prop_t)) + /* * when retrieving private properties, we pass down a buffer with * DLADM_PROP_BUF_CHUNK of space for the driver to return the property value. @@ -477,6 +519,9 @@ static dladm_status_t i_dladm_set_linkprop(datalink_id_t, const char *, char **, uint_t, uint_t); static dladm_status_t i_dladm_getset_defval(prop_desc_t *, datalink_id_t, datalink_media_t, uint_t); + +static dladm_status_t link_proplist_check(dladm_arg_list_t *); + /* * Unfortunately, MAX_SCAN_SUPPORT_RATES is too small to allow all * rates to be retrieved. 
However, we cannot increase it at this @@ -539,17 +584,13 @@ i_dladm_set_single_prop(datalink_id_t linkid, datalink_class_t class, if (pdp->pd_set == NULL) return (DLADM_STATUS_PROPRDONLY); - if (pdp->pd_flags & PD_CHECK_ALLOC) - needfree = B_TRUE; - else - needfree = B_FALSE; if (prop_val != NULL) { vdp = malloc(sizeof (val_desc_t) * val_cnt); if (vdp == NULL) return (DLADM_STATUS_NOMEM); - if (pdp->pd_check != NULL) { + needfree = ((pdp->pd_flags & PD_CHECK_ALLOC) != 0); status = pdp->pd_check(pdp, linkid, prop_val, val_cnt, vdp, media); } else if (pdp->pd_optval != NULL) { @@ -563,23 +604,25 @@ i_dladm_set_single_prop(datalink_id_t linkid, datalink_class_t class, cnt = val_cnt; } else { + boolean_t defval = B_FALSE; + if (pdp->pd_defval.vd_name == NULL) return (DLADM_STATUS_NOTSUP); cnt = 1; - if ((pdp->pd_flags & PD_CHECK_ALLOC) != 0 || - strlen(pdp->pd_defval.vd_name) > 0) { + defval = (strlen(pdp->pd_defval.vd_name) > 0); + if ((pdp->pd_flags & PD_CHECK_ALLOC) != 0 || defval) { if ((vdp = malloc(sizeof (val_desc_t))) == NULL) return (DLADM_STATUS_NOMEM); - if (pdp->pd_check != NULL) { + if (defval) { + (void) memcpy(vdp, &pdp->pd_defval, + sizeof (val_desc_t)); + } else if (pdp->pd_check != NULL) { status = pdp->pd_check(pdp, linkid, prop_val, cnt, vdp, media); if (status != DLADM_STATUS_OK) goto done; - } else { - (void) memcpy(vdp, &pdp->pd_defval, - sizeof (val_desc_t)); } } else { status = i_dladm_getset_defval(pdp, linkid, @@ -618,7 +661,6 @@ i_dladm_set_linkprop(datalink_id_t linkid, const char *prop_name, if (prop_name != NULL && (strcasecmp(prop_name, pdp->pd_name) != 0)) continue; - found = B_TRUE; s = i_dladm_set_single_prop(linkid, class, media, pdp, prop_val, val_cnt, flags); @@ -774,16 +816,8 @@ dladm_get_linkprop(datalink_id_t linkid, dladm_prop_type_t type, } *prop_val[0] = '\0'; - switch (perm_flags) { - case MAC_PROP_PERM_READ: - (void) strncpy(*prop_val, PERM_READ_ONLY, - DLADM_PROP_VAL_MAX); - break; - case MAC_PROP_PERM_RW: - (void) strncpy(*prop_val, PERM_READ_WRITE, - DLADM_PROP_VAL_MAX); - break; - } + if (status == DLADM_STATUS_OK) + (void) dladm_perm2str(perm_flags, *prop_val); break; case DLADM_PROP_VAL_DEFAULT: @@ -879,7 +913,16 @@ done: static int i_dladm_init_linkprop(datalink_id_t linkid, void *arg) { - (void) dladm_init_linkprop(linkid, B_TRUE); + datalink_class_t class; + dladm_status_t status; + + status = dladm_datalink_id2info(linkid, NULL, &class, NULL, NULL, 0); + if (status != DLADM_STATUS_OK) + return (DLADM_WALK_TERMINATE); + + if ((class & (DATALINK_CLASS_VNIC | DATALINK_CLASS_VLAN)) == 0) + (void) dladm_init_linkprop(linkid, B_TRUE); + return (DLADM_WALK_CONTINUE); } @@ -904,24 +947,24 @@ dladm_init_linkprop(datalink_id_t linkid, boolean_t any_media) /* ARGSUSED */ static dladm_status_t -do_get_zone(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_zone(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { - char zone_name[ZONENAME_MAX]; - zoneid_t zid; - dladm_status_t status; - char *cp; + char zone_name[ZONENAME_MAX]; + zoneid_t zid; + dladm_status_t status; + char *cp; dld_ioc_macprop_t *dip; if (flags != 0) return (DLADM_STATUS_NOTSUP); - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (status != DLADM_STATUS_OK) return (status); - *perm_flags = 
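/*
 * Editor's note, not part of this patch: passing a NULL value array to
 * dladm_set_linkprop() drives the default path above.  For the new
 * resource properties the default descriptor carries RESET_VAL, which
 * do_set_res() and do_set_cpus() translate into a reset of the kernel
 * state.  For example, to clear a bandwidth cap:
 *
 *	(void) dladm_set_linkprop(linkid, "maxbw", NULL, 0,
 *	    DLADM_OPT_ACTIVE);
 */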
dip->pr_perm_flags; cp = dip->pr_val; (void) memcpy(&zid, cp, sizeof (zid)); free(dip); @@ -929,14 +972,12 @@ do_get_zone(struct prop_desc *pd, datalink_id_t linkid, *val_cnt = 1; if (zid != GLOBAL_ZONEID) { if (getzonenamebyid(zid, zone_name, sizeof (zone_name)) < 0) { - *perm_flags = 0; return (dladm_errno2status(errno)); } (void) strncpy(*prop_val, zone_name, DLADM_PROP_VAL_MAX); } else { *prop_val[0] = '\0'; - *perm_flags = 0; } return (DLADM_STATUS_OK); @@ -1011,13 +1052,13 @@ cleanup: /* ARGSUSED */ static dladm_status_t -do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp, +do_set_zone(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media) { - dladm_status_t status = DLADM_STATUS_OK; - zoneid_t zid_old, zid_new; - char link[MAXLINKNAMELEN]; - char *cp; + dladm_status_t status = DLADM_STATUS_OK; + zoneid_t zid_old, zid_new; + char link[MAXLINKNAMELEN]; + char *cp; dld_ioc_macprop_t *dip; dld_ioc_zid_t *dzp; @@ -1026,25 +1067,14 @@ do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp, dzp = (dld_ioc_zid_t *)vdp->vd_val; - /* - * If diz_is_ppa_hack is set, then an implicit vlan must be created. - * There is no old value to compare against, and vdp->vd_val is - * already populated with the zoneid and linkname in the function - * do_check_zone(). - */ - - if (dzp->diz_is_ppa_hack) { - zid_old = GLOBAL_ZONEID; - } else { - dip = i_dladm_get_public_prop(linkid, pd->pd_name, - flags, &status); - if (status != DLADM_STATUS_OK) - return (status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, NULL); + if (status != DLADM_STATUS_OK) + return (status); - cp = dip->pr_val; - (void) memcpy(&zid_old, cp, sizeof (zid_old)); - free(dip); - } + cp = dip->pr_val; + (void) memcpy(&zid_old, cp, sizeof (zid_old)); + free(dip); zid_new = dzp->diz_zid; (void) strlcpy(link, dzp->diz_link, MAXLINKNAMELEN); @@ -1066,7 +1096,7 @@ do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp, * link and prevent a link renaming, so we need to do it * before other operations. 
*/ - status = i_dladm_set_public_prop(pd, linkid, vdp, val_cnt, + status = i_dladm_set_public_prop(pdp, linkid, vdp, val_cnt, flags, media); if (status != DLADM_STATUS_OK) return (status); @@ -1092,16 +1122,9 @@ do_set_zone(prop_desc_t *pd, datalink_id_t linkid, val_desc_t *vdp, goto rollback2; } - if (dzp->diz_is_ppa_hack) { - if ((status = dladm_name2info(link, &linkid, NULL, NULL, - NULL)) != DLADM_STATUS_OK) { - return (status); - } - } - (void) i_dladm_update_deventry(zid_new, linkid, B_TRUE); } else { - status = i_dladm_set_public_prop(pd, linkid, vdp, val_cnt, + status = i_dladm_set_public_prop(pdp, linkid, vdp, val_cnt, flags, media); if (status != DLADM_STATUS_OK) goto rollback2; @@ -1117,7 +1140,7 @@ rollback2: rollback1: if (zid_new != GLOBAL_ZONEID) { dzp->diz_zid = zid_old; - (void) i_dladm_set_public_prop(pd, linkid, vdp, val_cnt, + (void) i_dladm_set_public_prop(pdp, linkid, vdp, val_cnt, flags, media); } @@ -1126,15 +1149,13 @@ rollback1: /* ARGSUSED */ static dladm_status_t -do_check_zone(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, +do_check_zone(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) { char *zone_name; char linkname[MAXLINKNAMELEN]; zoneid_t zoneid; - char *cp; dladm_status_t status = DLADM_STATUS_OK; - boolean_t is_ppa_hack = B_FALSE; dld_ioc_zid_t *dzp; if (val_cnt != 1) @@ -1144,32 +1165,12 @@ do_check_zone(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, if (dzp == NULL) return (DLADM_STATUS_NOMEM); - if (prop_val) { - /* - * The prop_val contains zone_name{:linkname}. The linkname is - * present only when the link is a ppa-hacked vlan. - */ - cp = strchr(*prop_val, ':'); - if (cp) { - (void) strlcpy(linkname, cp + 1, MAXLINKNAMELEN); - *cp = '\0'; - is_ppa_hack = B_TRUE; - } else { - status = dladm_datalink_id2info(linkid, NULL, NULL, - NULL, linkname, MAXLINKNAMELEN); - if (status != DLADM_STATUS_OK) { - goto done; - } - } - zone_name = *prop_val; - } else { - zone_name = GLOBAL_ZONENAME; - if ((status = dladm_datalink_id2info(linkid, NULL, NULL, NULL, - linkname, MAXLINKNAMELEN)) != DLADM_STATUS_OK) { - goto done; - } + if ((status = dladm_datalink_id2info(linkid, NULL, NULL, NULL, + linkname, MAXLINKNAMELEN)) != DLADM_STATUS_OK) { + goto done; } + zone_name = (prop_val != NULL) ? 
*prop_val : GLOBAL_ZONENAME; if (strlen(linkname) > MAXLINKNAMELEN) { status = DLADM_STATUS_BADVAL; goto done; @@ -1199,7 +1200,6 @@ do_check_zone(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, dzp->diz_zid = zoneid; (void) strlcpy(dzp->diz_link, linkname, MAXLINKNAMELEN); - dzp->diz_is_ppa_hack = is_ppa_hack; vdp->vd_val = (uintptr_t)dzp; return (DLADM_STATUS_OK); @@ -1210,9 +1210,359 @@ done: /* ARGSUSED */ static dladm_status_t -do_get_autopush(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +dld_maxbw_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) +{ + dld_ioc_macprop_t *dip; + mac_resource_props_t mrp; + dladm_status_t status; + + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); + if (dip == NULL) + return (status); + + bcopy(dip->pr_val, &mrp, sizeof (mac_resource_props_t)); + free(dip); + + if ((mrp.mrp_mask & MRP_MAXBW) == 0) { + (*prop_val)[0] = '\0'; + } else { + (void) dladm_bw2str(mrp.mrp_maxbw, prop_val[0]); + } + *val_cnt = 1; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_check_maxbw(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, + uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) +{ + uint64_t *maxbw; + dladm_status_t status = DLADM_STATUS_OK; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + maxbw = malloc(sizeof (uint64_t)); + if (maxbw == NULL) + return (DLADM_STATUS_NOMEM); + + status = dladm_str2bw(*prop_val, maxbw); + if (status != DLADM_STATUS_OK) { + free(maxbw); + return (status); + } + + if ((*maxbw < MRP_MAXBW_MINVAL) && (*maxbw != 0)) { + free(maxbw); + return (DLADM_STATUS_MINMAXBW); + } + + vdp->vd_val = (uintptr_t)maxbw; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +dladm_status_t +do_extract_maxbw(val_desc_t *vdp, void *arg, uint_t cnt) +{ + mac_resource_props_t *mrp = (mac_resource_props_t *)arg; + + bcopy((char *)vdp->vd_val, &mrp->mrp_maxbw, sizeof (uint64_t)); + mrp->mrp_mask |= MRP_MAXBW; + + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +dld_cpus_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) +{ + dld_ioc_macprop_t *dip; + mac_resource_props_t mrp; + int i; + uint32_t ncpus; + uchar_t *cp; + dladm_status_t status; + + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); + if (dip == NULL) + return (status); + + cp = (uchar_t *)dip->pr_val; + (void) memcpy(&mrp, cp, sizeof (mac_resource_props_t)); + free(dip); + + ncpus = mrp.mrp_ncpus; + + if (ncpus > *val_cnt) + return (DLADM_STATUS_TOOSMALL); + + if (ncpus == 0) { + (*prop_val)[0] = '\0'; + *val_cnt = 1; + return (DLADM_STATUS_OK); + } + + *val_cnt = ncpus; + for (i = 0; i < ncpus; i++) { + (void) snprintf(prop_val[i], DLADM_PROP_VAL_MAX, + "%u", mrp.mrp_cpu[i]); + } + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_set_res(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, + uint_t val_cnt, uint_t flags, datalink_media_t media) +{ + mac_resource_props_t mrp; + dladm_status_t status = DLADM_STATUS_OK; + dld_ioc_macprop_t *dip; + + bzero(&mrp, sizeof (mac_resource_props_t)); + dip = i_dladm_buf_alloc_by_name(0, linkid, pdp->pd_name, + flags, &status); + + if (dip == NULL) + return (status); + + if (vdp->vd_val == RESET_VAL) { + switch 
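/*
 * Editor's sketch, not part of this patch: setting the new "maxbw"
 * property, which is vetted by do_check_maxbw() above (via
 * dladm_str2bw()) and applied by do_set_res() below.  The value string
 * is illustrative and assumes dladm_str2bw() accepts a magnitude
 * suffix.
 *
 *	char *bw = "100m";
 *
 *	(void) dladm_set_linkprop(linkid, "maxbw", &bw, 1,
 *	    DLADM_OPT_ACTIVE);
 */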
(dip->pr_num) {
+		case MAC_PROP_MAXBW:
+			mrp.mrp_maxbw = MRP_MAXBW_RESETVAL;
+			mrp.mrp_mask = MRP_MAXBW;
+			break;
+		case MAC_PROP_PRIO:
+			mrp.mrp_priority = MPL_RESET;
+			mrp.mrp_mask = MRP_PRIORITY;
+			break;
+		default:
+			free(dip);
+			return (DLADM_STATUS_BADARG);
+		}
+	} else {
+		switch (dip->pr_num) {
+		case MAC_PROP_MAXBW:
+			bcopy((void *)vdp->vd_val, &mrp.mrp_maxbw,
+			    sizeof (uint64_t));
+			mrp.mrp_mask = MRP_MAXBW;
+			break;
+		case MAC_PROP_PRIO:
+			bcopy((void *)vdp->vd_val, &mrp.mrp_priority,
+			    sizeof (mac_priority_level_t));
+			mrp.mrp_mask = MRP_PRIORITY;
+			break;
+		default:
+			free(dip);
+			return (DLADM_STATUS_BADARG);
+		}
+	}
+
+	(void) memcpy(dip->pr_val, &mrp, dip->pr_valsize);
+	status = i_dladm_macprop(dip, B_TRUE);
+	free(dip);
+	return (status);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_set_cpus(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp,
+    uint_t val_cnt, uint_t flags, datalink_media_t media)
+{
+	mac_resource_props_t mrp;
+	dladm_status_t status;
+	dld_ioc_macprop_t *dip;
+	datalink_class_t class;
+
+	/*
+	 * CPU bindings can be set on VNICs and regular physical links.
+	 * However, VNICs fail the dladm_phys_info() test, so apply the
+	 * phys_info test only on physical links.
+	 */
+	if ((status = dladm_datalink_id2info(linkid, NULL, &class,
+	    NULL, NULL, 0)) != DLADM_STATUS_OK) {
+		return (status);
+	}
+
+	/*
+	 * We set intr_cpu to -1. The interrupt will be retargeted,
+	 * if possible, when the setup is complete in the MAC layer.
+	 */
+	bzero(&mrp, sizeof (mac_resource_props_t));
+	mrp.mrp_mask = MRP_CPUS;
+	if (vdp != NULL && vdp->vd_val != RESET_VAL) {
+		mac_resource_props_t *vmrp;
+
+		vmrp = (mac_resource_props_t *)vdp->vd_val;
+		if (vmrp->mrp_ncpus > 0) {
+			bcopy(vmrp, &mrp, sizeof (mac_resource_props_t));
+			mrp.mrp_mask = MRP_CPUS;
+		}
+		mrp.mrp_mask |= MRP_CPUS_USERSPEC;
+		mrp.mrp_fanout_mode = MCM_CPUS;
+		mrp.mrp_intr_cpu = -1;
+	}
+
+	dip = i_dladm_buf_alloc_by_name(0, linkid, pdp->pd_name,
+	    flags, &status);
+	if (dip == NULL)
+		return (status);
+
+	(void) memcpy(dip->pr_val, &mrp, dip->pr_valsize);
+	status = i_dladm_macprop(dip, B_TRUE);
+	free(dip);
+	return (status);
+}
+
+/* ARGSUSED */
+static dladm_status_t
+do_check_cpus(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val,
+    uint_t val_cnt, val_desc_t *vdp, datalink_media_t media)
+{
+	uint32_t cpuid;
+	int i, j, rc;
+	long nproc = sysconf(_SC_NPROCESSORS_CONF);
+	mac_resource_props_t *mrp;
+
+	mrp = malloc(sizeof (mac_resource_props_t));
+	if (mrp == NULL)
+		return (DLADM_STATUS_NOMEM);
+
+	for (i = 0; i < val_cnt; i++) {
+		errno = 0;
+		cpuid = strtol(prop_val[i], (char **)NULL, 10);
+		if (errno != 0 || cpuid >= nproc) {
+			free(mrp);
+			return (DLADM_STATUS_CPUMAX);
+		}
+		rc = p_online(cpuid, P_STATUS);
+		if (rc < 1) {
+			free(mrp);
+			return (DLADM_STATUS_CPUERR);
+		}
+		if (rc != P_ONLINE) {
+			free(mrp);
+			return (DLADM_STATUS_CPUNOTONLINE);
+		}
+		mrp->mrp_cpu[i] = cpuid;
+	}
+	mrp->mrp_ncpus = (uint32_t)val_cnt;
+
+	/* Check for duplicates */
+	for (i = 0; i < val_cnt; i++) {
+		for (j = 0; j < val_cnt; j++) {
+			if (i != j && mrp->mrp_cpu[i] == mrp->mrp_cpu[j]) {
+				free(mrp);
+				return (DLADM_STATUS_BADARG);
+			}
+		}
+	}
+	vdp->vd_val = (uintptr_t)mrp;
+
+	return (DLADM_STATUS_OK);
+}
+
+/* ARGSUSED */
+dladm_status_t
+do_extract_cpus(val_desc_t *vdp, void *arg, uint_t cnt)
+{
+	mac_resource_props_t *mrp = (mac_resource_props_t *)arg;
+	mac_resource_props_t *vmrp = (mac_resource_props_t *)vdp->vd_val;
+	int i;
+
+	for (i = 0; i < vmrp->mrp_ncpus; i++) {
+		mrp->mrp_cpu[i] = vmrp->mrp_cpu[i];
+	}
+
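/*
 * Editor's sketch, not part of this patch: the "cpus" property takes one
 * processor id per value; do_check_cpus() above verifies each against
 * p_online() and rejects duplicates.  The CPU numbers are illustrative.
 *
 *	char *cpus[] = { "1", "2" };
 *
 *	(void) dladm_set_linkprop(linkid, "cpus", cpus, 2,
 *	    DLADM_OPT_ACTIVE);
 */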
mrp->mrp_ncpus = vmrp->mrp_ncpus; + mrp->mrp_mask |= (MRP_CPUS|MRP_CPUS_USERSPEC); + mrp->mrp_fanout_mode = MCM_CPUS; + + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +dld_priority_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) +{ + dld_ioc_macprop_t *dip; + mac_resource_props_t mrp; + mac_priority_level_t pri; + dladm_status_t status; + + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); + if (dip == NULL) + return (status); + + bcopy(dip->pr_val, &mrp, sizeof (mac_resource_props_t)); + free(dip); + + pri = ((mrp.mrp_mask & MRP_PRIORITY) == 0) ? MPL_HIGH : + mrp.mrp_priority; + + (void) dladm_pri2str(pri, prop_val[0]); + *val_cnt = 1; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_check_priority(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, + uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) +{ + mac_priority_level_t *pri; + dladm_status_t status = DLADM_STATUS_OK; + + if (val_cnt != 1) + return (DLADM_STATUS_BADVALCNT); + + pri = malloc(sizeof (mac_priority_level_t)); + if (pri == NULL) + return (DLADM_STATUS_NOMEM); + + status = dladm_str2pri(*prop_val, pri); + if (status != DLADM_STATUS_OK) { + free(pri); + return (status); + } + + if (*pri < MPL_LOW || *pri > MPL_HIGH) { + free(pri); + return (DLADM_STATUS_BADVAL); + } + + vdp->vd_val = (uintptr_t)pri; + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +dladm_status_t +do_extract_priority(val_desc_t *vdp, void *arg, uint_t cnt) +{ + mac_resource_props_t *mrp = (mac_resource_props_t *)arg; + + bcopy((char *)vdp->vd_val, &mrp->mrp_priority, + sizeof (mac_priority_level_t)); + mrp->mrp_mask |= MRP_PRIORITY; + + return (DLADM_STATUS_OK); +} + +/* ARGSUSED */ +static dladm_status_t +do_get_autopush(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { struct dlautopush dlap; int i, len; @@ -1223,10 +1573,11 @@ do_get_autopush(struct prop_desc *pd, datalink_id_t linkid, return (DLADM_STATUS_NOTDEFINED); *val_cnt = 1; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (dip == NULL) { (*prop_val)[0] = '\0'; - goto done; + return (DLADM_STATUS_OK); } (void) memcpy(&dlap, dip->pr_val, sizeof (dlap)); @@ -1246,8 +1597,6 @@ do_get_autopush(struct prop_desc *pd, datalink_id_t linkid, len += (strlen(AP_ANCHOR) + 1); } } - - *perm_flags = dip->pr_perm_flags; free(dip); done: return (DLADM_STATUS_OK); @@ -1292,7 +1641,7 @@ i_dladm_add_ap_module(const char *module, struct dlautopush *dlap) */ /* ARGSUSED */ static dladm_status_t -do_check_autopush(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, +do_check_autopush(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) { char *module; @@ -1331,8 +1680,8 @@ do_check_autopush(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, /* ARGSUSED */ static dladm_status_t -do_get_rate_common(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, uint_t id) +do_get_rate_common(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, uint_t id, uint_t *perm_flags) { wl_rates_t *wrp; uint_t i; @@ -1363,6 +1712,7 @@ do_get_rate_common(struct prop_desc *pd, datalink_id_t linkid, (float)wrp->wl_rates_rates[i] / 2); } 
*val_cnt = wrp->wl_rates_num; + *perm_flags = MAC_PROP_PERM_RW; done: free(wrp); @@ -1370,29 +1720,25 @@ done: } static dladm_status_t -do_get_rate_prop(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_rate_prop(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { if (media != DL_WIFI) { - *perm_flags = MAC_PROP_PERM_READ; - return (i_dladm_speed_get(pd, linkid, prop_val, - val_cnt, flags)); + return (i_dladm_speed_get(pdp, linkid, prop_val, + val_cnt, flags, perm_flags)); } - *perm_flags = MAC_PROP_PERM_RW; - return (do_get_rate_common(pd, linkid, prop_val, val_cnt, - MAC_PROP_WL_DESIRED_RATES)); + return (do_get_rate_common(pdp, linkid, prop_val, val_cnt, + MAC_PROP_WL_DESIRED_RATES, perm_flags)); } /* ARGSUSED */ static dladm_status_t -do_get_rate_mod(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_rate_mod(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { - *perm_flags = MAC_PROP_PERM_READ; - switch (media) { case DL_ETHER: /* @@ -1402,8 +1748,8 @@ do_get_rate_mod(struct prop_desc *pd, datalink_id_t linkid, return (DLADM_STATUS_NOTSUP); case DL_WIFI: - return (do_get_rate_common(pd, linkid, prop_val, val_cnt, - MAC_PROP_WL_SUPPORTED_RATES)); + return (do_get_rate_common(pdp, linkid, prop_val, val_cnt, + MAC_PROP_WL_SUPPORTED_RATES, perm_flags)); default: return (DLADM_STATUS_BADARG); } @@ -1437,7 +1783,7 @@ do_set_rate(datalink_id_t linkid, dladm_wlan_rates_t *rates) /* ARGSUSED */ static dladm_status_t -do_set_rate_prop(prop_desc_t *pd, datalink_id_t linkid, +do_set_rate_prop(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media) { dladm_wlan_rates_t rates; @@ -1463,7 +1809,7 @@ done: /* ARGSUSED */ static dladm_status_t -do_check_rate(struct prop_desc *pd, datalink_id_t linkid, char **prop_val, +do_check_rate(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, uint_t val_cnt, val_desc_t *vdp, datalink_media_t media) { int i; @@ -1517,16 +1863,15 @@ do_get_phyconf(datalink_id_t linkid, void *buf, int buflen) /* ARGSUSED */ static dladm_status_t -do_get_channel_prop(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_channel_prop(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { uint32_t channel; char buf[WLDP_BUFSIZE]; dladm_status_t status = DLADM_STATUS_OK; wl_phy_conf_t wl_phy_conf; - *perm_flags = MAC_PROP_PERM_READ; if ((status = do_get_phyconf(linkid, buf, sizeof (buf))) != DLADM_STATUS_OK) goto done; @@ -1539,7 +1884,7 @@ do_get_channel_prop(struct prop_desc *pd, datalink_id_t linkid, (void) snprintf(*prop_val, DLADM_STRSIZE, "%u", channel); *val_cnt = 1; - + *perm_flags = MAC_PROP_PERM_READ; done: return (status); } @@ -1553,9 +1898,9 @@ do_get_powermode(datalink_id_t linkid, void *buf, int buflen) /* ARGSUSED */ static dladm_status_t -do_get_powermode_prop(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_powermode_prop(prop_desc_t *pdp, datalink_id_t linkid, + char 
**prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { wl_ps_mode_t mode; const char *s; @@ -1583,12 +1928,8 @@ do_get_powermode_prop(struct prop_desc *pd, datalink_id_t linkid, } (void) snprintf(*prop_val, DLADM_STRSIZE, "%s", s); *val_cnt = 1; - + *perm_flags = MAC_PROP_PERM_RW; done: - if (status == DLADM_STATUS_OK) - *perm_flags = MAC_PROP_PERM_RW; - else - *perm_flags = 0; return (status); } @@ -1618,7 +1959,7 @@ do_set_powermode(datalink_id_t linkid, dladm_wlan_powermode_t *pm) /* ARGSUSED */ static dladm_status_t -do_set_powermode_prop(prop_desc_t *pd, datalink_id_t linkid, +do_set_powermode_prop(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media) { dladm_wlan_powermode_t powermode = (dladm_wlan_powermode_t)vdp->vd_val; @@ -1641,9 +1982,9 @@ do_get_radio(datalink_id_t linkid, void *buf, int buflen) /* ARGSUSED */ static dladm_status_t -do_get_radio_prop(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +do_get_radio_prop(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { wl_radio_t radio; const char *s; @@ -1668,12 +2009,8 @@ do_get_radio_prop(struct prop_desc *pd, datalink_id_t linkid, } (void) snprintf(*prop_val, DLADM_STRSIZE, "%s", s); *val_cnt = 1; - + *perm_flags = MAC_PROP_PERM_RW; done: - if (status == DLADM_STATUS_OK) - *perm_flags = MAC_PROP_PERM_RW; - else - *perm_flags = 0; return (status); } @@ -1698,7 +2035,7 @@ do_set_radio(datalink_id_t linkid, dladm_wlan_radio_t *radio) /* ARGSUSED */ static dladm_status_t -do_set_radio_prop(prop_desc_t *pd, datalink_id_t linkid, +do_set_radio_prop(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t fags, datalink_media_t media) { dladm_wlan_radio_t radio = (dladm_wlan_radio_t)vdp->vd_val; @@ -1860,7 +2197,7 @@ i_dladm_buf_alloc_by_id(size_t valsize, datalink_id_t linkid, /* ARGSUSED */ static dladm_status_t -i_dladm_set_public_prop(prop_desc_t *pd, datalink_id_t linkid, +i_dladm_set_public_prop(prop_desc_t *pdp, datalink_id_t linkid, val_desc_t *vdp, uint_t val_cnt, uint_t flags, datalink_media_t media) { dld_ioc_macprop_t *dip; @@ -1870,11 +2207,11 @@ i_dladm_set_public_prop(prop_desc_t *pd, datalink_id_t linkid, uint32_t u32; void *val; - dip = i_dladm_buf_alloc_by_name(0, linkid, pd->pd_name, 0, &status); + dip = i_dladm_buf_alloc_by_name(0, linkid, pdp->pd_name, 0, &status); if (dip == NULL) return (status); - if (pd->pd_flags & PD_CHECK_ALLOC) + if (pdp->pd_flags & PD_CHECK_ALLOC) val = (void *)vdp->vd_val; else { /* @@ -1931,7 +2268,7 @@ i_dladm_macprop(void *dip, boolean_t set) static dld_ioc_macprop_t * i_dladm_get_public_prop(datalink_id_t linkid, char *prop_name, uint_t flags, - dladm_status_t *status) + dladm_status_t *status, uint_t *perm_flags) { dld_ioc_macprop_t *dip = NULL; @@ -1944,12 +2281,15 @@ i_dladm_get_public_prop(datalink_id_t linkid, char *prop_name, uint_t flags, free(dip); return (NULL); } + if (perm_flags != NULL) + *perm_flags = dip->pr_perm_flags; + return (dip); } /* ARGSUSED */ static dladm_status_t -i_dladm_defmtu_check(struct prop_desc *pd, datalink_id_t linkid, +i_dladm_defmtu_check(prop_desc_t *pdp, datalink_id_t linkid, char **prop_val, uint_t val_cnt, val_desc_t *v, datalink_media_t media) { if (val_cnt != 1) @@ -1960,9 +2300,9 @@ i_dladm_defmtu_check(struct prop_desc *pd, datalink_id_t linkid, /* 
ARGSUSED */ static dladm_status_t -i_dladm_duplex_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_duplex_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { link_duplex_t link_duplex; dladm_status_t status; @@ -1988,8 +2328,8 @@ i_dladm_duplex_get(struct prop_desc *pd, datalink_id_t linkid, /* ARGSUSED */ static dladm_status_t -i_dladm_speed_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, uint_t flags) +i_dladm_speed_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, uint_t flags, uint_t *perm_flags) { uint64_t ifspeed = 0; dladm_status_t status; @@ -2006,23 +2346,26 @@ i_dladm_speed_get(struct prop_desc *pd, datalink_id_t linkid, "%llu", ifspeed / 1000000); /* Mbps */ } *val_cnt = 1; + *perm_flags = MAC_PROP_PERM_READ; return (DLADM_STATUS_OK); } /* ARGSUSED */ static dladm_status_t -i_dladm_status_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_status_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { - link_state_t link_state; - dladm_status_t status; - uchar_t *cp; - dld_ioc_macprop_t *dip; + link_state_t link_state; + dladm_status_t status; + uchar_t *cp; + dld_ioc_macprop_t *dip; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (status != DLADM_STATUS_OK) return (status); + cp = (uchar_t *)dip->pr_val; (void) memcpy(&link_state, cp, sizeof (link_state)); @@ -2038,25 +2381,25 @@ i_dladm_status_get(struct prop_desc *pd, datalink_id_t linkid, break; } *val_cnt = 1; - *perm_flags = dip->pr_perm_flags; free(dip); return (DLADM_STATUS_OK); } /* ARGSUSED */ static dladm_status_t -i_dladm_binary_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_binary_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { dld_ioc_macprop_t *dip; dladm_status_t status; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (dip == NULL) return (status); + (void) snprintf(*prop_val, DLADM_PROP_VAL_MAX, "%x", dip->pr_val[0]); - *perm_flags = dip->pr_perm_flags; free(dip); *val_cnt = 1; return (DLADM_STATUS_OK); @@ -2064,22 +2407,23 @@ i_dladm_binary_get(struct prop_desc *pd, datalink_id_t linkid, /* ARGSUSED */ static dladm_status_t -i_dladm_uint32_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_uint32_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { dld_ioc_macprop_t *dip; - uint32_t v = 0; + uint32_t v = 0; uchar_t *cp; dladm_status_t status; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (dip == NULL) return (status); + cp = (uchar_t *)dip->pr_val; (void) 
memcpy(&v, cp, sizeof (v)); (void) snprintf(*prop_val, DLADM_PROP_VAL_MAX, "%ld", v); - *perm_flags = dip->pr_perm_flags; free(dip); *val_cnt = 1; return (DLADM_STATUS_OK); @@ -2087,18 +2431,20 @@ i_dladm_uint32_get(struct prop_desc *pd, datalink_id_t linkid, /* ARGSUSED */ static dladm_status_t -i_dladm_flowctl_get(struct prop_desc *pd, datalink_id_t linkid, - char **prop_val, uint_t *val_cnt, datalink_media_t media, uint_t flags, - uint_t *perm_flags) +i_dladm_flowctl_get(prop_desc_t *pdp, datalink_id_t linkid, + char **prop_val, uint_t *val_cnt, datalink_media_t media, + uint_t flags, uint_t *perm_flags) { dld_ioc_macprop_t *dip; link_flowctrl_t v; dladm_status_t status; uchar_t *cp; - dip = i_dladm_get_public_prop(linkid, pd->pd_name, flags, &status); + dip = i_dladm_get_public_prop(linkid, pdp->pd_name, flags, + &status, perm_flags); if (dip == NULL) return (status); + cp = (uchar_t *)dip->pr_val; (void) memcpy(&v, cp, sizeof (v)); switch (v) { @@ -2115,7 +2461,6 @@ i_dladm_flowctl_get(struct prop_desc *pd, datalink_id_t linkid, (void) sprintf(*prop_val, "bi"); break; } - *perm_flags = dip->pr_perm_flags; free(dip); *val_cnt = 1; return (DLADM_STATUS_OK); @@ -2220,17 +2565,7 @@ i_dladm_get_prop(datalink_id_t linkid, const char *prop_name, if ((status = i_dladm_macprop(dip, B_FALSE)) == DLADM_STATUS_OK) { if (type == DLADM_PROP_VAL_PERM) { - switch (dip->pr_perm_flags) { - case MAC_PROP_PERM_READ: - (void) strncpy(*prop_val, - PERM_READ_ONLY, DLADM_PROP_VAL_MAX); - break; - case MAC_PROP_PERM_RW: - (void) strncpy(*prop_val, - PERM_READ_WRITE, - DLADM_PROP_VAL_MAX); - break; - } + (void) dladm_perm2str(dip->pr_perm_flags, *prop_val); } else { (void) strncpy(*prop_val, dip->pr_val, DLADM_PROP_VAL_MAX); @@ -2434,3 +2769,189 @@ i_dladm_wlan_set_legacy_ioctl(datalink_id_t linkid, void *buf, uint_t buflen, free(gbuf); return (status); } + +static dladm_status_t +link_proplist_check(dladm_arg_list_t *proplist) +{ + int i, j; + boolean_t matched; + + for (i = 0; i < proplist->al_count; i++) { + matched = B_FALSE; + for (j = 0; j < DLADM_MAX_PROPS; j++) { + if (strcmp(proplist->al_info[i].ai_name, + prop_table[j].pd_name) == 0) + matched = B_TRUE; + } + if (!matched) + return (DLADM_STATUS_BADPROP); + } + return (DLADM_STATUS_OK); +} + +dladm_status_t +dladm_parse_link_props(char *str, dladm_arg_list_t **listp, boolean_t novalues) +{ + dladm_status_t status; + + status = dladm_parse_args(str, listp, novalues); + if (status != DLADM_STATUS_OK) + return (status); + + status = link_proplist_check(*listp); + if (status != DLADM_STATUS_OK) { + dladm_free_props(*listp); + return (status); + } + + return (DLADM_STATUS_OK); +} + +/* + * Retrieve the one link property from the database + */ +/*ARGSUSED*/ +static int +i_dladm_get_one_prop(datalink_id_t linkid, const char *prop_name, void *arg) +{ + dladm_arg_list_t *proplist = arg; + dladm_arg_info_t *aip = NULL; + + aip = &proplist->al_info[proplist->al_count]; + /* + * it is fine to point to prop_name since prop_name points to the + * prop_table[n].pd_name. + */ + aip->ai_name = prop_name; + + (void) dladm_get_linkprop(linkid, DLADM_PROP_VAL_PERSISTENT, prop_name, + aip->ai_val, &aip->ai_count); + + if (aip->ai_count != 0) + proplist->al_count++; + + return (DLADM_WALK_CONTINUE); +} + + +/* + * Retrieve all link properties for a link from the database and + * return a property list. 
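+ * The caller is responsible for freeing the returned list, for + * example with dladm_free_props().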
+ */ +dladm_status_t +dladm_link_get_proplist(datalink_id_t linkid, dladm_arg_list_t **listp) +{ + dladm_arg_list_t *list; + dladm_status_t status = DLADM_STATUS_OK; + + list = calloc(1, sizeof (dladm_arg_list_t)); + if (list == NULL) + return (dladm_errno2status(errno)); + + status = dladm_walk_linkprop(linkid, list, i_dladm_get_one_prop); + + *listp = list; + return (status); +} + +/* + * Retrieve the named property from a proplist, check the value and + * convert to a kernel structure. + */ +static dladm_status_t +i_dladm_link_proplist_extract_one(dladm_arg_list_t *proplist, + const char *name, void *val) +{ + dladm_status_t status; + dladm_arg_info_t *aip = NULL; + int i, j; + + /* Find named property in proplist */ + for (i = 0; i < proplist->al_count; i++) { + aip = &proplist->al_info[i]; + if (strcasecmp(aip->ai_name, name) == 0) + break; + } + + /* Property not in list */ + if (i == proplist->al_count) + return (DLADM_STATUS_OK); + + for (i = 0; i < DLADM_MAX_PROPS; i++) { + prop_desc_t *pdp = &prop_table[i]; + val_desc_t *vdp; + + vdp = malloc(sizeof (val_desc_t) * aip->ai_count); + if (vdp == NULL) + return (DLADM_STATUS_NOMEM); + + if (strcasecmp(aip->ai_name, pdp->pd_name) != 0) + continue; + + if (aip->ai_val == NULL) + return (DLADM_STATUS_BADARG); + + /* Check property value */ + if (pdp->pd_check != NULL) { + status = pdp->pd_check(pdp, 0, aip->ai_val, + aip->ai_count, vdp, 0); + } else { + status = DLADM_STATUS_BADARG; + } + + if (status != DLADM_STATUS_OK) + return (status); + + for (j = 0; j < DLADM_MAX_RSRC_PROP; j++) { + resource_prop_t *rpp = &rsrc_prop_table[j]; + + if (strcasecmp(aip->ai_name, rpp->rp_name) != 0) + continue; + + /* Extract kernel structure */ + if (rpp->rp_extract != NULL) { + status = rpp->rp_extract(vdp, val, + aip->ai_count); + } else { + status = DLADM_STATUS_BADARG; + } + break; + } + + if (status != DLADM_STATUS_OK) + return (status); + + break; + } + return (status); +} + +/* + * Extract properties from a proplist and convert to mac_resource_props_t. + */ +dladm_status_t +dladm_link_proplist_extract(dladm_arg_list_t *proplist, + mac_resource_props_t *mrp) +{ + dladm_status_t status = DLADM_STATUS_OK; + + status = i_dladm_link_proplist_extract_one(proplist, "maxbw", mrp); + if (status != DLADM_STATUS_OK) + return (status); + status = i_dladm_link_proplist_extract_one(proplist, "priority", mrp); + if (status != DLADM_STATUS_OK) + return (status); + status = i_dladm_link_proplist_extract_one(proplist, "cpus", mrp); + if (status != DLADM_STATUS_OK) + return (status); + return (status); +} + +static const char * +dladm_perm2str(uint_t perm, char *buf) +{ + (void) snprintf(buf, DLADM_STRSIZE, "%c%c", + ((perm & MAC_PROP_PERM_READ) != 0) ? 'r' : '-', + ((perm & MAC_PROP_PERM_WRITE) != 0) ? 'w' : '-'); + return (buf); +} diff --git a/usr/src/lib/libdladm/common/llib-ldladm b/usr/src/lib/libdladm/common/llib-ldladm index a6fc19b517..ae8bb981bf 100644 --- a/usr/src/lib/libdladm/common/llib-ldladm +++ b/usr/src/lib/libdladm/common/llib-ldladm @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /*LINTLIBRARY*/ /*PROTOLIB1*/ @@ -34,3 +32,5 @@ #include <libdlvnic.h> #include <libdlvlan.h> #include <libdlmgmt.h> +#include <libdlflow.h> +#include <libdlstat.h> diff --git a/usr/src/lib/libdladm/common/mapfile-vers b/usr/src/lib/libdladm/common/mapfile-vers index 9c61b84883..bd8d6a9eb1 100644 --- a/usr/src/lib/libdladm/common/mapfile-vers +++ b/usr/src/lib/libdladm/common/mapfile-vers @@ -35,7 +35,6 @@ SUNWprivate_1.1 { dladm_valid_linkname; dladm_mac_walk; dladm_init_linkprop; - dladm_get_single_mac_stat; dladm_get_linkprop; dladm_set_linkprop; dladm_walk_linkprop; @@ -44,6 +43,8 @@ SUNWprivate_1.1 { dladm_set_secobj; dladm_unset_secobj; dladm_walk_secobj; + dladm_bw2str; + dladm_str2bw; dladm_secobjclass2str; dladm_str2secobjclass; dladm_aggr_up; @@ -118,12 +119,60 @@ SUNWprivate_1.1 { dladm_wlan_wpa_set_key; dladm_wlan_wpa_set_mlme; dladm_vnic_create; - dladm_vnic_modify; dladm_vnic_delete; dladm_vnic_info; dladm_vnic_str2macaddrtype; - dladm_kstat_value; + dladm_vnic_up; + dladm_walk_macaddr; + dladm_walk_hwgrp; + dladm_pri2str; + dladm_str2pri; + dladm_start_usagelog; + dladm_stop_usagelog; + dladm_walk_usage_res; + dladm_walk_usage_time; + dladm_usage_summary; + dladm_usage_dates; + + dladm_flow_add; + dladm_flow_remove; + dladm_flow_parse_db; + dladm_walk_flow; + dladm_flow_init; + dladm_flow_info; + dladm_prefixlen2mask; + dladm_mask2prefixlen; + dladm_str2proto; + dladm_proto2str; + + dladm_free_attrs; + dladm_parse_flow_attrs; + + dladm_flow_attr_ip2str; + dladm_flow_attr_proto2str; + dladm_flow_attr_port2str; + dladm_flow_attr_dsfield2str; + + dladm_free_props; + dladm_parse_link_props; + dladm_get_linkprop; + dladm_set_linkprop; + dladm_walk_linkprop; + dladm_parse_flow_props; + dladm_get_flowprop; + dladm_set_flowprop; + dladm_walk_flowprop; + dladm_parselink; + + dladm_continuous; + dladm_kstat_lookup; + dladm_get_stats; + dladm_kstat_value; + dladm_get_single_mac_stat; + dladm_stats_total; + dladm_stats_diff; + local: *; }; diff --git a/usr/src/lib/libdladm/common/propfuncs.c b/usr/src/lib/libdladm/common/propfuncs.c new file mode 100644 index 0000000000..74964511eb --- /dev/null +++ b/usr/src/lib/libdladm/common/propfuncs.c @@ -0,0 +1,699 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <stdlib.h> +#include <strings.h> +#include <errno.h> +#include <ctype.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/dld.h> +#include <fcntl.h> +#include <unistd.h> +#include <libdladm_impl.h> +#include <libdlflow_impl.h> + +/* + * XXX duplicate defines + */ +#define DLADM_PROP_VAL_MAX 32 +#define DLADM_MAX_PROPS 32 + +static void +free_props(prop_db_info_t *lip) +{ + prop_db_info_t *lip_next; + prop_val_t *lvp, *lvp_next; + + for (; lip != NULL; lip = lip_next) { + lip_next = lip->li_nextprop; + for (lvp = lip->li_val; lvp != NULL; lvp = lvp_next) { + lvp_next = lvp->lv_nextval; + free(lvp); + } + free(lip); + } +} + +/* + * Generate an entry in the property database. + * Each entry has this format: + * <name> <prop0>=<val0>,...,<valn>;...;<propn>=<val0>,...,<valn>; + */ +static void +generate_prop_line(const char *name, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp) +{ + char tmpbuf[MAXLINELEN]; + char *ptr, *lim = tmpbuf + MAXLINELEN; + prop_db_info_t *lip = listp; + prop_val_t *lvp = NULL; + + /* + * Delete line if there are no properties left. + */ + if (lip == NULL || + (lip->li_val == NULL && lip->li_nextprop == NULL)) { + buf[0] = '\0'; + return; + } + ptr = tmpbuf; + ptr += snprintf(ptr, BUFLEN(lim, ptr), "%s\t", name); + for (; lip != NULL; lip = lip->li_nextprop) { + /* + * Skip properties without values. + */ + if (lip->li_val == NULL) + continue; + + ptr += snprintf(ptr, BUFLEN(lim, ptr), "%s=", lip->li_name); + for (lvp = lip->li_val; lvp != NULL; lvp = lvp->lv_nextval) { + ptr += snprintf(ptr, BUFLEN(lim, ptr), "%s%c", + lvp->lv_name, + ((lvp->lv_nextval == NULL) ? ';' : ',')); + } + } + if (ptr > lim) { + *statusp = DLADM_STATUS_TOOSMALL; + return; + } + (void) snprintf(buf, MAXLINELEN, "%s\n", tmpbuf); +} + +/* + * This function is used to update or create an entry in the persistent db. + * process_prop_db() will first scan the db for an entry matching the + * specified name. If a match is found, this function is invoked with the + * entry's contents (buf) and its linked-list representation (listp). lsp + * holds the name and values of the property to be added or updated; this + * information will be merged with listp. Subsequently, an updated entry + * will be written to buf, which will in turn be written to disk by + * process_prop_db(). If no entry matches the specified name, listp + * will be NULL; a new entry will be generated in this case and it will + * contain only the property information in lsp. + */ +boolean_t +process_prop_set(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp) +{ + dladm_status_t status; + prop_db_info_t *lastp = NULL, *lip = listp, *nlip = NULL; + prop_val_t **lvpp; + int i; + + if (lsp->ls_propname == NULL) { + buf[0] = '\0'; + return (B_FALSE); + } + + /* + * Find the prop we want to change. + */ + for (; lip != NULL; lip = lip->li_nextprop) { + if (strcmp(lip->li_name, lsp->ls_propname) == 0) + break; + + lastp = lip; + } + + if (lip == NULL) { + /* + * If the prop is not found, append it to the list. + */ + if ((nlip = malloc(sizeof (prop_db_info_t))) == NULL) { + status = DLADM_STATUS_NOMEM; + goto fail; + } + /* + * nlip will need to be freed later if there is no list to + * append to. + */ + if (lastp != NULL) + lastp->li_nextprop = nlip; + nlip->li_name = lsp->ls_propname; + nlip->li_nextprop = NULL; + nlip->li_val = NULL; + lvpp = &nlip->li_val; + } else { + prop_val_t *lvp, *lvp_next; + + /* + * If the prop is found, delete the existing values from it. 
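+ * They will be replaced below by the values supplied in lsp.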
+ */ + for (lvp = lip->li_val; lvp != NULL; lvp = lvp_next) { + lvp_next = lvp->lv_nextval; + free(lvp); + } + lip->li_val = NULL; + lvpp = &lip->li_val; + } + + /* + * Fill our prop with the specified values. + */ + for (i = 0; i < *lsp->ls_valcntp; i++) { + if ((*lvpp = malloc(sizeof (prop_val_t))) == NULL) { + status = DLADM_STATUS_NOMEM; + goto fail; + } + (*lvpp)->lv_name = lsp->ls_propval[i]; + (*lvpp)->lv_nextval = NULL; + lvpp = &(*lvpp)->lv_nextval; + } + + if (listp != NULL) { + generate_prop_line(lsp->ls_name, buf, listp, statusp); + } else { + generate_prop_line(lsp->ls_name, buf, nlip, statusp); + free_props(nlip); + } + return (B_FALSE); + +fail: + *statusp = status; + if (listp == NULL) + free_props(nlip); + + return (B_FALSE); +} + +/* + * This function is used for retrieving the values for a specific property. + * It gets called if an entry matching the specified name exists in the db. + * The entry is converted into a linked-list listp. This list is then scanned + * for the specified property name; if a matching property exists, its + * associated values are copied to the array lsp->ls_propval. + */ +/* ARGSUSED */ +boolean_t +process_prop_get(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp) +{ + prop_db_info_t *lip = listp; + prop_val_t *lvp; + uint_t valcnt = 0; + + /* + * Find the prop we want to get. + */ + for (; lip != NULL; lip = lip->li_nextprop) { + if (strcmp(lip->li_name, lsp->ls_propname) == 0) + break; + } + if (lip == NULL) { + *statusp = DLADM_STATUS_NOTFOUND; + return (B_FALSE); + } + + for (lvp = lip->li_val; lvp != NULL; lvp = lvp->lv_nextval) { + (void) strncpy(lsp->ls_propval[valcnt], lvp->lv_name, + DLADM_PROP_VAL_MAX); + + if (++valcnt >= *lsp->ls_valcntp && lvp->lv_nextval != NULL) { + *statusp = DLADM_STATUS_TOOSMALL; + return (B_FALSE); + } + } + /* + * This function is meant to be called at most once for each call + * to process_prop_db(). For this reason, it's ok to overwrite + * the caller's valcnt array size with the actual number of values + * returned. + */ + *lsp->ls_valcntp = valcnt; + return (B_FALSE); +} + +/* + * This is used for initializing properties. + * Unlike the other routines, this gets called for every entry in the + * database. lsp->ls_name is not user-specified but instead is set to + * the current name being processed. + */ +/* ARGSUSED */ +boolean_t +process_prop_init(prop_db_state_t *lsp, char *buf, + prop_db_info_t *listp, dladm_status_t *statusp) +{ + dladm_status_t status = DLADM_STATUS_OK; + prop_db_info_t *lip = listp; + prop_val_t *lvp; + uint_t valcnt, i; + char **propval; + + for (; lip != NULL; lip = lip->li_nextprop) { + /* + * Construct the propval array and fill it with + * values from listp. + */ + for (lvp = lip->li_val, valcnt = 0; + lvp != NULL; lvp = lvp->lv_nextval, valcnt++) { + } + + propval = malloc(sizeof (char *) * valcnt); + if (propval == NULL) { + *statusp = DLADM_STATUS_NOMEM; + break; + } + lvp = lip->li_val; + for (i = 0; i < valcnt; i++, lvp = lvp->lv_nextval) + propval[i] = (char *)lvp->lv_name; + + status = (*lsp->ls_initop)(lsp->ls_name, lip->li_name, + propval, valcnt, DLADM_OPT_ACTIVE, NULL); + + /* + * We continue with initializing other properties even + * after encountering an error. This error will be + * propagated to the caller via 'statusp'. 
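+ * If several properties fail, only the last error is preserved in + * 'statusp'.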
+ */ + if (status != DLADM_STATUS_OK) + *statusp = status; + + free(propval); + } + return (B_TRUE); +} + +static int +parse_props(char *buf, prop_db_info_t **lipp) +{ + int i, len; + char *curr; + prop_db_info_t *lip = NULL; + prop_db_info_t **tailp = lipp; + prop_val_t *lvp = NULL; + prop_val_t **vtailp = NULL; + + curr = buf; + len = strlen(buf); + for (i = 0; i < len; i++) { + char c = buf[i]; + boolean_t match = (c == '=' || c == ',' || c == ';'); + + /* + * Move to the next character if there is no match and + * if we have not reached the last character. + */ + if (!match && i != len - 1) + continue; + + if (match) { + /* + * Nul-terminate the string pointed to by 'curr'. + */ + buf[i] = '\0'; + if (*curr == '\0') + goto fail; + } + + if (lip != NULL) { + /* + * We get here after we have processed the "<prop>=" + * pattern. The pattern we are now interested in is + * "<val0>,<val1>,...,<valn>;". For each value we + * find, a prop_val_t will be allocated and + * added to the current 'lip'. + */ + if (c == '=') + goto fail; + + lvp = malloc(sizeof (*lvp)); + if (lvp == NULL) + goto fail; + + lvp->lv_name = curr; + lvp->lv_nextval = NULL; + *vtailp = lvp; + vtailp = &lvp->lv_nextval; + + if (c == ';') { + tailp = &lip->li_nextprop; + vtailp = NULL; + lip = NULL; + } + } else { + /* + * lip == NULL indicates that 'curr' must be referring + * to a property name. We allocate a new prop_db_info_t + * and append it to the list given by the caller. + */ + if (c != '=') + goto fail; + + lip = malloc(sizeof (*lip)); + if (lip == NULL) + goto fail; + + lip->li_name = curr; + lip->li_val = NULL; + lip->li_nextprop = NULL; + *tailp = lip; + vtailp = &lip->li_val; + } + curr = buf + i + 1; + } + /* + * The list must be non-empty and the last character must be ';'. + */ + if (*lipp == NULL || lip != NULL) + goto fail; + + return (0); + +fail: + free_props(*lipp); + *lipp = NULL; + return (-1); +} + +static boolean_t +process_prop_line(prop_db_state_t *lsp, char *buf, + dladm_status_t *statusp) +{ + prop_db_info_t *lip = NULL; + int i, len, llen; + char *str, *lasts; + boolean_t cont, noname = B_FALSE; + + /* + * Skip leading spaces, blank lines, and comments. + */ + len = strlen(buf); + for (i = 0; i < len; i++) { + if (!isspace(buf[i])) + break; + } + if (i == len || buf[i] == '#') + return (B_TRUE); + + str = buf + i; + if (lsp->ls_name != NULL) { + /* + * Skip names we're not interested in. + * Note that strncmp() and isspace() are used here + * instead of strtok() and strcmp() because we don't + * want to modify buf in case it does not contain the + * specified name. + */ + llen = strlen(lsp->ls_name); + if (strncmp(str, lsp->ls_name, llen) != 0 || + !isspace(str[llen])) + return (B_TRUE); + } else { + /* + * If a name is not specified, find the name + * and assign it to lsp->ls_name. + */ + if (strtok_r(str, " \n\t", &lasts) == NULL) + goto fail; + + llen = strlen(str); + lsp->ls_name = str; + noname = B_TRUE; + } + str += llen + 1; + if (str >= buf + len) + goto fail; + + /* + * Now find the list of properties. + */ + if ((str = strtok_r(str, " \n\t", &lasts)) == NULL) + goto fail; + + if (parse_props(str, &lip) < 0) + goto fail; + + cont = (*lsp->ls_op)(lsp, buf, lip, statusp); + free_props(lip); + if (noname) + lsp->ls_name = NULL; + return (cont); + +fail: + free_props(lip); + if (noname) + lsp->ls_name = NULL; + + /* + * Delete corrupted line. 
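+ * Emptying the buffer tells process_prop_db() not to write the + * line back to disk.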
+ */ + buf[0] = '\0'; + return (B_TRUE); +} + +dladm_status_t +process_prop_db(void *arg, FILE *fp, FILE *nfp) +{ + prop_db_state_t *lsp = arg; + dladm_status_t status = DLADM_STATUS_OK; + char buf[MAXLINELEN]; + boolean_t cont = B_TRUE; + + /* + * This loop processes each line of the configuration file. + * buf can potentially be modified by process_prop_line(). + * If this is a write operation and buf is not truncated, buf will + * be written to disk. process_prop_line() will no longer be + * called after it returns B_FALSE; at which point the remainder + * of the file will continue to be read and, if necessary, written + * to disk as well. + */ + while (fgets(buf, MAXLINELEN, fp) != NULL) { + if (cont) + cont = process_prop_line(lsp, buf, &status); + + if (nfp != NULL && buf[0] != '\0' && fputs(buf, nfp) == EOF) { + status = dladm_errno2status(errno); + break; + } + } + + if (status != DLADM_STATUS_OK || !cont) + return (status); + + if (lsp->ls_op == process_prop_set) { + /* + * If the specified name is not found above, we add the + * name and its properties to the configuration file. + */ + (void) (*lsp->ls_op)(lsp, buf, NULL, &status); + if (status == DLADM_STATUS_OK && fputs(buf, nfp) == EOF) + status = dladm_errno2status(errno); + } + + if (lsp->ls_op == process_prop_get) + status = DLADM_STATUS_NOTFOUND; + + return (status); +} + +dladm_status_t +i_dladm_get_prop_temp(const char *name, prop_type_t type, + const char *prop_name, char **prop_val, uint_t *val_cntp, + prop_table_t *prop_tbl) +{ + int i; + dladm_status_t status; + uint_t cnt; + fprop_desc_t *pdp; + + if (name == NULL || prop_name == NULL || prop_val == NULL || + val_cntp == NULL || *val_cntp == 0) + return (DLADM_STATUS_BADARG); + + for (i = 0; i < prop_tbl->pt_size; i++) + if (strcasecmp(prop_name, prop_tbl->pt_table[i].pd_name) == 0) + break; + + if (i == prop_tbl->pt_size) + return (DLADM_STATUS_NOTFOUND); + + pdp = &prop_tbl->pt_table[i]; + status = DLADM_STATUS_OK; + + switch (type) { + case DLADM_PROP_VAL_CURRENT: + status = pdp->pd_get(name, prop_val, val_cntp); + break; + case DLADM_PROP_VAL_DEFAULT: + if (pdp->pd_defval.vd_name == NULL) { + status = DLADM_STATUS_NOTSUP; + break; + } + (void) strcpy(*prop_val, pdp->pd_defval.vd_name); + *val_cntp = 1; + break; + + case DLADM_PROP_VAL_MODIFIABLE: + if (pdp->pd_getmod != NULL) { + status = pdp->pd_getmod(name, prop_val, val_cntp); + break; + } + cnt = pdp->pd_nmodval; + if (cnt == 0) { + status = DLADM_STATUS_NOTSUP; + } else if (cnt > *val_cntp) { + status = DLADM_STATUS_TOOSMALL; + } else { + for (i = 0; i < cnt; i++) { + (void) strcpy(prop_val[i], + pdp->pd_modval[i].vd_name); + } + *val_cntp = cnt; + } + break; + default: + status = DLADM_STATUS_BADARG; + break; + } + + return (status); +} + +static dladm_status_t +i_dladm_set_one_prop_temp(const char *name, fprop_desc_t *pdp, char **prop_val, + uint_t val_cnt, uint_t flags) +{ + dladm_status_t status; + val_desc_t *vdp = NULL; + uint_t cnt; + + if (pdp->pd_temponly && (flags & DLADM_OPT_PERSIST) != 0) + return (DLADM_STATUS_TEMPONLY); + + if (pdp->pd_set == NULL) + return (DLADM_STATUS_PROPRDONLY); + + if (prop_val != NULL) { + if (pdp->pd_check != NULL) + status = pdp->pd_check(pdp, prop_val, val_cnt, &vdp); + else + status = DLADM_STATUS_BADARG; + + if (status != DLADM_STATUS_OK) + return (status); + + cnt = val_cnt; + } else { + if (pdp->pd_defval.vd_name == NULL) + return (DLADM_STATUS_NOTSUP); + + if ((vdp = malloc(sizeof (val_desc_t))) == NULL) + return (DLADM_STATUS_NOMEM); + + (void) memcpy(vdp, 
&pdp->pd_defval, sizeof (val_desc_t)); + cnt = 1; + } + + status = pdp->pd_set(name, vdp, cnt); + + free(vdp); + return (status); +} + +dladm_status_t +i_dladm_set_prop_temp(const char *name, const char *prop_name, char **prop_val, + uint_t val_cnt, uint_t flags, char **errprop, prop_table_t *prop_tbl) +{ + int i; + dladm_status_t status = DLADM_STATUS_OK; + boolean_t found = B_FALSE; + + for (i = 0; i < prop_tbl->pt_size; i++) { + fprop_desc_t *pdp = &prop_tbl->pt_table[i]; + dladm_status_t s; + + if (prop_name != NULL && + (strcasecmp(prop_name, pdp->pd_name) != 0)) + continue; + + found = B_TRUE; + s = i_dladm_set_one_prop_temp(name, pdp, prop_val, val_cnt, + flags); + + if (prop_name != NULL) { + status = s; + break; + } else { + if (s != DLADM_STATUS_OK && + s != DLADM_STATUS_NOTSUP) { + if (errprop != NULL) + *errprop = pdp->pd_name; + status = s; + break; + } + } + } + + if (!found) + status = DLADM_STATUS_NOTFOUND; + + return (status); +} + +boolean_t +i_dladm_is_prop_temponly(const char *prop_name, char **errprop, + prop_table_t *prop_tbl) +{ + int i; + + if (prop_name == NULL) + return (B_FALSE); + + for (i = 0; i < prop_tbl->pt_size; i++) { + fprop_desc_t *pdp = &prop_tbl->pt_table[i]; + + if (strcasecmp(prop_name, pdp->pd_name) != 0) + continue; + + if (errprop != NULL) + *errprop = pdp->pd_name; + + if (pdp->pd_temponly) + return (B_TRUE); + } + + return (B_FALSE); +} +void +dladm_free_props(dladm_arg_list_t *list) +{ + dladm_free_args(list); +} + +dladm_status_t +dladm_parse_props(char *str, dladm_arg_list_t **listp, boolean_t novalues) +{ + if (dladm_parse_args(str, listp, novalues) != DLADM_STATUS_OK) + goto fail; + + return (DLADM_STATUS_OK); + +fail: + dladm_free_args(*listp); + return (DLADM_STATUS_PROP_PARSE_ERR); +} diff --git a/usr/src/lib/libdladm/common/usage.c b/usr/src/lib/libdladm/common/usage.c new file mode 100644 index 0000000000..07ef7bbb22 --- /dev/null +++ b/usr/src/lib/libdladm/common/usage.c @@ -0,0 +1,1437 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <fcntl.h> +#include <stdlib.h> +#include <strings.h> +#include <exacct.h> +#include <libdladm.h> + +#define TIMEBUFLEN 20 +#define GBIT 1000000000 +#define MBIT 1000000 +#define KBIT 1000 + +#define NET_RESET_TOT(tbytes, ttime, tibytes, tobytes, step) { \ + (step) = 1; \ + (tbytes) = 0; \ + (ttime) = 0; \ + (tibytes) = 0; \ + (tobytes) = 0; \ + } + +/* Flow/Link Descriptor */ +typedef struct net_desc_s { + char net_desc_name[LIFNAMSIZ]; + char net_desc_devname[LIFNAMSIZ]; + uchar_t net_desc_ehost[ETHERADDRL]; + uchar_t net_desc_edest[ETHERADDRL]; + ushort_t net_desc_vlan_tpid; + ushort_t net_desc_vlan_tci; + ushort_t net_desc_sap; + ushort_t net_desc_cpuid; + ushort_t net_desc_priority; + uint64_t net_desc_bw_limit; + in6_addr_t net_desc_saddr; + in6_addr_t net_desc_daddr; + boolean_t net_desc_isv4; + in_port_t net_desc_sport; + in_port_t net_desc_dport; + uint8_t net_desc_protocol; + uint8_t net_desc_dsfield; + boolean_t net_desc_newrec; +} net_desc_t; + +/* Time structure: Year, Month, Day, Hour, Min, Sec */ +typedef struct net_time_s { + int net_time_yr; + int net_time_mon; + int net_time_day; + int net_time_hr; + int net_time_min; + int net_time_sec; +} net_time_t; + +/* Flow/Link Stats */ +typedef struct net_stat_s { + char net_stat_name[LIFNAMSIZ]; + uint64_t net_stat_ibytes; + uint64_t net_stat_obytes; + uint64_t net_stat_ipackets; + uint64_t net_stat_opackets; + uint64_t net_stat_ierrors; + uint64_t net_stat_oerrors; + uint64_t net_stat_tibytes; + uint64_t net_stat_tobytes; + uint64_t net_stat_tipackets; + uint64_t net_stat_topackets; + uint64_t net_stat_tierrors; + uint64_t net_stat_toerrors; + uint64_t net_stat_ctime; + uint64_t net_stat_tdiff; + net_time_t net_stat_time; + struct net_stat_s *net_stat_next; + net_desc_t *net_stat_desc; + boolean_t net_stat_isref; +} net_stat_t; + +/* Used to create the [gnu]plot file */ +typedef struct net_plot_entry_s { + char *net_pe_name; + uint64_t net_pe_tottime; + uint64_t net_pe_totbytes; + uint64_t net_pe_totibytes; + uint64_t net_pe_totobytes; + uint64_t net_pe_lasttime; +} net_plot_entry_t; + +/* Stats entry */ +typedef struct net_entry_s { + net_desc_t *net_entry_desc; + net_stat_t *net_entry_shead; + net_stat_t *net_entry_stail; + int net_entry_scount; + net_stat_t *net_entry_sref; + net_stat_t *net_entry_tstats; + uint64_t net_entry_ttime; + struct net_entry_s *net_entry_next; +} net_entry_t; + +/* Time sorted list */ +typedef struct net_time_entry_s { + net_stat_t *my_time_stat; + struct net_time_entry_s *net_time_entry_next; + struct net_time_entry_s *net_time_entry_prev; +} net_time_entry_t; + +/* The parsed table */ +typedef struct net_table_s { + /* List of stats */ + net_entry_t *net_table_head; + net_entry_t *net_table_tail; + int net_entries; + + /* + * Optimization I : List sorted by time, i.e: + * Time Resource .. + * ------------------------------- + * 11.15.10 bge0 + * 11.15.10 ce0 + * 11.15.10 vnic1 + * 11.15.15 bge0 + * 11.15.15 ce0 + * 11.15.15 vnic1 + */ + net_time_entry_t *net_time_head; + net_time_entry_t *net_time_tail; + + /* + * Optimization II : List sorted by resources + * Time Resource .. + * ------------------------------- + * 11.15.10 bge0 + * 11.15.15 bge0 + * 11.15.10 ce0 + * 11.15.15 ce0 + * 11.15.10 vnic1 + * 11.15.15 vnic1 + */ + net_time_entry_t *net_ctime_head; + net_time_entry_t *net_ctime_tail; + + /* Common to both the above (sorted) lists. 
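Each stat record appears exactly once on each list.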
*/ + int net_time_entries; +} net_table_t; + +#define NET_DATE_GREATER 0 +#define NET_DATE_LESSER 1 +#define NET_DATE_EQUAL 2 + +#define NET_TIME_GREATER 0 +#define NET_TIME_LESSER 1 +#define NET_TIME_EQUAL 2 + +#ifndef _LP64 +#define FMT_UINT64 "%-15llu" +#else +#define FMT_UINT64 "%-15lu" +#endif + +/* + * Given a timebuf of the form M/D/Y,H:M:S break it into individual elements. + */ +static void +dissect_time(char *tbuf, net_time_t *nt) +{ + char *d; + char *t; + char *dd; + char *h; + char *endp; + + if (tbuf == NULL || nt == NULL) + return; + + d = strtok(tbuf, ","); /* Date */ + t = strtok(NULL, ","); /* Time */ + + /* Month */ + dd = strtok(d, "/"); + if (dd == NULL) + return; + nt->net_time_mon = strtol(dd, &endp, 10); + + /* Day */ + dd = strtok(NULL, "/"); + if (dd == NULL) + return; + nt->net_time_day = strtol(dd, &endp, 10); + + /* Year */ + dd = strtok(NULL, "/"); + if (dd == NULL) + return; + nt->net_time_yr = strtol(dd, &endp, 10); + if (strlen(dd) <= 2) + nt->net_time_yr += 2000; + + if (t == NULL) + return; + + /* Hour */ + h = strtok(t, ":"); + if (h == NULL) + return; + nt->net_time_hr = strtol(h, &endp, 10); + + /* Min */ + h = strtok(NULL, ":"); + if (h == NULL) + return; + nt->net_time_min = strtol(h, &endp, 10); + + /* Sec */ + h = strtok(NULL, ":"); + if (h == NULL) + return; + nt->net_time_sec = strtol(h, &endp, 10); +} + +/* Get a stat item from an object in the exacct file */ +static void +add_stat_item(ea_object_t *o, net_stat_t *ns) +{ + switch (o->eo_catalog & EXT_TYPE_MASK) { + case EXT_STRING: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_STATS_NAME) { + (void) strncpy(ns->net_stat_name, o->eo_item.ei_string, + strlen(o->eo_item.ei_string)); + } + break; + case EXT_UINT64: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_STATS_CURTIME) { + time_t _time; + char timebuf[TIMEBUFLEN]; + + ns->net_stat_ctime = o->eo_item.ei_uint64; + _time = ns->net_stat_ctime; + (void) strftime(timebuf, sizeof (timebuf), + "%m/%d/%Y,%T\n", localtime(&_time)); + dissect_time(timebuf, &ns->net_stat_time); + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_IBYTES) { + ns->net_stat_ibytes = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_OBYTES) { + ns->net_stat_obytes = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_IPKTS) { + ns->net_stat_ipackets = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_OPKTS) { + ns->net_stat_opackets = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_IERRPKTS) { + ns->net_stat_ierrors = o->eo_item.ei_uint64; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_STATS_OERRPKTS) { + ns->net_stat_oerrors = o->eo_item.ei_uint64; + } + break; + default: + break; + } +} + +/* Get a description item from an object in the exacct file */ +static void +add_desc_item(ea_object_t *o, net_desc_t *nd) +{ + switch (o->eo_catalog & EXT_TYPE_MASK) { + case EXT_STRING: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_NAME) { + (void) strncpy(nd->net_desc_name, o->eo_item.ei_string, + strlen(o->eo_item.ei_string)); + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_DEVNAME) { + (void) strncpy(nd->net_desc_devname, + o->eo_item.ei_string, strlen(o->eo_item.ei_string)); + } + break; + case EXT_UINT8: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_PROTOCOL) { + nd->net_desc_protocol = o->eo_item.ei_uint8; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_DSFIELD) { + 
nd->net_desc_dsfield = o->eo_item.ei_uint8; + } + break; + case EXT_UINT16: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_SPORT) { + nd->net_desc_sport = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_DPORT) { + nd->net_desc_dport = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_SAP) { + nd->net_desc_sap = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_VLAN_TPID) { + nd->net_desc_vlan_tpid = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_VLAN_TCI) { + nd->net_desc_vlan_tci = o->eo_item.ei_uint16; + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_PRIORITY) { + nd->net_desc_priority = o->eo_item.ei_uint16; + } + break; + case EXT_UINT32: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V4SADDR || + (o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V4DADDR) { + struct in_addr addr; + + addr.s_addr = htonl(o->eo_item.ei_uint32); + + if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_V4SADDR) { + IN6_INADDR_TO_V4MAPPED(&addr, + &nd->net_desc_saddr); + } else { + IN6_INADDR_TO_V4MAPPED(&addr, + &nd->net_desc_daddr); + } + } + break; + case EXT_UINT64: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_BWLIMIT) + nd->net_desc_bw_limit = o->eo_item.ei_uint64; + break; + case EXT_RAW: + if ((o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V6SADDR || + (o->eo_catalog & EXD_DATA_MASK) == EXD_NET_DESC_V6DADDR) { + in6_addr_t addr; + + addr = *(in6_addr_t *)o->eo_item.ei_raw; + if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_V6SADDR) { + nd->net_desc_saddr = addr; + } else { + nd->net_desc_daddr = addr; + } + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_EHOST) { + bcopy((uchar_t *)o->eo_item.ei_raw, nd->net_desc_ehost, + ETHERADDRL); + } else if ((o->eo_catalog & EXD_DATA_MASK) == + EXD_NET_DESC_EDEST) { + bcopy((uchar_t *)o->eo_item.ei_raw, nd->net_desc_edest, + ETHERADDRL); + } + break; + default: + break; + } +} + +/* Add a description item to the table */ +static dladm_status_t +add_desc_to_tbl(net_table_t *net_table, net_desc_t *nd) +{ + net_entry_t *ne; + + if ((ne = calloc(1, sizeof (net_entry_t))) == NULL) + return (DLADM_STATUS_NOMEM); + + if ((ne->net_entry_tstats = calloc(1, sizeof (net_stat_t))) == NULL) { + free(ne); + return (DLADM_STATUS_NOMEM); + } + + ne->net_entry_desc = nd; + ne->net_entry_shead = NULL; + ne->net_entry_stail = NULL; + ne->net_entry_scount = 0; + + if (net_table->net_table_head == NULL) { + net_table->net_table_head = ne; + net_table->net_table_tail = ne; + } else { + net_table->net_table_tail->net_entry_next = ne; + net_table->net_table_tail = ne; + } + net_table->net_entries++; + return (DLADM_STATUS_OK); +} + +/* Compare dates and return if t1 is equal, greater or lesser than t2 */ +static int +compare_date(net_time_t *t1, net_time_t *t2) +{ + if (t1->net_time_yr == t2->net_time_yr && + t1->net_time_mon == t2->net_time_mon && + t1->net_time_day == t2->net_time_day) { + return (NET_DATE_EQUAL); + } + if (t1->net_time_yr > t2->net_time_yr || + (t1->net_time_yr == t2->net_time_yr && + t1->net_time_mon > t2->net_time_mon) || + (t1->net_time_yr == t2->net_time_yr && + t1->net_time_mon == t2->net_time_mon && + t1->net_time_day > t2->net_time_day)) { + return (NET_DATE_GREATER); + } + return (NET_DATE_LESSER); +} + +/* Compare times and return if t1 is equal, greater or lesser than t2 */ +static int +compare_time(net_time_t *t1, net_time_t *t2) +{ + int cd; + + cd = 
compare_date(t1, t2); + + if (cd == NET_DATE_GREATER) { + return (NET_TIME_GREATER); + } else if (cd == NET_DATE_LESSER) { + return (NET_TIME_LESSER); + } else { + if (t1->net_time_hr == t2->net_time_hr && + t1->net_time_min == t2->net_time_min && + t1->net_time_sec == t2->net_time_sec) { + return (NET_TIME_EQUAL); + } + if (t1->net_time_hr > t2->net_time_hr || + (t1->net_time_hr == t2->net_time_hr && + t1->net_time_min > t2->net_time_min) || + (t1->net_time_hr == t2->net_time_hr && + t1->net_time_min == t2->net_time_min && + t1->net_time_sec > t2->net_time_sec)) { + return (NET_TIME_GREATER); + } + } + return (NET_TIME_LESSER); +} + +/* + * Given a start and end time and start and end entries, check if the + * times are within the range, and adjust if needed. + */ +static dladm_status_t +chk_time_bound(net_time_t *s, net_time_t *e, net_time_t *sns, + net_time_t *ens) +{ + if (s != NULL && e != NULL) { + if (compare_time(s, e) == NET_TIME_GREATER) + return (DLADM_STATUS_BADTIMEVAL); + } + if (s != NULL) { + if (compare_time(s, sns) == NET_TIME_LESSER) { + s->net_time_yr = sns->net_time_yr; + s->net_time_mon = sns->net_time_mon; + s->net_time_day = sns->net_time_day; + s->net_time_hr = sns->net_time_hr; + s->net_time_min = sns->net_time_min; + s->net_time_sec = sns->net_time_sec; + } + } + if (e != NULL) { + if (compare_time(e, ens) == NET_TIME_GREATER) { + e->net_time_yr = ens->net_time_yr; + e->net_time_mon = ens->net_time_mon; + e->net_time_day = ens->net_time_day; + e->net_time_hr = ens->net_time_hr; + e->net_time_min = ens->net_time_min; + e->net_time_sec = ens->net_time_sec; + } + } + return (DLADM_STATUS_OK); +} + +/* + * Given a start and end time (strings), convert them into net_time_t + * and also check for the range given the head and tail of the list. + * If stime is earlier than the head or etime is later than the tail, adjust. + */ +static dladm_status_t +get_time_range(net_time_entry_t *head, net_time_entry_t *tail, + net_time_t *st, net_time_t *et, char *stime, char *etime) +{ + bzero(st, sizeof (net_time_t)); + bzero(et, sizeof (net_time_t)); + + if (stime == NULL && etime == NULL) + return (0); + + if (stime != NULL) + dissect_time(stime, st); + if (etime != NULL) + dissect_time(etime, et); + + if (stime != NULL || etime != NULL) { + return (chk_time_bound(stime == NULL ? NULL : st, + etime == NULL ? NULL : et, + &head->my_time_stat->net_stat_time, + &tail->my_time_stat->net_stat_time)); + } + return (0); +} + +/* + * Walk the list from a given starting point and return when we find + * an entry that is greater than or equal to st. lasttime will point to the + * previous time entry. + */ +static void +get_starting_point(net_time_entry_t *head, net_time_entry_t **start, + net_time_t *st, char *stime, uint64_t *lasttime) +{ + net_time_entry_t *next = head; + + if (head == NULL) { + *start = NULL; + return; + } + if (stime == NULL) { + *start = head; + *lasttime = head->my_time_stat->net_stat_ctime; + return; + } + *start = NULL; + while (next != NULL) { + if (compare_time(st, + &next->my_time_stat->net_stat_time) != NET_TIME_LESSER) { + *lasttime = next->my_time_stat->net_stat_ctime; + next = next->net_time_entry_next; + continue; + } + *start = next; + break; + } +} + +/* + * Point entry (pe) functions + */ +/* Clear all the counters. 
Done after the contents are written to the file */ +static void +clear_pe(net_plot_entry_t *pe, int entries, int *pentries) +{ + int count; + + for (count = 0; count < entries; count++) { + pe[count].net_pe_totbytes = 0; + pe[count].net_pe_totibytes = 0; + pe[count].net_pe_totobytes = 0; + pe[count].net_pe_tottime = 0; + } + *pentries = 0; +} + +/* Update an entry in the point entry table */ +static void +update_pe(net_plot_entry_t *pe, net_stat_t *nns, int nentries, + int *pentries, uint64_t lasttime) +{ + int count; + + for (count = 0; count < nentries; count++) { + if ((strlen(nns->net_stat_name) == + strlen(pe[count].net_pe_name)) && + (strncmp(pe[count].net_pe_name, nns->net_stat_name, + strlen(nns->net_stat_name)) == 0)) { + break; + } + } + if (count == nentries) + return; + + if (pe[count].net_pe_totbytes == 0) + pe[count].net_pe_lasttime = lasttime; + + pe[count].net_pe_totbytes += nns->net_stat_ibytes + + nns->net_stat_obytes; + pe[count].net_pe_tottime += nns->net_stat_tdiff; + pe[count].net_pe_totibytes += nns->net_stat_ibytes; + pe[count].net_pe_totobytes += nns->net_stat_obytes; + (*pentries)++; +} + +/* Flush the contents of the point entry table to the file. */ +static void +add_pe_to_file(int (*fn)(dladm_usage_t *, void *), net_plot_entry_t *pe, + net_stat_t *ns, int entries, void *arg) +{ + int count; + dladm_usage_t usage; + uint64_t tottime; + + bcopy(&ns->net_stat_ctime, &usage.du_etime, sizeof (usage.du_etime)); + for (count = 0; count < entries; count++) { + bcopy(pe[count].net_pe_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&pe[count].net_pe_lasttime, &usage.du_stime, + sizeof (usage.du_stime)); + usage.du_rbytes = pe[count].net_pe_totibytes; + usage.du_obytes = pe[count].net_pe_totobytes; + tottime = pe[count].net_pe_tottime; + usage.du_bandwidth = (tottime > 0) ? 
+ ((pe[count].net_pe_totbytes * 8) / tottime) : 0; + usage.du_last = (count == entries-1); + fn(&usage, arg); + } +} + +/* + * Net entry functions + */ +static net_entry_t * +get_ne_from_table(net_table_t *net_table, char *name) +{ + int count; + net_desc_t *nd; + net_entry_t *ne = net_table->net_table_head; + + for (count = 0; count < net_table->net_entries; count++) { + nd = ne->net_entry_desc; + if ((strlen(name) == strlen(nd->net_desc_name)) && + (strncmp(name, nd->net_desc_name, strlen(name)) == 0)) { + return (ne); + } + ne = ne->net_entry_next; + } + return (NULL); +} + +/* Get the entry for the descriptor, if it exists */ +static net_desc_t * +get_ndesc(net_table_t *net_table, net_desc_t *nd) +{ + int count; + net_desc_t *nd1; + net_entry_t *ne = net_table->net_table_head; + + for (count = 0; count < net_table->net_entries; count++) { + nd1 = ne->net_entry_desc; + if (strlen(nd1->net_desc_name) == strlen(nd->net_desc_name) && + strlen(nd1->net_desc_devname) == + strlen(nd->net_desc_devname) && + strncmp(nd1->net_desc_name, nd->net_desc_name, + strlen(nd1->net_desc_name)) == 0 && + strncmp(nd1->net_desc_devname, nd->net_desc_devname, + strlen(nd1->net_desc_devname)) == 0 && + bcmp(nd1->net_desc_ehost, nd->net_desc_ehost, + ETHERADDRL) == 0 && + bcmp(nd1->net_desc_edest, nd->net_desc_edest, + ETHERADDRL) == 0 && + nd1->net_desc_vlan_tpid == nd->net_desc_vlan_tpid && + nd1->net_desc_vlan_tci == nd->net_desc_vlan_tci && + nd1->net_desc_sap == nd->net_desc_sap && + nd1->net_desc_cpuid == nd->net_desc_cpuid && + nd1->net_desc_priority == nd->net_desc_priority && + nd1->net_desc_bw_limit == nd->net_desc_bw_limit && + nd1->net_desc_sport == nd->net_desc_sport && + nd1->net_desc_dport == nd->net_desc_dport && + nd1->net_desc_protocol == nd->net_desc_protocol && + nd1->net_desc_dsfield == nd->net_desc_dsfield && + IN6_ARE_ADDR_EQUAL(&nd1->net_desc_saddr, + &nd->net_desc_saddr) && + IN6_ARE_ADDR_EQUAL(&nd1->net_desc_daddr, + &nd->net_desc_daddr)) { + return (nd1); + } + ne = ne->net_entry_next; + } + return (NULL); +} + +/* + * Update the stat entries. The stats in the file are cumulative, so in order + * to have increments, we maintain a reference stat entry, which contains + * the stats when the record was first written, and a total stat entry, which + * maintains the running count. When we want to add a stat entry, if it is + * the reference stat entry, we don't come here. For subsequent entries, + * we get the increment by subtracting the current value from the reference + * stat and the total stat. 
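+ * Reference entries never reach this function; add_stat_to_tbl() + * handles them separately.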
+ */ +static void +update_stats(net_stat_t *ns1, net_entry_t *ne, net_stat_t *ref) +{ + + /* get the increment */ + ns1->net_stat_ibytes -= (ref->net_stat_ibytes + ref->net_stat_tibytes); + ns1->net_stat_obytes -= (ref->net_stat_obytes + ref->net_stat_tobytes); + ns1->net_stat_ipackets -= (ref->net_stat_ipackets + + ref->net_stat_tipackets); + ns1->net_stat_opackets -= (ref->net_stat_opackets + + ref->net_stat_topackets); + ns1->net_stat_ierrors -= (ref->net_stat_ierrors + + ref->net_stat_tierrors); + ns1->net_stat_oerrors -= (ref->net_stat_oerrors + + ref->net_stat_toerrors); + + /* update total bytes */ + ref->net_stat_tibytes += ns1->net_stat_ibytes; + ref->net_stat_tobytes += ns1->net_stat_obytes; + ref->net_stat_tipackets += ns1->net_stat_ipackets; + ref->net_stat_topackets += ns1->net_stat_opackets; + ref->net_stat_tierrors += ns1->net_stat_ierrors; + ref->net_stat_toerrors += ns1->net_stat_oerrors; + + ne->net_entry_tstats->net_stat_ibytes += ns1->net_stat_ibytes; + ne->net_entry_tstats->net_stat_obytes += ns1->net_stat_obytes; + ne->net_entry_tstats->net_stat_ipackets += ns1->net_stat_ipackets; + ne->net_entry_tstats->net_stat_opackets += ns1->net_stat_opackets; + ne->net_entry_tstats->net_stat_ierrors += ns1->net_stat_ierrors; + ne->net_entry_tstats->net_stat_oerrors += ns1->net_stat_oerrors; +} + +/* Add the stat entry into the table */ +static dladm_status_t +add_stat_to_tbl(net_table_t *net_table, net_stat_t *ns) +{ + net_entry_t *ne; + + ne = get_ne_from_table(net_table, ns->net_stat_name); + if (ne == NULL) + return (DLADM_STATUS_NOMEM); + + /* Ptr to flow desc */ + ns->net_stat_desc = ne->net_entry_desc; + if (ns->net_stat_desc->net_desc_newrec) { + ns->net_stat_desc->net_desc_newrec = B_FALSE; + ns->net_stat_isref = B_TRUE; + ne->net_entry_sref = ns; + } else if (ns->net_stat_ibytes < ne->net_entry_sref->net_stat_tibytes || + (ns->net_stat_obytes < ne->net_entry_sref->net_stat_tobytes)) { + ns->net_stat_isref = B_TRUE; + ne->net_entry_sref = ns; + } else { + ns->net_stat_isref = B_FALSE; + update_stats(ns, ne, ne->net_entry_sref); + } + if (ne->net_entry_shead == NULL) { + ne->net_entry_shead = ns; + ne->net_entry_stail = ns; + } else { + if (!ns->net_stat_isref) { + ne->net_entry_ttime += (ns->net_stat_ctime - + ne->net_entry_stail->net_stat_ctime); + ns->net_stat_tdiff = ns->net_stat_ctime - + ne->net_entry_stail->net_stat_ctime; + } + ne->net_entry_stail->net_stat_next = ns; + ne->net_entry_stail = ns; + } + + ne->net_entry_scount++; + return (DLADM_STATUS_OK); +} + +/* Add a flow/link descriptor record to the table */ +static dladm_status_t +add_desc(net_table_t *net_table, ea_file_t *ef, int nobjs) +{ + net_desc_t *nd; + net_desc_t *dnd; + int count; + ea_object_t scratch; + + if ((nd = calloc(1, sizeof (net_desc_t))) == NULL) + return (DLADM_STATUS_NOMEM); + nd->net_desc_newrec = B_TRUE; + + for (count = 0; count < nobjs; count++) { + if (ea_get_object(ef, &scratch) == -1) { + free(nd); + return (DLADM_STATUS_NOMEM); + } + add_desc_item(&scratch, nd); + } + if ((dnd = get_ndesc(net_table, nd)) != NULL) { + dnd->net_desc_newrec = B_TRUE; + free(nd); + return (DLADM_STATUS_OK); + } + if (add_desc_to_tbl(net_table, nd) != 0) { + free(nd); + return (DLADM_STATUS_NOMEM); + } + return (DLADM_STATUS_OK); +} + +/* Make an entry into the time sorted list */ +static void +addto_time_list(net_table_t *net_table, net_time_entry_t *nt, + net_time_entry_t *ntc) +{ + net_stat_t *ns = nt->my_time_stat; + net_stat_t *ns1; + net_time_entry_t *end; + net_time_t *t1; + int count; + + t1 
= &ns->net_stat_time; + + net_table->net_time_entries++; + + if (net_table->net_time_head == NULL) { + net_table->net_time_head = nt; + net_table->net_time_tail = nt; + } else { + net_table->net_time_tail->net_time_entry_next = nt; + nt->net_time_entry_prev = net_table->net_time_tail; + net_table->net_time_tail = nt; + } + + if (net_table->net_ctime_head == NULL) { + net_table->net_ctime_head = ntc; + net_table->net_ctime_tail = ntc; + } else { + end = net_table->net_ctime_tail; + count = 0; + while (count < net_table->net_time_entries - 1) { + ns1 = end->my_time_stat; + /* Just add it to the tail */ + if (compare_date(t1, &ns1->net_stat_time) == + NET_DATE_GREATER) { + break; + } + if ((strlen(ns1->net_stat_name) == + strlen(ns->net_stat_name)) && + (strncmp(ns1->net_stat_name, ns->net_stat_name, + strlen(ns1->net_stat_name)) == 0)) { + ntc->net_time_entry_next = + end->net_time_entry_next; + if (end->net_time_entry_next != NULL) { + end->net_time_entry_next-> + net_time_entry_prev = ntc; + } else { + net_table->net_ctime_tail = ntc; + } + end->net_time_entry_next = ntc; + ntc->net_time_entry_prev = end; + return; + } + count++; + end = end->net_time_entry_prev; + } + net_table->net_ctime_tail->net_time_entry_next = ntc; + ntc->net_time_entry_prev = net_table->net_ctime_tail; + net_table->net_ctime_tail = ntc; + } +} + +/* Add stat entry into the lists */ +static dladm_status_t +add_stats(net_table_t *net_table, ea_file_t *ef, int nobjs) +{ + net_stat_t *ns; + int count; + ea_object_t scratch; + net_time_entry_t *nt; + net_time_entry_t *ntc; + + if ((ns = calloc(1, sizeof (net_stat_t))) == NULL) + return (DLADM_STATUS_NOMEM); + + if ((nt = calloc(1, sizeof (net_time_entry_t))) == NULL) { + free(ns); + return (DLADM_STATUS_NOMEM); + } + if ((ntc = calloc(1, sizeof (net_time_entry_t))) == NULL) { + free(ns); + free(nt); + return (DLADM_STATUS_NOMEM); + } + + nt->my_time_stat = ns; + ntc->my_time_stat = ns; + + for (count = 0; count < nobjs; count++) { + if (ea_get_object(ef, &scratch) == -1) { + free(ns); + free(nt); + free(ntc); + return (DLADM_STATUS_NOMEM); + } + add_stat_item(&scratch, ns); + } + if (add_stat_to_tbl(net_table, ns) != 0) { + free(ns); + free(nt); + free(ntc); + return (DLADM_STATUS_NOMEM); + } + addto_time_list(net_table, nt, ntc); + return (DLADM_STATUS_OK); +} + +/* Free the entire table */ +static void +free_logtable(net_table_t *net_table) +{ + net_entry_t *head; + net_entry_t *next; + net_stat_t *ns; + net_stat_t *ns1; + net_time_entry_t *thead; + net_time_entry_t *tnext; + + thead = net_table->net_time_head; + while (thead != NULL) { + thead->my_time_stat = NULL; + tnext = thead->net_time_entry_next; + thead->net_time_entry_next = NULL; + thead->net_time_entry_prev = NULL; + free(thead); + thead = tnext; + } + net_table->net_time_head = NULL; + net_table->net_time_tail = NULL; + + thead = net_table->net_ctime_head; + while (thead != NULL) { + thead->my_time_stat = NULL; + tnext = thead->net_time_entry_next; + thead->net_time_entry_next = NULL; + thead->net_time_entry_prev = NULL; + free(thead); + thead = tnext; + } + net_table->net_ctime_head = NULL; + net_table->net_ctime_tail = NULL; + + net_table->net_time_entries = 0; + + head = net_table->net_table_head; + while (head != NULL) { + next = head->net_entry_next; + head->net_entry_next = NULL; + ns = head->net_entry_shead; + while (ns != NULL) { + ns1 = ns->net_stat_next; + free(ns); + ns = ns1; + } + head->net_entry_scount = 0; + head->net_entry_sref = NULL; + free(head->net_entry_desc); + 
free(head->net_entry_tstats); + free(head); + head = next; + } + net_table->net_table_head = NULL; + net_table->net_table_tail = NULL; + net_table->net_time_entries = 0; + free(net_table); +} + +/* Parse the exacct file, and return the parsed table. */ +static void * +parse_logfile(char *file, int logtype, dladm_status_t *status) +{ + ea_file_t ef; + ea_object_t scratch; + net_table_t *net_table; + + *status = DLADM_STATUS_OK; + if ((net_table = calloc(1, sizeof (net_table_t))) == NULL) { + *status = DLADM_STATUS_NOMEM; + return (NULL); + } + if (ea_open(&ef, file, NULL, 0, O_RDONLY, 0) == -1) { + *status = DLADM_STATUS_BADARG; + free(net_table); + return (NULL); + } + bzero(&scratch, sizeof (ea_object_t)); + while (ea_get_object(&ef, &scratch) != -1) { + if (scratch.eo_type != EO_GROUP) { + (void) ea_free_item(&scratch, EUP_ALLOC); + (void) bzero(&scratch, sizeof (ea_object_t)); + continue; + } + /* Read Link Desc/Stat records */ + if (logtype == DLADM_LOGTYPE_FLOW) { + /* Flow Descriptor */ + if ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_DESC) { + (void) add_desc(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + /* Flow Stats */ + } else if ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_STATS) { + (void) add_stats(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + } + } else if (logtype == DLADM_LOGTYPE_LINK) { + /* Link Descriptor */ + if ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_LINK_DESC) { + (void) add_desc(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + /* Link Stats */ + } else if ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_LINK_STATS) { + (void) add_stats(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + } + } else { + if (((scratch.eo_catalog & EXD_DATA_MASK) == + EXD_GROUP_NET_LINK_DESC) || ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_DESC)) { + (void) add_desc(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + } else if (((scratch.eo_catalog & EXD_DATA_MASK) == + EXD_GROUP_NET_LINK_STATS) || ((scratch.eo_catalog & + EXD_DATA_MASK) == EXD_GROUP_NET_FLOW_STATS)) { + (void) add_stats(net_table, &ef, + scratch.eo_group.eg_nobjs - 1); + } + } + (void) ea_free_item(&scratch, EUP_ALLOC); + (void) bzero(&scratch, sizeof (ea_object_t)); + } + + (void) ea_close(&ef); + return ((void *)net_table); +} + +/* + * Walk the ctime list. This is used when looking for usage records + * based on a "resource" name. 
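+ * stime and etime, when non-NULL, use the M/D/Y,H:M:S format parsed + * by dissect_time().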
+ */ +dladm_status_t +dladm_walk_usage_res(int (*fn)(dladm_usage_t *, void *), int logtype, + char *logfile, char *resource, char *stime, char *etime, void *arg) +{ + net_table_t *net_table; + net_time_t st, et; + net_time_entry_t *start; + net_stat_t *ns = NULL; + net_stat_t *nns; + uint64_t tot_time = 0; + uint64_t last_time; + uint64_t tot_bytes = 0; + uint64_t tot_ibytes = 0; + uint64_t tot_obytes = 0; + boolean_t gotstart = B_FALSE; + dladm_status_t status; + dladm_usage_t usage; + int step = 1; + + /* Parse the log file */ + net_table = parse_logfile(logfile, logtype, &status); + if (net_table == NULL) + return (status); + + if (net_table->net_entries == 0) + return (DLADM_STATUS_OK); + start = net_table->net_ctime_head; + + /* Time range */ + status = get_time_range(net_table->net_ctime_head, + net_table->net_ctime_tail, &st, &et, stime, etime); + if (status != DLADM_STATUS_OK) + return (status); + + while (start != NULL) { + nns = start->my_time_stat; + + /* Get to the resource we are interested in */ + if ((strlen(resource) != strlen(nns->net_stat_name)) || + (strncmp(resource, nns->net_stat_name, + strlen(nns->net_stat_name)) != 0)) { + start = start->net_time_entry_next; + continue; + } + + /* Find the first record */ + if (!gotstart) { + get_starting_point(start, &start, &st, stime, + &last_time); + if (start == NULL) + break; + nns = start->my_time_stat; + gotstart = B_TRUE; + } + + /* Write one entry and return if we are out of the range */ + if (etime != NULL && compare_time(&nns->net_stat_time, &et) + == NET_TIME_GREATER) { + if (tot_bytes != 0) { + bcopy(ns->net_stat_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&last_time, &usage.du_stime, + sizeof (usage.du_stime)); + bcopy(&ns->net_stat_ctime, &usage.du_etime, + sizeof (usage.du_etime)); + usage.du_rbytes = tot_ibytes; + usage.du_obytes = tot_obytes; + usage.du_bandwidth = tot_bytes*8/tot_time; + usage.du_last = B_TRUE; + fn(&usage, arg); + } + return (DLADM_STATUS_OK); + } + + /* + * If this is a reference entry, just print what we have + * and proceed. 
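+		 * The running totals are flushed and reset before moving on.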
+ */ + if (nns->net_stat_isref) { + if (tot_bytes != 0) { + bcopy(&nns->net_stat_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&nns->net_stat_ctime, &usage.du_stime, + sizeof (usage.du_stime)); + usage.du_rbytes = tot_ibytes; + usage.du_obytes = tot_obytes; + usage.du_bandwidth = tot_bytes*8/tot_time; + usage.du_last = B_TRUE; + fn(&usage, arg); + NET_RESET_TOT(tot_bytes, tot_time, tot_ibytes, + tot_obytes, step); + } + last_time = nns->net_stat_ctime; + start = start->net_time_entry_next; + continue; + } + + ns = nns; + if (--step == 0) { + tot_bytes += ns->net_stat_ibytes + ns->net_stat_obytes; + tot_ibytes += ns->net_stat_ibytes; + tot_obytes += ns->net_stat_obytes; + tot_time += ns->net_stat_tdiff; + bcopy(&ns->net_stat_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&last_time, &usage.du_stime, + sizeof (usage.du_stime)); + bcopy(&ns->net_stat_ctime, &usage.du_etime, + sizeof (usage.du_etime)); + usage.du_rbytes = tot_ibytes; + usage.du_obytes = tot_obytes; + usage.du_bandwidth = tot_bytes*8/tot_time; + usage.du_last = B_TRUE; + fn(&usage, arg); + + NET_RESET_TOT(tot_bytes, tot_time, tot_ibytes, + tot_obytes, step); + last_time = ns->net_stat_ctime; + } else { + tot_bytes += ns->net_stat_ibytes + ns->net_stat_obytes; + tot_ibytes += ns->net_stat_ibytes; + tot_obytes += ns->net_stat_obytes; + tot_time += ns->net_stat_tdiff; + } + start = start->net_time_entry_next; + } + + if (tot_bytes != 0) { + bcopy(&ns->net_stat_name, &usage.du_name, + sizeof (usage.du_name)); + bcopy(&last_time, &usage.du_stime, + sizeof (usage.du_stime)); + bcopy(&ns->net_stat_ctime, &usage.du_etime, + sizeof (usage.du_etime)); + usage.du_rbytes = tot_ibytes; + usage.du_obytes = tot_obytes; + usage.du_bandwidth = tot_bytes*8/tot_time; + usage.du_last = B_TRUE; + fn(&usage, arg); + } + + free_logtable(net_table); + return (status); +} + +/* + * Walk the time sorted list if a resource is not specified. + */ +dladm_status_t +dladm_walk_usage_time(int (*fn)(dladm_usage_t *, void *), int logtype, + char *logfile, char *stime, char *etime, void *arg) +{ + net_table_t *net_table; + net_time_entry_t *start; + net_stat_t *ns = NULL, *nns; + net_time_t st, et, *t1; + net_desc_t *nd; + net_entry_t *ne; + net_plot_entry_t *pe; + int count; + int step = 1; + int nentries = 0, pentries = 0; + uint64_t last_time; + dladm_status_t status; + + /* Parse the log file */ + net_table = parse_logfile(logfile, logtype, &status); + if (net_table == NULL) + return (status); + + if (net_table->net_entries == 0) + return (DLADM_STATUS_OK); + start = net_table->net_time_head; + + /* Find the first and last records and starting point */ + status = get_time_range(net_table->net_time_head, + net_table->net_time_tail, &st, &et, stime, etime); + if (status != DLADM_STATUS_OK) + return (status); + get_starting_point(start, &start, &st, stime, &last_time); + /* + * Could assert to be non-null, since get_time_range() + * would have adjusted. + */ + if (start == NULL) + return (DLADM_STATUS_BADTIMEVAL); + + /* + * Collect entries for all resources in a time slot before + * writing to the file. 
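+	 * One net_plot_entry_t is filled in per resource.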
+ */
+	nentries = net_table->net_entries;
+
+	pe = malloc(sizeof (net_plot_entry_t) * (net_table->net_entries + 1));
+	if (pe == NULL)
+		return (DLADM_STATUS_NOMEM);
+
+	ne = net_table->net_table_head;
+	for (count = 0; count < nentries; count++) {
+		nd = ne->net_entry_desc;
+		pe[count].net_pe_name = nd->net_desc_name;
+		ne = ne->net_entry_next;
+	}
+
+	clear_pe(pe, nentries, &pentries);
+
+	/* Write header to file */
+	/* add_pe_to_file(fn, pe, ns, nentries, arg); */
+
+	t1 = &start->my_time_stat->net_stat_time;
+
+	while (start != NULL) {
+
+		nns = start->my_time_stat;
+		/*
+		 * We have crossed the time boundary; check whether we need
+		 * to print out now.
+		 */
+		if (compare_time(&nns->net_stat_time, t1) ==
+		    NET_TIME_GREATER) {
+			/* Return if we are out of the range. */
+			if (etime != NULL &&
+			    compare_time(&nns->net_stat_time, &et) ==
+			    NET_TIME_GREATER) {
+				if (pentries > 0) {
+					add_pe_to_file(fn, pe, ns, nentries,
+					    arg);
+					clear_pe(pe, nentries, &pentries);
+				}
+				free(pe);
+				return (DLADM_STATUS_OK);
+			}
+			/* Update the stats from ns. */
+			t1 = &nns->net_stat_time;
+			last_time = ns->net_stat_ctime;
+			if (--step == 0) {
+				if (pentries > 0) {
+					add_pe_to_file(fn, pe, ns, nentries,
+					    arg);
+					clear_pe(pe, nentries, &pentries);
+				}
+				step = 1;
+			}
+		}
+
+		/*
+		 * If this is a reference entry, just print what we have
+		 * for this resource and proceed. We will end up writing
+		 * the stats for all the entries when we hit a ref element,
+		 * which means 'steps' for some might not be accurate, but
+		 * that is fine; the alternative is to write only the
+		 * resource for which we hit a reference entry.
+		 */
+		if (nns->net_stat_isref) {
+			if (pentries > 0) {
+				add_pe_to_file(fn, pe, ns, nentries, arg);
+				clear_pe(pe, nentries, &pentries);
+			}
+			step = 1;
+		} else {
+			update_pe(pe, nns, nentries, &pentries, last_time);
+		}
+		ns = nns;
+		start = start->net_time_entry_next;
+	}
+
+	if (pentries > 0)
+		add_pe_to_file(fn, pe, ns, nentries, arg);
+
+	free(pe);
+	free_logtable(net_table);
+
+	return (DLADM_STATUS_OK);
+}
+
+dladm_status_t
+dladm_usage_summary(int (*fn)(dladm_usage_t *, void *), int logtype,
+    char *logfile, void *arg)
+{
+	net_table_t	*net_table;
+	net_entry_t	*ne;
+	net_desc_t	*nd;
+	net_stat_t	*ns;
+	int		count;
+	dladm_usage_t	usage;
+	dladm_status_t	status;
+
+	/* Parse the log file */
+	net_table = parse_logfile(logfile, logtype, &status);
+	if (net_table == NULL)
+		return (status);
+
+	if (net_table->net_entries == 0)
+		return (DLADM_STATUS_OK);
+
+	ne = net_table->net_table_head;
+	for (count = 0; count < net_table->net_entries;
+	    count++, ne = ne->net_entry_next) {
+		ns = ne->net_entry_tstats;
+		nd = ne->net_entry_desc;
+
+		if (ns->net_stat_ibytes + ns->net_stat_obytes == 0)
+			continue;
+		bcopy(&nd->net_desc_name, &usage.du_name,
+		    sizeof (usage.du_name));
+		usage.du_duration = ne->net_entry_ttime;
+		usage.du_ipackets = ns->net_stat_ipackets;
+		usage.du_rbytes = ns->net_stat_ibytes;
+		usage.du_opackets = ns->net_stat_opackets;
+		usage.du_obytes = ns->net_stat_obytes;
+		usage.du_bandwidth =
+		    (ns->net_stat_ibytes + ns->net_stat_obytes) * 8 /
+		    usage.du_duration;
+		usage.du_last = (count == net_table->net_entries-1);
+		fn(&usage, arg);
+	}
+
+	free_logtable(net_table);
+	return (DLADM_STATUS_OK);
+}
+
+/*
+ * Walk the ctime list and display the dates of the records.
+ */ +dladm_status_t +dladm_usage_dates(int (*fn)(dladm_usage_t *, void *), int logtype, + char *logfile, char *resource, void *arg) +{ + net_table_t *net_table; + net_time_entry_t *start; + net_stat_t *nns; + net_time_t st; + net_time_t *lasttime = NULL; + uint64_t last_time; + boolean_t gotstart = B_FALSE; + dladm_status_t status; + dladm_usage_t usage; + + /* Parse the log file */ + net_table = parse_logfile(logfile, logtype, &status); + if (net_table == NULL) + return (status); + + if (net_table->net_entries == 0) + return (DLADM_STATUS_OK); + + start = net_table->net_ctime_head; + + while (start != NULL) { + nns = start->my_time_stat; + + /* get to the resource we are interested in */ + if (resource != NULL) { + if ((strlen(resource) != strlen(nns->net_stat_name)) || + (strncmp(resource, nns->net_stat_name, + strlen(nns->net_stat_name)) != 0)) { + start = start->net_time_entry_next; + continue; + } + } + + /* get the starting point in the logfile */ + if (!gotstart) { + get_starting_point(start, &start, &st, NULL, + &last_time); + if (start == NULL) + break; + nns = start->my_time_stat; + gotstart = B_TRUE; + } + + if (lasttime == NULL || + compare_date(&nns->net_stat_time, lasttime) == + NET_DATE_GREATER) { + bzero(&usage, sizeof (dladm_usage_t)); + bcopy(&nns->net_stat_ctime, &usage.du_stime, + sizeof (usage.du_stime)); + fn(&usage, arg); + lasttime = &nns->net_stat_time; + } + + start = start->net_time_entry_next; + continue; + } + + free_logtable(net_table); + return (status); +} diff --git a/usr/src/lib/libsecdb/exec_attr.txt b/usr/src/lib/libsecdb/exec_attr.txt index ae7d769e2a..e0ef11b073 100644 --- a/usr/src/lib/libsecdb/exec_attr.txt +++ b/usr/src/lib/libsecdb/exec_attr.txt @@ -193,6 +193,8 @@ Network Management:solaris:cmd:::/sbin/routeadm:euid=0;\ privs=proc_chroot,proc_owner,sys_ip_config Network Management:solaris:cmd:::/sbin/dladm:euid=dladm;egid=sys;\ privs=sys_dl_config,net_rawaccess,proc_audit +Network Management:solaris:cmd:::/sbin/flowadm:euid=dladm;egid=sys;\ + privs=sys_dl_config,net_rawaccess,proc_audit Network Management:suser:cmd:::/usr/bin/netstat:uid=0 Network Management:suser:cmd:::/usr/bin/rup:euid=0 Network Management:suser:cmd:::/usr/bin/ruptime:euid=0 diff --git a/usr/src/lib/libsecdb/help/auths/Makefile b/usr/src/lib/libsecdb/help/auths/Makefile index 8bc756895f..42d1d72c96 100644 --- a/usr/src/lib/libsecdb/help/auths/Makefile +++ b/usr/src/lib/libsecdb/help/auths/Makefile @@ -70,6 +70,7 @@ HTMLENTS = \ SmfExAcctFlowStates.html \ SmfExAcctProcessStates.html \ SmfExAcctTaskStates.html \ + SmfExAcctNetStates.html \ SmfHeader.html \ SmfInetdStates.html \ SmfIPsecStates.html \ @@ -93,6 +94,7 @@ HTMLENTS = \ SmfValueExAcctFlow.html \ SmfValueExAcctProcess.html \ SmfValueExAcctTask.html \ + SmfValueExAcctNet.html \ SmfVtStates.html \ SmfValueHeader.html \ SmfValueInetd.html \ diff --git a/usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html b/usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html new file mode 100644 index 0000000000..e042637323 --- /dev/null +++ b/usr/src/lib/libsecdb/help/auths/SmfExAcctNetStates.html @@ -0,0 +1,37 @@ +<HTML> +<!-- + CDDL HEADER START + + The contents of this file are subject to the terms of the + Common Development and Distribution License (the "License"). + You may not use this file except in compliance with the License. + + You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + or http://www.opensolaris.org/os/licensing. 
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+Use is subject to license terms.
+-->
+<!--
+ <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=iso-8859-1">
+-->
+<BODY>
+When Manage Net Extended Accounting Service States is in the Authorizations
+Included column, it grants the authorization to enable or disable net
+extended accounting.
+<p>
+If Manage Net Extended Accounting Service States is grayed, then you are not
+entitled to Add or Remove this authorization.
+<BR>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html b/usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html
new file mode 100644
index 0000000000..52f735c4b9
--- /dev/null
+++ b/usr/src/lib/libsecdb/help/auths/SmfValueExAcctNet.html
@@ -0,0 +1,35 @@
+<HTML>
+<!--
+ CDDL HEADER START
+
+ The contents of this file are subject to the terms of the
+ Common Development and Distribution License (the "License").
+ You may not use this file except in compliance with the License.
+
+ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ or http://www.opensolaris.org/os/licensing.
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+Use is subject to license terms.
+-->
+
+<BODY>
+When Change Values of Net Extended Accounting Service Properties is in the
+Authorizations Included column, it grants the authorization to change
+net extended accounting configuration parameter values.
+<P>
+If Change Values of Net Extended Accounting Service Properties is grayed,
+then you are not entitled to Add or Remove this authorization.
+<p>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/help/profiles/Makefile b/usr/src/lib/libsecdb/help/profiles/Makefile
index 37f9608f0b..0d93f0929b 100644
--- a/usr/src/lib/libsecdb/help/profiles/Makefile
+++ b/usr/src/lib/libsecdb/help/profiles/Makefile
@@ -38,6 +38,7 @@ HTMLENTS = \
 	RtExAcctFlow.html \
 	RtExAcctProcess.html \
 	RtExAcctTask.html \
+	RtExAcctNet.html \
 	RtLogMngmnt.html \
 	RtDeviceMngmnt.html \
 	RtDeviceSecurity.html \
diff --git a/usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html b/usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html
new file mode 100644
index 0000000000..25861d980e
--- /dev/null
+++ b/usr/src/lib/libsecdb/help/profiles/RtExAcctNet.html
@@ -0,0 +1,39 @@
+<HTML>
+<!--
+ CDDL HEADER START
+
+ The contents of this file are subject to the terms of the
+ Common Development and Distribution License (the "License").
+ You may not use this file except in compliance with the License.
+
+ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ or http://www.opensolaris.org/os/licensing.
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+-- Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+-- Use is subject to license terms.
+-->
+<HEAD>
+ <TITLE> </TITLE>
+
+
+</HEAD>
+<BODY>
+When Manage the Net Extended Accounting service is in the Rights Included
+column, it grants the right to commands needed to administer net extended
+accounting.
+<p>
+If Manage the Net Extended Accounting service is grayed, then you are not
+entitled to Add or Remove this right.
+<p>
+</BODY>
+</HTML>
diff --git a/usr/src/lib/libsecdb/prof_attr.txt b/usr/src/lib/libsecdb/prof_attr.txt
index 9799ec15c2..ccf8b5081f 100644
--- a/usr/src/lib/libsecdb/prof_attr.txt
+++ b/usr/src/lib/libsecdb/prof_attr.txt
@@ -44,6 +44,7 @@ DHCP Management:::Manage the DHCP service:auths=solaris.dhcpmgr.*;help=RtDHCPMng
 Extended Accounting Flow Management:::Manage the Flow Extended Accounting service:auths=solaris.smf.manage.extended-accounting.flow,solaris.smf.value.extended-accounting.flow;profiles=acctadm;help=RtExActtFlow.html
 Extended Accounting Process Management:::Manage the Process Extended Accounting service:auths=solaris.smf.manage.extended-accounting.process,solaris.smf.value.extended-accounting.process;profiles=acctadm;hep=RtExAcctProcess.html
 Extended Accounting Task Management:::Manage the Task Extended Accounting service:auths=solaris.smf.manage.extended-accounting.task,solaris.smf.value.extended-accounting.task;profiles=acctadm;help=RtExAcctTask.html
+Extended Accounting Net Management:::Manage the Net Extended Accounting service:auths=solaris.smf.manage.extended-accounting.net,solaris.smf.value.extended-accounting.net;profiles=acctadm;help=RtExAcctNet.html
 File System Management:::Manage, mount, share file systems:profiles=SMB Management,VSCAN Management,SMBFS Management;auths=solaris.smf.manage.autofs,solaris.smf.manage.shares.*,solaris.smf.value.shares.*;help=RtFileSysMngmnt.html
 File System Security:::Manage file system security attributes:help=RtFileSysSecurity.html
 HAL Management:::Manage HAL SMF service:auths=solaris.smf.manage.hal;help=RtHALMngmnt.html
diff --git a/usr/src/pkgdefs/SUNW0on/prototype_com b/usr/src/pkgdefs/SUNW0on/prototype_com
index 14419f0097..34c71c492a 100644
--- a/usr/src/pkgdefs/SUNW0on/prototype_com
+++ b/usr/src/pkgdefs/SUNW0on/prototype_com
@@ -242,6 +242,7 @@ f none usr/lib/help/auths/locale/SmfCronStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfExAcctFlowStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfExAcctProcessStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfExAcctTaskStates.html 444 root bin
+f none usr/lib/help/auths/locale/SmfExAcctNetStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfHeader.html 444 root bin
 f none usr/lib/help/auths/locale/SmfInetdStates.html 444 root bin
 f none usr/lib/help/auths/locale/SmfManageHeader.html 444 root bin
@@ -267,6 +268,7 @@ f none usr/lib/help/auths/locale/SmfValueCoreadm.html 444 root bin
 f none usr/lib/help/auths/locale/SmfValueExAcctFlow.html 444
root bin f none usr/lib/help/auths/locale/SmfValueExAcctProcess.html 444 root bin f none usr/lib/help/auths/locale/SmfValueExAcctTask.html 444 root bin +f none usr/lib/help/auths/locale/SmfValueExAcctNet.html 444 root bin f none usr/lib/help/auths/locale/SmfVtStates.html 444 root bin f none usr/lib/help/auths/locale/SmfValueHeader.html 444 root bin f none usr/lib/help/auths/locale/SmfValueInetd.html 444 root bin @@ -344,6 +346,7 @@ f none usr/lib/help/profiles/locale/RtDeviceMngmnt.html 444 root bin f none usr/lib/help/profiles/locale/RtExAcctFlow.html 444 root bin f none usr/lib/help/profiles/locale/RtExAcctProcess.html 444 root bin f none usr/lib/help/profiles/locale/RtExAcctTask.html 444 root bin +f none usr/lib/help/profiles/locale/RtExAcctNet.html 444 root bin f none usr/lib/help/profiles/locale/RtPrntAdmin.html 444 root bin f none usr/lib/help/profiles/locale/RtConsUser.html 444 root bin f none usr/lib/help/profiles/locale/RtContractObserver.html 444 root bin diff --git a/usr/src/pkgdefs/SUNWcnetr/postinstall b/usr/src/pkgdefs/SUNWcnetr/postinstall index cb6ab86de9..20d09c70ee 100644 --- a/usr/src/pkgdefs/SUNWcnetr/postinstall +++ b/usr/src/pkgdefs/SUNWcnetr/postinstall @@ -109,6 +109,44 @@ if [ -f "${ORIG}" ]; then removef -f $PKGINST > /dev/null 2>&1 fi +# Convert hostname.xxx and zonecfg vlan entries +host_ifs=`ls -1 $rootprefix/etc | egrep -e '^hostname.|^hostname6.|^dhcp.'| \ + cut -d . -f2 | sort -u` + +zones=`zoneadm list -c | grep -v global` +for zone in $zones +do + zonecfg -z $zone info ip-type | grep exclusive >/dev/null + if [ $? -eq 0 ]; then + zif=`zonecfg -z $zone info net | grep physical | \ + nawk '{print $2}'` + zone_ifs="$zone_ifs $zif" + fi +done + +ORIG=$BASEDIR/etc/dladm/datalink.conf +for ifname in $host_ifs $zone_ifs +do + grep $ifname $ORIG >/dev/null + if [ $? != 0 ]; then + phys=`echo $ifname | sed "s/[0-9]*$//"` + devnum=`echo $ifname | sed "s/$phys//g"` + if [ "$phys$devnum" != $ifname -o \ + -n "`echo $devnum | tr -d '[0-9]'`" ]; then + echo "skipping invalid interface $ifname" + continue + fi + + vid=`expr $devnum / 1000` + inst=`expr $devnum % 1000` + + if [ "$vid" != "0" ]; then + echo dladm create-vlan -l $phys$inst -v $vid \ + $ifname >> ${PKG_INSTALL_ROOT}/$UPGRADE_SCRIPT + fi + fi +done + # # Change permissions of public IKE certificates and CRLs # that may have been incorrectly created as private diff --git a/usr/src/pkgdefs/SUNWcnetr/prototype_com b/usr/src/pkgdefs/SUNWcnetr/prototype_com index 307a2a7303..7091ec4bc5 100644 --- a/usr/src/pkgdefs/SUNWcnetr/prototype_com +++ b/usr/src/pkgdefs/SUNWcnetr/prototype_com @@ -22,7 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # This required package information file contains a list of package contents. 
# The 'pkgmk' command uses this file to identify the contents of a package @@ -53,6 +52,8 @@ d none etc 755 root sys d none etc/dladm 755 dladm sys e preserve etc/dladm/secobj.conf 600 dladm sys e preserve etc/dladm/datalink.conf 644 dladm sys +e preserve etc/dladm/flowadm.conf 644 dladm sys +e preserve etc/dladm/flowprop.conf 644 dladm sys d none etc/default 755 root sys e dhcpagent etc/default/dhcpagent 644 root sys e preserve etc/default/inetinit 644 root sys @@ -74,3 +75,4 @@ e sock2path etc/inet/sock2path 444 root sys s none etc/sock2path=./inet/sock2path d none sbin 755 root sys f none sbin/dladm 555 root bin +f none sbin/flowadm 555 root bin diff --git a/usr/src/pkgdefs/SUNWcsu/prototype_com b/usr/src/pkgdefs/SUNWcsu/prototype_com index c3505988cb..b1021d4267 100644 --- a/usr/src/pkgdefs/SUNWcsu/prototype_com +++ b/usr/src/pkgdefs/SUNWcsu/prototype_com @@ -482,6 +482,7 @@ f none usr/lib/help/auths/locale/C/SmfCronStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfExAcctFlowStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfExAcctProcessStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfExAcctTaskStates.html 444 root bin +f none usr/lib/help/auths/locale/C/SmfExAcctNetStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfHeader.html 444 root bin f none usr/lib/help/auths/locale/C/SmfManageHeader.html 444 root bin f none usr/lib/help/auths/locale/C/SmfMDNSStates.html 444 root bin @@ -506,6 +507,7 @@ f none usr/lib/help/auths/locale/C/SmfValueCoreadm.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueExAcctFlow.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueExAcctProcess.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueExAcctTask.html 444 root bin +f none usr/lib/help/auths/locale/C/SmfValueExAcctNet.html 444 root bin f none usr/lib/help/auths/locale/C/SmfVtStates.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueHeader.html 444 root bin f none usr/lib/help/auths/locale/C/SmfValueInetd.html 444 root bin @@ -564,6 +566,7 @@ f none usr/lib/help/profiles/locale/C/RtCryptoMngmnt.html 444 root bin f none usr/lib/help/profiles/locale/C/RtExAcctFlow.html 444 root bin f none usr/lib/help/profiles/locale/C/RtExAcctProcess.html 444 root bin f none usr/lib/help/profiles/locale/C/RtExAcctTask.html 444 root bin +f none usr/lib/help/profiles/locale/C/RtExAcctNet.html 444 root bin f none usr/lib/help/profiles/locale/C/RtDHCPMngmnt.html 444 root bin f none usr/lib/help/profiles/locale/C/RtDatAdmin.html 444 root bin f none usr/lib/help/profiles/locale/C/RtDefault.html 444 root bin @@ -683,6 +686,7 @@ f none usr/lib/rcm/modules/SUNW_ip_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_mpxio_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_network_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_vlan_rcm.so 555 root bin +f none usr/lib/rcm/modules/SUNW_vnic_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_aggr_rcm.so 555 root bin f none usr/lib/rcm/modules/SUNW_swap_rcm.so 555 root bin f none usr/lib/rcm/rcm_daemon 555 root bin @@ -828,6 +832,7 @@ s none usr/sbin/edquota=../lib/fs/ufs/edquota f none usr/sbin/eeprom 2555 root sys s none usr/sbin/fdisk=../../sbin/fdisk f none usr/sbin/ff 555 root bin +s none usr/sbin/flowadm=../../sbin/flowadm s none usr/sbin/fiocompress=../../sbin/fiocompress f none usr/sbin/fmthard 555 root sys f none usr/sbin/format 555 root bin diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_i386 b/usr/src/pkgdefs/SUNWmdb/prototype_i386 index f7620e480d..05c255e659 100644 --- 
a/usr/src/pkgdefs/SUNWmdb/prototype_i386 +++ b/usr/src/pkgdefs/SUNWmdb/prototype_i386 @@ -71,6 +71,7 @@ f none usr/lib/mdb/kvm/amd64/ipp.so 555 root sys f none usr/lib/mdb/kvm/amd64/krtld.so 555 root sys f none usr/lib/mdb/kvm/amd64/lofs.so 555 root sys f none usr/lib/mdb/kvm/amd64/logindmux.so 555 root sys +f none usr/lib/mdb/kvm/amd64/mac.so 555 root sys f none usr/lib/mdb/kvm/amd64/md.so 555 root sys f none usr/lib/mdb/kvm/amd64/mdb_kb.so 555 root sys f none usr/lib/mdb/kvm/amd64/mdb_ks.so 555 root sys @@ -103,6 +104,7 @@ f none usr/lib/mdb/kvm/ipp.so 555 root sys f none usr/lib/mdb/kvm/krtld.so 555 root sys f none usr/lib/mdb/kvm/lofs.so 555 root sys f none usr/lib/mdb/kvm/logindmux.so 555 root sys +f none usr/lib/mdb/kvm/mac.so 555 root sys f none usr/lib/mdb/kvm/md.so 555 root sys f none usr/lib/mdb/kvm/mdb_kb.so 555 root sys f none usr/lib/mdb/kvm/mdb_ks.so 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_sparc b/usr/src/pkgdefs/SUNWmdb/prototype_sparc index 7e6878d47e..51f5c49182 100644 --- a/usr/src/pkgdefs/SUNWmdb/prototype_sparc +++ b/usr/src/pkgdefs/SUNWmdb/prototype_sparc @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" !include prototype_com @@ -53,6 +52,7 @@ f none usr/lib/mdb/kvm/sparcv9/isp.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/krtld.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/lofs.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/logindmux.so 555 root sys +f none usr/lib/mdb/kvm/sparcv9/mac.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/md.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/mdb_ks.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/mpt.so 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_i386 b/usr/src/pkgdefs/SUNWmdbr/prototype_i386 index 24755c9731..237c1da83b 100644 --- a/usr/src/pkgdefs/SUNWmdbr/prototype_i386 +++ b/usr/src/pkgdefs/SUNWmdbr/prototype_i386 @@ -41,6 +41,7 @@ f none kernel/kmdb/amd64/ipp 555 root sys f none kernel/kmdb/amd64/krtld 555 root sys f none kernel/kmdb/amd64/lofs 555 root sys f none kernel/kmdb/amd64/logindmux 555 root sys +f none kernel/kmdb/amd64/mac 555 root sys f none kernel/kmdb/amd64/md 555 root sys f none kernel/kmdb/amd64/mdb_ds 555 root sys f none kernel/kmdb/amd64/mpt 555 root sys @@ -72,6 +73,7 @@ f none kernel/kmdb/ipp 555 root sys f none kernel/kmdb/krtld 555 root sys f none kernel/kmdb/lofs 555 root sys f none kernel/kmdb/logindmux 555 root sys +f none kernel/kmdb/mac 555 root sys f none kernel/kmdb/md 555 root sys f none kernel/kmdb/mdb_ds 555 root sys f none kernel/kmdb/mpt 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc index 99bb424c63..b4057c2328 100644 --- a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" # !include prototype_com @@ -43,6 +42,7 @@ f none kernel/kmdb/sparcv9/isp 555 root sys f none kernel/kmdb/sparcv9/krtld 555 root sys f none kernel/kmdb/sparcv9/lofs 555 root sys f none kernel/kmdb/sparcv9/logindmux 555 root sys +f none kernel/kmdb/sparcv9/mac 555 root sys f none kernel/kmdb/sparcv9/md 555 root sys f none kernel/kmdb/sparcv9/mdb_ds 555 root sys f none kernel/kmdb/sparcv9/mpt 555 root sys diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386 index ee2ddf8352..e7a2d79ed1 100644 --- a/usr/src/pkgdefs/etc/exception_list_i386 +++ b/usr/src/pkgdefs/etc/exception_list_i386 @@ -89,12 +89,21 @@ usr/include/sys/dld.h i386 usr/include/sys/dld_impl.h i386 usr/include/sys/dld_ioc.h i386 usr/include/sys/dls.h i386 +usr/include/sys/dls_mgmt.h i386 usr/include/sys/dls_impl.h i386 usr/include/sys/mac.h i386 +usr/include/sys/mac_client.h i386 +usr/include/sys/mac_client_impl.h i386 +usr/include/sys/mac_flow.h i386 +usr/include/sys/mac_flow_impl.h i386 usr/include/sys/mac_impl.h i386 +usr/include/sys/mac_provider.h i386 +usr/include/sys/mac_soft_ring.h i386 # # Private GLDv3 userland libraries and headers # +usr/include/sys/vnic.h i386 +usr/include/sys/vnic_impl.h i386 usr/include/libdladm.h i386 usr/include/libdladm_impl.h i386 usr/include/libdllink.h i386 @@ -102,8 +111,11 @@ usr/include/libdlaggr.h i386 usr/include/libdlwlan.h i386 usr/include/libdlwlan_impl.h i386 usr/include/libdlvnic.h i386 +usr/include/libdlflow.h i386 +usr/include/libdlflow_impl.h i386 usr/include/libdlvlan.h i386 usr/include/libdlmgmt.h i386 +usr/include/libdlstat.h i386 lib/libdladm.so i386 lib/llib-ldladm.ln i386 lib/amd64/libdladm.so i386 @@ -528,6 +540,7 @@ lib/llib-lmeta.ln i386 # non-public pci header # usr/include/sys/pci_impl.h i386 +usr/include/sys/pci_tools.h i386 # # Exception list for RCM project, included by librcm and rcm_daemon # diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc index ece69f8eef..005ace8c07 100644 --- a/usr/src/pkgdefs/etc/exception_list_sparc +++ b/usr/src/pkgdefs/etc/exception_list_sparc @@ -78,21 +78,33 @@ usr/include/sys/dld.h sparc usr/include/sys/dld_impl.h sparc usr/include/sys/dld_ioc.h sparc usr/include/sys/dls.h sparc +usr/include/sys/dls_mgmt.h sparc usr/include/sys/dls_impl.h sparc usr/include/sys/mac.h sparc +usr/include/sys/mac_client.h sparc +usr/include/sys/mac_client_impl.h sparc +usr/include/sys/mac_flow.h sparc +usr/include/sys/mac_flow_impl.h sparc usr/include/sys/mac_impl.h sparc +usr/include/sys/mac_provider.h sparc +usr/include/sys/mac_soft_ring.h sparc # # Private GLDv3 userland libraries and headers # +usr/include/sys/vnic.h sparc +usr/include/sys/vnic_impl.h sparc usr/include/libdladm.h sparc usr/include/libdladm_impl.h sparc usr/include/libdllink.h sparc usr/include/libdlaggr.h sparc +usr/include/libdlflow.h sparc +usr/include/libdlflow_impl.h sparc usr/include/libdlwlan.h sparc usr/include/libdlwlan_impl.h sparc usr/include/libdlvnic.h sparc usr/include/libdlvlan.h sparc usr/include/libdlmgmt.h sparc +usr/include/libdlstat.h sparc lib/libdladm.so sparc lib/llib-ldladm.ln sparc lib/sparcv9/libdladm.so sparc @@ -531,6 +543,7 @@ lib/llib-lmeta.ln sparc # non-public pci header # usr/include/sys/pci_impl.h sparc +usr/include/sys/pci_tools.h sparc # # Exception list for RCM project, included by librcm and rcm_daemon # diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index 88404359fd..100d0e594d 100644 --- 
a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -666,7 +666,7 @@ inetd_conf_svm_hack() { } upgrade_aggr_and_linkprop () { - # Since aggregation.conf and linkprop.conf are upgraded by + # Since aggregation.conf and linkprop.conf are upgraded by # SUNWcnetr's postinstall script, put the relevant portions of the # postinstall script here, modified to rename the old files instead # of removing them. @@ -756,6 +756,30 @@ upgrade_aggr_and_linkprop () { fi } +upgrade_vlan () { + # Convert hostname.*** and zonecfg vlan configurations + UPGRADE_SCRIPT=/var/svc/profile/upgrade_datalink + + for ifname in $host_ifs $zone_ifs + do + phys=`echo $ifname | sed "s/[0-9]*$//"` + devnum=`echo $ifname | sed "s/$phys//g"` + if [ "$phys$devnum" != $ifname -o \ + -n "`echo $devnum | tr -d '[0-9]'`" ]; then + echo "skipping invalid interface $ifname" + continue + fi + + vid=`expr $devnum / 1000` + inst=`expr $devnum % 1000` + + if [ "$vid" != "0" ]; then + echo dladm create-vlan -l $phys$inst -v $vid $ifname \ + >> $rootprefix$UPGRADE_SCRIPT + fi + done +} + # Update aac.conf for set legacy-name-enable properly update_aac_conf() { @@ -1174,6 +1198,24 @@ migrate_acctadm_conf() svcadm enable $fmri fi + fmri="svc:/system/extended-accounting:net" + svccfg -s $fmri setprop config/file = \ + ${ACCTADM_NET_FILE:="none"} + svccfg -s $fmri setprop config/tracked = \ + ${ACCTADM_NET_TRACKED:="none"} + svccfg -s $fmri setprop config/untracked = \ + ${ACCTADM_NET_UNTRACKED:="extended"} + if [ ${ACCTADM_NET_ENABLE:="no"} = "yes" ]; then + svccfg -s $fmri setprop config/enabled = "true" + else + svccfg -s $fmri setprop config/enabled = "false" + fi + if [ $ACCTADM_NET_ENABLE = "yes" -o \ + $ACCTADM_NET_FILE != "none" -o \ + $ACCTADM_NET_TRACKED != "none" ]; then + svcadm enable $fmri + fi + rm /etc/acctadm.conf fi _EOF @@ -4762,6 +4804,28 @@ then fi # + # save vlans associated with zones to be upgraded + # to the new dladm based format + # + flowadm_status="old" + if [[ ! -f $root/sbin/flowadm ]] && \ + archive_file_exists generic.sbin "sbin/flowadm"; then + flowadm_status="new" + host_ifs=`ls -1 $rootprefix/etc | egrep -e \ + '^hostname.|^hostname6.|^dhcp.'| cut -d . -f2 | sort -u` + zones=`zoneadm list -c | grep -v global` + for zone in $zones + do + zonecfg -z $zone info ip-type | grep exclusive \ + >/dev/null + if [ $? -eq 0 ]; then + zif=`zonecfg -z $zone info net | \ + grep physical | nawk '{print $2}'` + zone_ifs="$zone_ifs $zif" + fi + done + fi + # # Stop sendmail so that mail doesn't bounce during the interval # where /etc/mail/aliases is (effectively) empty. # @@ -7593,6 +7657,7 @@ mondo_loop() { # rm -f $root/usr/lib/rcm/modules/SUNW_vlan_rcm.so rm -f $root/usr/lib/rcm/modules/SUNW_aggr_rcm.so + rm -f $root/usr/lib/rcm/modules/SUNW_vnic_rcm.so rm -f $root/kernel/drv/softmac rm -f $root/kernel/drv/sparcv9/softmac rm -f $root/kernel/drv/amd64/softmac @@ -8077,6 +8142,11 @@ mondo_loop() { fi fi + # upgrade hostname and zones based vlans to dladm + if [[ $flowadm_status == "new" ]]; then + upgrade_vlan + fi + # The global zone needs to have its /dev/dld symlink created # during install so that processes can access it early in boot # before devfsadm is run. 
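Both the SUNWcnetr postinstall script and the upgrade_vlan() function above
recover VLAN configuration from a legacy interface name: the numeric suffix
of a hostname.<ifname> entry encodes vid * 1000 + instance, and a
"dladm create-vlan" line is emitted whenever the decoded vid is non-zero.
The following minimal C sketch shows that decoding; the function and
variable names (vlan_ppa_decode and friends) are illustrative only and are
not part of this change.

	#include <ctype.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/*
	 * Split a legacy interface name such as "bge123001" into its
	 * driver prefix, VLAN id and instance, mirroring the sed/expr
	 * pipeline in the upgrade scripts:
	 *	vid = devnum / 1000, inst = devnum % 1000
	 * Returns -1 for names the scripts would skip as invalid.
	 */
	static int
	vlan_ppa_decode(const char *ifname, char *drv, size_t drvlen,
	    int *vidp, int *instp)
	{
		const char *p = ifname + strlen(ifname);
		int devnum;

		/* Walk back over the trailing digits (the PPA). */
		while (p > ifname && isdigit((unsigned char)p[-1]))
			p--;

		/* Need both a driver prefix and a numeric suffix. */
		if (p == ifname || *p == '\0' ||
		    (size_t)(p - ifname) >= drvlen)
			return (-1);

		(void) strncpy(drv, ifname, p - ifname);
		drv[p - ifname] = '\0';

		devnum = atoi(p);
		*vidp = devnum / 1000;		/* vid=`expr $devnum / 1000` */
		*instp = devnum % 1000;		/* inst=`expr $devnum % 1000` */
		return (0);
	}

	int
	main(void)
	{
		char drv[32];
		int vid, inst;

		/* bge123001 -> VLAN 123 on instance bge1 */
		if (vlan_ppa_decode("bge123001", drv, sizeof (drv),
		    &vid, &inst) == 0 && vid != 0)
			(void) printf("dladm create-vlan -l %s%d -v %d %s\n",
			    drv, inst, vid, "bge123001");
		return (0);
	}

A decoded vid of 0 means the suffix is an ordinary PPA (for example bge0),
so no create-vlan line is generated; this matches the [ "$vid" != "0" ]
test in both scripts.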
diff --git a/usr/src/uts/common/Makefile b/usr/src/uts/common/Makefile index 5b8f6bbc6b..7cf2f14f64 100644 --- a/usr/src/uts/common/Makefile +++ b/usr/src/uts/common/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2002-2003 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # uts/common/Makefile # diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 2a54074941..564b2cf72e 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -174,6 +174,7 @@ GENUNIX_OBJS += \ inet_ntop.o \ instance.o \ ioctl.o \ + ip_cksum.o \ issetugid.o \ ippconf.o \ kcpc.o \ @@ -265,6 +266,7 @@ GENUNIX_OBJS += \ sidsys.o \ sched.o \ schedctl.o \ + sctp_crc32.o \ seg_dev.o \ seg_kp.o \ seg_kpm.o \ @@ -474,7 +476,7 @@ IP_ICMP_OBJS = icmp.o icmp_opt_data.o IP_RTS_OBJS = rts.o rts_opt_data.o IP_TCP_OBJS = tcp.o tcp_fusion.o tcp_kssl.o tcp_opt_data.o tcp_sack.o IP_UDP_OBJS = udp.o udp_opt_data.o -IP_SCTP_OBJS = sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \ +IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_init.o sctp_input.o sctp_cookie.o \ sctp_conn.o sctp_error.o sctp_snmp.o \ sctp_param.o sctp_shutdown.o sctp_common.o \ @@ -483,7 +485,7 @@ IP_SCTP_OBJS = sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ - ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ + ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \ ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \ @@ -560,14 +562,15 @@ CLONE_OBJS += clone.o CN_OBJS += cons.o -DLD_OBJS += dld_drv.o dld_proto.o dld_str.o +DLD_OBJS += dld_drv.o dld_proto.o dld_str.o dld_flow.o -DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_vlan.o \ - dls_soft_ring.o dls_mgmt.o +DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_mgmt.o GLD_OBJS += gld.o gldutil.o -MAC_OBJS += mac.o mac_mod.o mac_stat.o mac_ndd.o +MAC_OBJS += mac.o mac_bcast.o mac_client.o mac_datapath_setup.o mac_flow.o \ + mac_hio.o mac_mod.o mac_ndd.o mac_provider.o mac_sched.o \ + mac_soft_ring.o mac_stat.o mac_util.o MAC_ETHER_OBJS += mac_ether.o @@ -578,8 +581,6 @@ MAC_IB_OBJS += mac_ib.o AGGR_OBJS += aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \ aggr_send.o aggr_recv.o aggr_lacp.o -VNIC_OBJS += vnic_ctl.o vnic_dev.o vnic_bcast.o vnic_cl.o - SOFTMAC_OBJS += softmac_main.o softmac_ctl.o softmac_capab.o \ softmac_dev.o softmac_stat.o softmac_pkt.o @@ -588,6 +589,8 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \ net80211_crypto_none.o net80211_crypto_wep.o net80211_ioctl.o \ net80211_crypto_tkip.o net80211_crypto_ccmp.o +VNIC_OBJS += vnic_ctl.o vnic_dev.o + IB_OBJS += ibnex.o ibnex_ioctl.o IBCM_OBJS += ibcm_impl.o ibcm_sm.o ibcm_ti.o ibcm_utils.o 
ibcm_path.o \ @@ -1724,18 +1727,17 @@ IXGBE_OBJS = ixgbe_82598.o ixgbe_api.o ixgbe_common.o \ # # NIU 10G/1G driver module # -NXGE_OBJS = nxge_mac.o nxge_ipp.o nxge_rxdma.o \ - nxge_txdma.o nxge_txc.o nxge_main.o \ +NXGE_OBJS = nxge_mac.o nxge_ipp.o nxge_rxdma.o \ + nxge_txdma.o nxge_txc.o nxge_main.o \ nxge_hw.o nxge_fzc.o nxge_virtual.o \ nxge_send.o nxge_classify.o nxge_fflp.o \ nxge_fflp_hash.o nxge_ndd.o nxge_kstats.o \ - nxge_zcp.o nxge_fm.o nxge_espc.o \ - nxge_serialize.o nxge_hv.o \ + nxge_zcp.o nxge_fm.o nxge_espc.o nxge_hv.o \ nxge_hio.o nxge_hio_guest.o nxge_intr.o NXGE_NPI_OBJS = \ - npi.o npi_mac.o npi_ipp.o \ - npi_txdma.o npi_rxdma.o npi_txc.o \ + npi.o npi_mac.o npi_ipp.o \ + npi_txdma.o npi_rxdma.o npi_txc.o \ npi_zcp.o npi_espc.o npi_fflp.o \ npi_vir.o diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 09a34afa80..c7ccff8a14 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -50,6 +50,7 @@ extern "C" { #ifdef _KERNEL #include <netinet/ip6.h> #include <sys/avl.h> +#include <sys/list.h> #include <sys/vmem.h> #include <sys/squeue.h> #include <net/route.h> @@ -380,6 +381,13 @@ typedef struct ipf_s { uint32_t ipf_checksum; /* Partial checksum of fragment data */ } ipf_t; +/* + * IPv4 Fragments + */ +#define IS_V4_FRAGMENT(ipha_fragment_offset_and_flags) \ + (((ntohs(ipha_fragment_offset_and_flags) & IPH_OFFSET) != 0) || \ + ((ntohs(ipha_fragment_offset_and_flags) & IPH_MF) != 0)) + #define ipf_src V4_PART_OF_V6(ipf_v6src) #define ipf_dst V4_PART_OF_V6(ipf_v6dst) @@ -1718,9 +1726,10 @@ typedef union ill_g_head_u { #define ILL_CAPAB_MDT 0x04 /* Multidata Transmit */ #define ILL_CAPAB_HCKSUM 0x08 /* Hardware checksumming */ #define ILL_CAPAB_ZEROCOPY 0x10 /* Zero-copy */ -#define ILL_CAPAB_POLL 0x20 /* Polling Toggle */ -#define ILL_CAPAB_SOFT_RING 0x40 /* Soft_Ring capability */ -#define ILL_CAPAB_LSO 0x80 /* Large Segment Offload */ +#define ILL_CAPAB_DLD 0x20 /* DLD capabilities */ +#define ILL_CAPAB_DLD_POLL 0x40 /* Polling */ +#define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */ +#define ILL_CAPAB_DLD_LSO 0x100 /* Large Segment Offload */ /* * Per-ill Multidata Transmit capabilities. @@ -1743,9 +1752,9 @@ typedef struct ill_hcksum_capab_s ill_hcksum_capab_t; typedef struct ill_zerocopy_capab_s ill_zerocopy_capab_t; /* - * Per-ill Polling/soft ring capbilities. + * DLD capbilities. */ -typedef struct ill_dls_capab_s ill_dls_capab_t; +typedef struct ill_dld_capab_s ill_dld_capab_t; /* * Per-ill polling resource map. @@ -1762,7 +1771,6 @@ typedef struct ill_lso_capab_s ill_lso_capab_t; #define ILL_CONDEMNED 0x02 /* No more new ref's to the ILL */ #define ILL_CHANGING 0x04 /* ILL not globally visible */ #define ILL_DL_UNBIND_IN_PROGRESS 0x08 /* UNBIND_REQ is sent */ -#define ILL_SOFT_RING_ASSIGN 0x10 /* Making soft ring assignment */ /* Is this an ILL whose source address is used by other ILL's ? 
*/ #define IS_USESRC_ILL(ill) \ @@ -1870,8 +1878,10 @@ typedef struct ill_s { ill_note_link : 1, /* supports link-up notification */ ill_capab_reneg : 1, /* capability renegotiation to be done */ + ill_dld_capab_inprog : 1, /* direct dld capab call in prog */ ill_need_recover_multicast : 1, - ill_pad_to_bit_31 : 17; + + ill_pad_to_bit_31 : 16; /* Following bit fields protected by ill_lock */ uint_t @@ -1883,6 +1893,7 @@ typedef struct ill_s { ill_arp_bringup_pending : 1, ill_mtu_userspecified : 1, /* SIOCSLIFLNKINFO has set the mtu */ ill_arp_extend : 1, /* ARP has DAD extensions */ + ill_pad_bit_31 : 25; /* @@ -1903,15 +1914,17 @@ typedef struct ill_s { /* * Capabilities related fields. */ - uint_t ill_dlpi_capab_state; /* State of capability query, IDS_* */ + uint_t ill_dlpi_capab_state; /* State of capability query, IDCS_* */ + uint_t ill_capab_pending_cnt; uint64_t ill_capabilities; /* Enabled capabilities, ILL_CAPAB_* */ ill_mdt_capab_t *ill_mdt_capab; /* Multidata Transmit capabilities */ ill_ipsec_capab_t *ill_ipsec_capab_ah; /* IPsec AH capabilities */ ill_ipsec_capab_t *ill_ipsec_capab_esp; /* IPsec ESP capabilities */ ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */ ill_zerocopy_capab_t *ill_zerocopy_capab; /* Zero-copy capabilities */ - ill_dls_capab_t *ill_dls_capab; /* Polling, soft ring capabilities */ - ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */ + ill_dld_capab_t *ill_dld_capab; /* DLD capabilities */ + ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */ + mblk_t *ill_capab_reset_mp; /* Preallocated mblk for capab reset */ /* * New fields for IPv6 @@ -1989,6 +2002,7 @@ typedef struct ill_s { zoneid_t ill_zoneid; ip_stack_t *ill_ipst; /* Corresponds to a netstack_hold */ uint32_t ill_dhcpinit; /* IP_DHCPINIT_IFs for ill */ + void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */ uint_t ill_ilm_cnt; /* ilms referencing this ill */ uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */ } ill_t; @@ -2069,6 +2083,7 @@ typedef struct ill_s { * ill_type ipsq + down ill only when ill is up * ill_dlpi_multicast_state ill_lock ill_lock * ill_dlpi_fastpath_state ill_lock ill_lock + * ill_dlpi_capab_state ipsq ipsq * ill_max_hops ipsq Not atomic * * ill_max_mtu @@ -2110,6 +2125,8 @@ typedef struct ill_s { * ill_trace ill_lock ill_lock * ill_usesrc_grp_next ill_g_usesrc_lock ill_g_usesrc_lock * ill_dhcpinit atomics atomics + * ill_flownotify_mh write once write once + * ill_capab_pending_cnt ipsq ipsq */ /* @@ -2182,13 +2199,22 @@ typedef struct ipmx_s { * State for detecting if a driver supports certain features. * Support for DL_ENABMULTI_REQ uses ill_dlpi_multicast_state. * Support for DLPI M_DATA fastpath uses ill_dlpi_fastpath_state. - * Support for DL_CAPABILITY_REQ uses ill_dlpi_capab_state. */ #define IDS_UNKNOWN 0 /* No DLPI request sent */ #define IDS_INPROGRESS 1 /* DLPI request sent */ #define IDS_OK 2 /* DLPI request completed successfully */ #define IDS_FAILED 3 /* DLPI request failed */ +/* Support for DL_CAPABILITY_REQ uses ill_dlpi_capab_state. 
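+ * The IDCS_* values below also cover capability reset and renegotiation.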
*/ +enum { + IDCS_UNKNOWN, + IDCS_PROBE_SENT, + IDCS_OK, + IDCS_RESET_SENT, + IDCS_RENEG, + IDCS_FAILED +}; + /* Named Dispatch Parameter Management Structure */ typedef struct ipparam_s { uint_t ip_param_min; @@ -3165,6 +3191,8 @@ extern int ip_opt_set_ill(conn_t *, int, boolean_t, boolean_t, extern void ip_rput(queue_t *, mblk_t *); extern void ip_input(ill_t *, ill_rx_ring_t *, mblk_t *, struct mac_header_info_s *); +extern mblk_t *ip_accept_tcp(ill_t *, ill_rx_ring_t *, squeue_t *, + mblk_t *, mblk_t **, uint_t *cnt); extern void ip_rput_dlpi(queue_t *, mblk_t *); extern void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); extern void ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *); @@ -3201,13 +3229,13 @@ extern ipaddr_t ip_net_mask(ipaddr_t); extern void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t, ip_stack_t *); extern ipxmit_state_t ip_xmit_v4(mblk_t *, ire_t *, struct ipsec_out_s *, - boolean_t); + boolean_t, conn_t *); extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *); extern struct qinit iprinitv6; extern struct qinit ipwinitv6; -extern void conn_drain_insert(conn_t *connp); +extern void conn_drain_insert(conn_t *connp); extern int conn_ipsec_length(conn_t *connp); extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *, ire_t *); @@ -3437,17 +3465,22 @@ struct ill_zerocopy_capab_s { }; struct ill_lso_capab_s { - uint_t ill_lso_version; /* interface version */ uint_t ill_lso_on; /* on/off switch for LSO on this ILL */ uint_t ill_lso_flags; /* capabilities */ uint_t ill_lso_max; /* maximum size of payload */ }; -/* Possible ill_states */ -#define ILL_RING_INPROC 3 /* Being assigned to squeue */ -#define ILL_RING_INUSE 2 /* Already Assigned to Rx Ring */ -#define ILL_RING_BEING_FREED 1 /* Being Unassigned */ -#define ILL_RING_FREE 0 /* Available to be assigned to Ring */ +/* + * rr_ring_state cycles in the order shown below from RR_FREE through + * RR_FREE_IN_PROG and back to RR_FREE. + */ +typedef enum { + RR_FREE, /* Free slot */ + RR_SQUEUE_UNBOUND, /* Ring's squeue is unbound */ + RR_SQUEUE_BIND_INPROG, /* Ring's squeue bind in progress */ + RR_SQUEUE_BOUND, /* Ring's squeue bound to cpu */ + RR_FREE_INPROG /* Ring is being freed */ +} ip_ring_state_t; #define ILL_MAX_RINGS 256 /* Max num of rx rings we can manage */ #define ILL_POLLING 0x01 /* Polling in use */ @@ -3457,73 +3490,92 @@ struct ill_lso_capab_s { * we need to duplicate the definitions here because we cannot * include mac/dls header files here. */ -typedef void (*ip_mac_blank_t)(void *, time_t, uint_t); -typedef void (*ip_dld_tx_t)(void *, mblk_t *); +typedef void *ip_mac_tx_cookie_t; +typedef void (*ip_mac_intr_disable_t)(void *); +typedef void (*ip_mac_intr_enable_t)(void *); +typedef void *(*ip_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); +typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); +typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *); +typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t); -typedef void (*ip_dls_chg_soft_ring_t)(void *, int); -typedef void (*ip_dls_bind_t)(void *, processorid_t); -typedef void (*ip_dls_unbind_t)(void *); +/* + * POLLING README + * sq_get_pkts() is called to pick packets from softring in poll mode. It + * calls rr_rx to get the chain and process it with rr_ip_accept. + * rr_rx = mac_soft_ring_poll() to pick packets + * rr_ip_accept = ip_accept_tcp() to process packets + */ +/* + * XXX: With protocol, service specific squeues, they will have + * specific acceptor functions. 
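+ * (Today ip_accept_tcp() is the only such acceptor.)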
+ */ +typedef mblk_t *(*ip_mac_rx_t)(void *, size_t); +typedef mblk_t *(*ip_accept_t)(ill_t *, ill_rx_ring_t *, + squeue_t *, mblk_t *, mblk_t **, uint_t *); + +/* + * rr_intr_enable, rr_intr_disable, rr_rx_handle, rr_rx: + * May be accessed while in the squeue AND after checking that SQS_POLL_CAPAB + * is set. + * + * rr_ring_state: Protected by ill_lock. + */ struct ill_rx_ring { - ip_mac_blank_t rr_blank; /* Driver interrupt blanking func */ - void *rr_handle; /* Handle for Rx ring */ + ip_mac_intr_disable_t rr_intr_disable; /* Interrupt disabling func */ + ip_mac_intr_enable_t rr_intr_enable; /* Interrupt enabling func */ + void *rr_intr_handle; /* Handle interrupt funcs */ + ip_mac_rx_t rr_rx; /* Driver receive function */ + ip_accept_t rr_ip_accept; /* IP accept function */ + void *rr_rx_handle; /* Handle for Rx ring */ squeue_t *rr_sqp; /* Squeue the ring is bound to */ - ill_t *rr_ill; /* back pointer to ill */ - clock_t rr_poll_time; /* Last lbolt polling was used */ - uint32_t rr_poll_state; /* polling state flags */ - uint32_t rr_max_blank_time; /* Max interrupt blank */ - uint32_t rr_min_blank_time; /* Min interrupt blank */ - uint32_t rr_max_pkt_cnt; /* Max pkts before interrupt */ - uint32_t rr_min_pkt_cnt; /* Mix pkts before interrupt */ - uint32_t rr_normal_blank_time; /* Normal intr freq */ - uint32_t rr_normal_pkt_cnt; /* Normal intr pkt cnt */ - uint32_t rr_ring_state; /* State of this ring */ + ill_t *rr_ill; /* back pointer to ill */ + ip_ring_state_t rr_ring_state; /* State of this ring */ }; -struct ill_dls_capab_s { - ip_dld_tx_t ill_tx; /* Driver Tx routine */ - void *ill_tx_handle; /* Driver Tx handle */ - ip_dls_chg_soft_ring_t ill_dls_change_status; - /* change soft ring fanout */ - ip_dls_bind_t ill_dls_bind; /* to add CPU affinity */ - ip_dls_unbind_t ill_dls_unbind; /* remove CPU affinity */ - ill_rx_ring_t *ill_ring_tbl; /* Ring to Sqp mapping table */ - uint_t ill_dls_soft_ring_cnt; /* Number of soft ring */ - conn_t *ill_unbind_conn; /* Conn used during unplumb */ +/* + * IP - DLD direct function call capability + * Suffixes, df - dld function, dh - dld handle, + * cf - client (IP) function, ch - client handle + */ +typedef struct ill_dld_direct_s { /* DLD provided driver Tx */ + ip_dld_tx_t idd_tx_df; /* str_mdata_fastpath_put */ + void *idd_tx_dh; /* dld_str_t *dsp */ + ip_dld_callb_t idd_tx_cb_df; /* mac_tx_srs_notify */ + void *idd_tx_cb_dh; /* mac_client_handle_t *mch */ +} ill_dld_direct_t; + +/* IP - DLD polling capability */ +typedef struct ill_dld_poll_s { + ill_rx_ring_t idp_ring_tbl[ILL_MAX_RINGS]; +} ill_dld_poll_t; + +/* Describes ill->ill_dld_capab */ +struct ill_dld_capab_s { + ip_capab_func_t idc_capab_df; /* dld_capab_func */ + void *idc_capab_dh; /* dld_str_t *dsp */ + ill_dld_direct_t idc_direct; + ill_dld_poll_t idc_poll; }; /* * IP squeues exports */ -extern int ip_squeue_profile; -extern int ip_squeue_bind; extern boolean_t ip_squeue_fanout; -extern boolean_t ip_squeue_soft_ring; -extern uint_t ip_threads_per_cpu; -extern uint_t ip_squeues_per_cpu; -extern uint_t ip_soft_rings_cnt; - -typedef struct squeue_set_s { - kmutex_t sqs_lock; - struct squeue_s **sqs_list; - int sqs_size; - int sqs_max_size; - processorid_t sqs_bind; -} squeue_set_t; - -#define IP_SQUEUE_GET(hint) \ - ((!ip_squeue_fanout) ? 
(CPU->cpu_squeue_set->sqs_list[0]) : \ - ip_squeue_random(hint)) -typedef void (*squeue_func_t)(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t); +#define IP_SQUEUE_GET(hint) ip_squeue_random(hint) extern void ip_squeue_init(void (*)(squeue_t *)); extern squeue_t *ip_squeue_random(uint_t); extern squeue_t *ip_squeue_get(ill_rx_ring_t *); -extern int ip_squeue_bind_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); +extern squeue_t *ip_squeue_getfree(pri_t); +extern int ip_squeue_cpu_move(squeue_t *, processorid_t); +extern void *ip_squeue_add_ring(ill_t *, void *); +extern void ip_squeue_bind_ring(ill_t *, ill_rx_ring_t *, processorid_t); +extern void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *); +extern void ip_squeue_quiesce_ring(ill_t *, ill_rx_ring_t *); +extern void ip_squeue_restart_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_clean_all(ill_t *); -extern void ip_soft_ring_assignment(ill_t *, ill_rx_ring_t *, - mblk_t *, struct mac_header_info_s *); extern void ip_resume_tcp_bind(void *, mblk_t *, void *); extern void tcp_wput(queue_t *, mblk_t *); @@ -3580,6 +3632,9 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); #define SQTAG_TCP_KSSL_INPUT 36 #define SQTAG_TCP_DROP_Q0 37 #define SQTAG_TCP_CONN_REQ_2 38 +#define SQTAG_IP_INPUT_RX_RING 39 +#define SQTAG_SQUEUE_CHANGE 40 +#define SQTAG_CONNECT_FINISH 41 #define NOT_OVER_IP(ip_wq) \ (ip_wq->q_next != NULL || \ diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 553a975c54..90cc6a51d5 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -24,9 +24,6 @@ */ /* Copyright (c) 1990 Mentat Inc. */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h> @@ -4331,8 +4328,7 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop) } mblk_setcred(mp, connp->conn_cred); - ip_output_options(connp, mp, q, IP_WPUT, - &optinfo); + ip_output_options(connp, mp, q, IP_WPUT, &optinfo); } static boolean_t diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c index ecfafc5e51..091509c71e 100644 --- a/usr/src/uts/common/inet/ip/igmp.c +++ b/usr/src/uts/common/inet/ip/igmp.c @@ -24,8 +24,6 @@ */ /* Copyright (c) 1990 Mentat Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Internet Group Management Protocol (IGMP) routines. * Multicast Listener Discovery Protocol (MLD) routines. 
@@ -1439,7 +1437,7 @@ igmp_timeout_handler(void *arg) if (!ill_waiter_inc(ill)) continue; rw_exit(&ipst->ips_ill_g_lock); - success = ipsq_enter(ill, B_TRUE); + success = ipsq_enter(ill, B_TRUE, NEW_OP); if (success) { next = igmp_timeout_handler_per_ill(ill); if (next < global_next) @@ -1682,7 +1680,7 @@ mld_timeout_handler(void *arg) if (!ill_waiter_inc(ill)) continue; rw_exit(&ipst->ips_ill_g_lock); - success = ipsq_enter(ill, B_TRUE); + success = ipsq_enter(ill, B_TRUE, NEW_OP); if (success) { next = mld_timeout_handler_per_ill(ill); if (next < global_next) diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 5eb9a7e1d2..b0eaa51983 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -46,6 +46,7 @@ #include <sys/atomic.h> #include <sys/policy.h> #include <sys/priv.h> +#include <sys/taskq.h> #include <sys/systm.h> #include <sys/param.h> @@ -125,16 +126,17 @@ #include <sys/tsol/tnet.h> #include <rpc/pmap_prot.h> +#include <sys/squeue_impl.h> /* * Values for squeue switch: - * IP_SQUEUE_ENTER_NODRAIN: squeue_enter_nodrain - * IP_SQUEUE_ENTER: squeue_enter - * IP_SQUEUE_FILL: squeue_fill + * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN + * IP_SQUEUE_ENTER: SQ_PROCESS + * IP_SQUEUE_FILL: SQ_FILL */ int ip_squeue_enter = 2; /* Setable in /etc/system */ -squeue_func_t ip_input_proc; +int ip_squeue_flag; #define SET_BPREV_FLAG(x) ((mblk_t *)(uintptr_t)(x)) /* @@ -391,6 +393,11 @@ void (*cl_inet_idlesa)(uint8_t, uint32_t, sa_family_t, in6_addr_t, * gcgrp_rwlock -> ire_lock * gcgrp_rwlock -> gcdb_lock * + * squeue(sq_lock), flow related (ft_lock, fe_lock) locking + * + * cpu_lock --> ill_lock --> sqset_lock --> sq_lock + * sq_lock -> conn_lock -> QLOCK(q) + * ill_lock -> ft_lock -> fe_lock * * Routing/forwarding table locking notes: * @@ -730,7 +737,7 @@ static boolean_t ip_source_route_included(ipha_t *); static void ip_trash_ire_reclaim_stack(ip_stack_t *); static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t, - zoneid_t, ip_stack_t *); + zoneid_t, ip_stack_t *, conn_t *); static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *); static void ip_wput_local_options(ipha_t *, ip_stack_t *); static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t, @@ -763,17 +770,13 @@ static void ip_multirt_bad_mtu(ire_t *, uint32_t); static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); -extern int ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value, - caddr_t cp, cred_t *cr); -extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t, - cred_t *); static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static int ipmp_hook_emulation_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); -static squeue_func_t ip_squeue_switch(int); +static int ip_squeue_switch(int); static void *ip_kstat_init(netstackid_t, ip_stack_t *); static void ip_kstat_fini(netstackid_t, kstat_t *); @@ -790,7 +793,7 @@ static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t, ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *); static void ip_rput_process_forward(queue_t *, mblk_t *, ire_t *, - ipha_t *, ill_t *, boolean_t); + ipha_t *, ill_t *, boolean_t, boolean_t); static void ipobs_init(ip_stack_t *); static void ipobs_fini(ip_stack_t *); @@ -934,20 +937,14 @@ static ipndp_t lcl_ndp_arr[] = { "ip_rput_pullups" 
}, { ip_srcid_report, NULL, NULL, "ip_srcid_status" }, - { ip_param_generic_get, ip_squeue_profile_set, - (caddr_t)&ip_squeue_profile, "ip_squeue_profile" }, - { ip_param_generic_get, ip_squeue_bind_set, - (caddr_t)&ip_squeue_bind, "ip_squeue_bind" }, { ip_param_generic_get, ip_input_proc_set, (caddr_t)&ip_squeue_enter, "ip_squeue_enter" }, { ip_param_generic_get, ip_int_set, (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" }, -#define IPNDP_CGTP_FILTER_OFFSET 11 +#define IPNDP_CGTP_FILTER_OFFSET 9 { ip_cgtp_filter_get, ip_cgtp_filter_set, NULL, "ip_cgtp_filter" }, - { ip_param_generic_get, ip_int_set, - (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" }, -#define IPNDP_IPMP_HOOK_OFFSET 13 +#define IPNDP_IPMP_HOOK_OFFSET 10 { ip_param_generic_get, ipmp_hook_emulation_set, NULL, "ipmp_hook_emulation" }, { ip_param_generic_get, ip_int_set, (caddr_t)&ip_debug, @@ -2564,8 +2561,8 @@ icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp, /* Have to change db_type after any pullupmsg */ DB_TYPE(mp) = M_CTL; - squeue_fill(connp->conn_sqp, first_mp, tcp_input, - connp, SQTAG_TCP_INPUT_ICMP_ERR); + SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp, + SQ_FILL, SQTAG_TCP_INPUT_ICMP_ERR); return; case IPPROTO_SCTP: @@ -5367,34 +5364,13 @@ ip_modclose(ill_t *ill) ipif_t *ipif; queue_t *q = ill->ill_rq; ip_stack_t *ipst = ill->ill_ipst; - clock_t timeout; - - /* - * Wait for the ACKs of all deferred control messages to be processed. - * In particular, we wait for a potential capability reset initiated - * in ip_sioctl_plink() to complete before proceeding. - * - * Note: we wait for at most ip_modclose_ackwait_ms (by default 3000 ms) - * in case the driver never replies. - */ - timeout = lbolt + MSEC_TO_TICK(ip_modclose_ackwait_ms); - mutex_enter(&ill->ill_lock); - while (ill->ill_dlpi_pending != DL_PRIM_INVAL) { - if (cv_timedwait(&ill->ill_cv, &ill->ill_lock, timeout) < 0) { - /* Timeout */ - break; - } - } - mutex_exit(&ill->ill_lock); /* - * Forcibly enter the ipsq after some delay. This is to take - * care of the case when some ioctl does not complete because - * we sent a control message to the driver and it did not - * send us a reply. We want to be able to at least unplumb - * and replumb rather than force the user to reboot the system. + * The punlink prior to this may have initiated a capability + * negotiation. But ipsq_enter will block until that finishes or + * times out. */ - success = ipsq_enter(ill, B_FALSE); + success = ipsq_enter(ill, B_FALSE, NEW_OP); /* * Open/close/push/pop is guaranteed to be single threaded @@ -5661,33 +5637,6 @@ ip_conn_input(void *arg1, mblk_t *mp, void *arg2) putnext(connp->conn_rq, mp); } -/* Return the IP checksum for the IP header at "iph". 
*/ -uint16_t -ip_csum_hdr(ipha_t *ipha) -{ - uint16_t *uph; - uint32_t sum; - int opt_len; - - opt_len = (ipha->ipha_version_and_hdr_length & 0xF) - - IP_SIMPLE_HDR_LENGTH_IN_WORDS; - uph = (uint16_t *)ipha; - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + - uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; - if (opt_len > 0) { - do { - sum += uph[10]; - sum += uph[11]; - uph += 2; - } while (--opt_len); - } - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - if (sum == 0xffff) - sum = 0; - return ((uint16_t)sum); -} - /* * Called when the module is about to be unloaded */ @@ -5741,6 +5690,11 @@ ip_stack_shutdown(netstackid_t stackid, void *arg) */ ipv4_hook_shutdown(ipst); ipv6_hook_shutdown(ipst); + + mutex_enter(&ipst->ips_capab_taskq_lock); + ipst->ips_capab_taskq_quit = B_TRUE; + cv_signal(&ipst->ips_capab_taskq_cv); + mutex_exit(&ipst->ips_capab_taskq_lock); } /* @@ -5761,6 +5715,10 @@ ip_stack_fini(netstackid_t stackid, void *arg) ipv6_hook_destroy(ipst); ip_net_destroy(ipst); + mutex_destroy(&ipst->ips_capab_taskq_lock); + cv_destroy(&ipst->ips_capab_taskq_cv); + list_destroy(&ipst->ips_capab_taskq_list); + #ifdef NS_DEBUG printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid); #endif @@ -5882,7 +5840,7 @@ ip_thread_exit(void *phash) void ip_ddi_init(void) { - ip_input_proc = ip_squeue_switch(ip_squeue_enter); + ip_squeue_flag = ip_squeue_switch(ip_squeue_enter); /* * For IP and TCP the minor numbers should start from 2 since we have 4 @@ -6043,6 +6001,16 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ipv4_hook_init(ipst); ipv6_hook_init(ipst); + /* + * Create the taskq dispatcher thread and initialize related stuff. + */ + ipst->ips_capab_taskq_thread = thread_create(NULL, 0, + ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri); + mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL); + list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t), + offsetof(mblk_t, b_next)); + return (ipst); } @@ -6839,8 +6807,8 @@ ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); if (IPCL_IS_TCP(connp)) { /* do not drain, certain use cases can blow the stack */ - squeue_enter_nodrain(connp->conn_sqp, first_mp, - connp->conn_recv, connp, SQTAG_IP_FANOUT_TCP); + SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv, + connp, ip_squeue_flag, SQTAG_IP_FANOUT_TCP); } else { /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ (connp->conn_recv)(connp, first_mp, NULL); @@ -7016,9 +6984,10 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, NULL, mctl_present); + /* Freed by ipsec_check_inbound_policy(). */ if (first_mp == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; /* Freed by ipsec_check_inbound_policy(). 
*/ + return; } } if (mctl_present) @@ -9832,6 +9801,9 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, netstack_rele(ipst->ips_netstack); connp->conn_zoneid = zoneid; + connp->conn_sqp = NULL; + connp->conn_initial_sqp = NULL; + connp->conn_final_sqp = NULL; connp->conn_upq = q; q->q_ptr = WR(q)->q_ptr = connp; @@ -12977,6 +12949,7 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, mblk_t *mp1; boolean_t syn_present = B_FALSE; tcph_t *tcph; + uint_t tcph_flags; uint_t ip_hdr_len; ill_t *ill = (ill_t *)q->q_ptr; zoneid_t zoneid = ire->ire_zoneid; @@ -13121,6 +13094,9 @@ try_again: goto no_conn; } + tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; + tcph_flags = tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG); + /* * TCP FAST PATH for AF_INET socket. * @@ -13138,12 +13114,17 @@ try_again: !IPP_ENABLED(IPP_LOCAL_IN, ipst)) { ASSERT(first_mp == mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - SET_SQUEUE(mp, tcp_rput_data, connp); + if (tcph_flags != (TH_SYN | TH_ACK)) { + SET_SQUEUE(mp, tcp_rput_data, connp); + return (mp); + } + mp->b_datap->db_struioflag |= STRUIO_CONNECT; + DB_CKSUMSTART(mp) = (intptr_t)ip_squeue_get(ill_ring); + SET_SQUEUE(mp, tcp_input, connp); return (mp); } - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { + if (tcph_flags == TH_SYN) { if (IPCL_IS_TCP(connp)) { mp->b_datap->db_struioflag |= STRUIO_EAGER; DB_CKSUMSTART(mp) = @@ -13165,7 +13146,6 @@ try_again: } syn_present = B_TRUE; } - } if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { @@ -13903,6 +13883,12 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) return (NULL); } +/* + * + * This is the fast forward path. If we are here, we dont need to + * worry about RSVP, CGTP, or TSol. Furthermore the ftable lookup + * needed to find the nexthop in this case is much simpler + */ ire_t * ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) { @@ -13928,6 +13914,12 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) */ ire_refrele(ire); ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst); + /* + * ire_cache_lookup() can return ire of IRE_LOCAL in + * transient cases. In such case, just drop the packet + */ + if (ire->ire_type != IRE_CACHE) + goto drop; } /* @@ -13952,8 +13944,8 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) /* No ire cache of nexthop. So first create one */ if (ire == NULL) { - ire = ire_forward(dst, &ret_action, NULL, NULL, - NULL, ipst); + ire = ire_forward_simple(dst, &ret_action, ipst); + /* * We only come to ip_fast_forward if ip_cgtp_filter * is not set. 
So ire_forward() should not return with @@ -14001,7 +13993,6 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) pkt_len = ntohs(ipha->ipha_length); stq_ill = (ill_t *)ire->ire_stq->q_ptr; if (!(stq_ill->ill_flags & ILLF_ROUTER) || - !(ill->ill_flags & ILLF_ROUTER) || (ill == stq_ill) || (ill->ill_group != NULL && ill->ill_group == stq_ill->ill_group) || (ire->ire_nce == NULL) || @@ -14010,7 +14001,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) ((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) || ipha->ipha_ttl <= 1) { ip_rput_process_forward(ill->ill_rq, mp, ire, - ipha, ill, B_FALSE); + ipha, ill, B_FALSE, B_TRUE); return (ire); } BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); @@ -14048,34 +14039,33 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits); UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len); - dev_q = ire->ire_stq->q_next; - if ((dev_q->q_next != NULL || dev_q->q_first != NULL) && - !canputnext(ire->ire_stq)) { - goto indiscard; + if (!ILL_DIRECT_CAPABLE(stq_ill) || DB_TYPE(mp) != M_DATA) { + dev_q = ire->ire_stq->q_next; + if (DEV_Q_FLOW_BLOCKED(dev_q)) + goto indiscard; } - if (ILL_DLS_CAPABLE(stq_ill)) { - /* - * Send the packet directly to DLD, where it - * may be queued depending on the availability - * of transmit resources at the media layer. - */ - IP_DLS_ILL_TX(stq_ill, ipha, mp, ipst, hlen); - } else { - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, stq_ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, stq_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - if (mp == NULL) - goto drop; - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, - ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha, - ip6_t *, NULL, int, 0); + DTRACE_PROBE4(ip4__physical__out__start, + ill_t *, NULL, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, + NULL, stq_ill, ipha, mp, mp, 0, ipst); + DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha, + ip6_t *, NULL, int, 0); - putnext(ire->ire_stq, mp); + if (mp != NULL) { + if (ipst->ips_ipobs_enabled) { + zoneid_t szone; + + szone = ip_get_zoneid_v4(ipha->ipha_src, mp, + ipst, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, + ALL_ZONES, ill, IPV4_VERSION, hlen, ipst); + } + + ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC); } return (ire); @@ -14096,7 +14086,7 @@ drop: static void ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, - ill_t *ill, boolean_t ll_multicast) + ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward) { ill_group_t *ill_group; ill_group_t *ire_group; @@ -14109,6 +14099,16 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */ mp->b_next = NULL; /* ip_rput_noire sets dst here */ + /* + * If the caller of this function is ip_fast_forward() skip the + * next three checks as it does not apply. 
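(In that case the packet has already been screened by ip_fast_forward(), so the branch below only computes the ill_group/ire_group state that the code after the skip: label still relies on before jumping.)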
+ */ + if (from_ip_fast_forward) { + ill_group = ill->ill_group; + ire_group = ((ill_t *)(ire->ire_rfq)->q_ptr)->ill_group; + goto skip; + } + if (ll_multicast != 0) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); goto drop_pkt; @@ -14147,6 +14147,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, * side-effect of that would be requiring an ire flush * whenever the ILLF_ROUTER flag changes. */ +skip: if (((ill->ill_flags & ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) && @@ -14253,7 +14254,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, } sendit: dev_q = ire->ire_stq->q_next; - if ((dev_q->q_next || dev_q->q_first) && !canput(dev_q)) { + if (DEV_Q_FLOW_BLOCKED(dev_q)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); freemsg(mp); return; @@ -14447,7 +14448,7 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, ipha->ipha_hdr_checksum = 0; ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); ip_rput_process_forward(q, mp, ire, ipha, - ill, ll_multicast); + ill, ll_multicast, B_FALSE); ire_refrele(ire); return (NULL); } @@ -14904,6 +14905,15 @@ ip_fix_dbref(ill_t *ill, mblk_t *mp) return (mp1); } +#define ADD_TO_CHAIN(head, tail, cnt, mp) { \ + if (tail != NULL) \ + tail->b_next = mp; \ + else \ + head = mp; \ + tail = mp; \ + cnt++; \ +} + /* * Direct read side procedure capable of dealing with chains. GLDv3 based * drivers call this function directly with mblk chains while STREAMS @@ -14942,20 +14952,23 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, mblk_t *head = NULL; mblk_t *tail = NULL; mblk_t *first_mp; - mblk_t *mp; - mblk_t *dmp; int cnt = 0; ip_stack_t *ipst = ill->ill_ipst; + mblk_t *mp; + mblk_t *dmp; + uint8_t tag; ASSERT(mp_chain != NULL); ASSERT(ill != NULL); TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q); + tag = (ip_ring != NULL) ? SQTAG_IP_INPUT_RX_RING : SQTAG_IP_INPUT; + #define rptr ((uchar_t *)ipha) while (mp_chain != NULL) { - first_mp = mp = mp_chain; + mp = mp_chain; mp_chain = mp_chain->b_next; mp->b_next = NULL; ll_multicast = 0; @@ -14987,6 +15000,15 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, * Given the above assumption, there is no need to walk * down the entire mblk chain (which could have a * potential performance problem) + * + * The "(DB_REF(mp) > 1)" check was moved from ip_rput() + * to here because of exclusive ip stacks and vnics. + * Packets transmitted from exclusive stack over vnic + * can have db_ref > 1 and when it gets looped back to + * another vnic in a different zone, you have ip_input() + * getting dblks with db_ref > 1. So if someone + * complains of TCP performance under this scenario, + * take a serious look here on the impact of copymsg(). 
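(In other words, any mblk arriving here with db_ref > 1 pays for a full copymsg() data copy before IP touches it; the DB_REF check below is that gate.)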
*/ if (DB_REF(mp) > 1) { @@ -15056,7 +15078,7 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, } } - /* Make sure its an M_DATA and that its aligned */ + /* Only M_DATA can come here and it is always aligned */ ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr)); @@ -15140,7 +15162,6 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, continue; } dst = ipha->ipha_dst; - /* * Attach any necessary label information to * this packet @@ -15194,16 +15215,18 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP && !ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) { if (ire == NULL) - ire = ire_cache_lookup(dst, ALL_ZONES, NULL, - ipst); - - /* incoming packet is for forwarding */ - if (ire == NULL || (ire->ire_type & IRE_CACHE)) { + ire = ire_cache_lookup_simple(dst, ipst); + /* + * Unless forwarding is enabled, dont call + * ip_fast_forward(). Incoming packet is for forwarding + */ + if ((ill->ill_flags & ILLF_ROUTER) && + (ire == NULL || (ire->ire_type & IRE_CACHE))) { ire = ip_fast_forward(ire, dst, ill, mp); continue; } /* incoming packet is for local consumption */ - if (ire->ire_type & IRE_LOCAL) + if ((ire != NULL) && (ire->ire_type & IRE_LOCAL)) goto local; } @@ -15363,7 +15386,7 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, } else if (ire->ire_stq != NULL) { /* fowarding? */ ip_rput_process_forward(q, mp, ire, ipha, ill, - ll_multicast); + ll_multicast, B_FALSE); /* ip_rput_process_forward consumed the packet */ continue; } @@ -15414,8 +15437,8 @@ local: * changes. */ IP_STAT(ipst, ip_input_multi_squeue); - squeue_enter_chain(curr_sqp, head, - tail, cnt, SQTAG_IP_INPUT); + SQUEUE_ENTER(curr_sqp, head, + tail, cnt, SQ_PROCESS, tag); curr_sqp = GET_SQUEUE(mp); head = mp; tail = mp; @@ -15444,33 +15467,231 @@ local: ire_refrele(ire); if (head != NULL) - squeue_enter_chain(curr_sqp, head, tail, cnt, SQTAG_IP_INPUT); + SQUEUE_ENTER(curr_sqp, head, tail, cnt, SQ_PROCESS, tag); - /* - * This code is there just to make netperf/ttcp look good. - * - * Its possible that after being in polling mode (and having cleared - * the backlog), squeues have turned the interrupt frequency higher - * to improve latency at the expense of more CPU utilization (less - * packets per interrupts or more number of interrupts). Workloads - * like ttcp/netperf do manage to tickle polling once in a while - * but for the remaining time, stay in higher interrupt mode since - * their packet arrival rate is pretty uniform and this shows up - * as higher CPU utilization. Since people care about CPU utilization - * while running netperf/ttcp, turn the interrupt frequency back to - * normal/default if polling has not been used in ip_poll_normal_ticks. - */ - if (ip_ring != NULL && (ip_ring->rr_poll_state & ILL_POLLING)) { - if (lbolt >= (ip_ring->rr_poll_time + ip_poll_normal_ticks)) { - ip_ring->rr_poll_state &= ~ILL_POLLING; - ip_ring->rr_blank(ip_ring->rr_handle, - ip_ring->rr_normal_blank_time, - ip_ring->rr_normal_pkt_cnt); + TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, + "ip_input_end: q %p (%S)", q, "end"); +#undef rptr +} + +/* + * ip_accept_tcp() - This function is called by the squeue when it retrieves + * a chain of packets in the poll mode. The packets have gone through the + * data link processing but not IP processing. For performance and latency + * reasons, the squeue wants to process the chain in line instead of feeding + * it back via ip_input path. 
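(As implemented below: packets that pass every check collect on an "accepted" chain returned to the squeue through *last and *cnt, while the rest collect on an "unaccepted" chain that is fed through the normal ip_input() path before returning.)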
+ * + * So this is a light weight function which checks to see if the packets + * retrived are indeed TCP packets (TCP squeue always polls TCP soft ring + * but we still do the paranoid check) meant for local machine and we don't + * have labels etc enabled. Packets that meet the criterion are returned to + * the squeue and processed inline while the rest go via ip_input path. + */ +/*ARGSUSED*/ +mblk_t * +ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, + mblk_t *mp_chain, mblk_t **last, uint_t *cnt) +{ + mblk_t *mp; + ipaddr_t dst = NULL; + ipaddr_t prev_dst; + ire_t *ire = NULL; + ipha_t *ipha; + uint_t pkt_len; + ssize_t len; + uint_t opt_len; + queue_t *q = ill->ill_rq; + squeue_t *curr_sqp; + mblk_t *ahead = NULL; /* Accepted head */ + mblk_t *atail = NULL; /* Accepted tail */ + uint_t acnt = 0; /* Accepted count */ + mblk_t *utail = NULL; /* Unaccepted head */ + mblk_t *uhead = NULL; /* Unaccepted tail */ + uint_t ucnt = 0; /* Unaccepted cnt */ + ip_stack_t *ipst = ill->ill_ipst; + + *cnt = 0; + + ASSERT(ill != NULL); + ASSERT(ip_ring != NULL); + + TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_accept_tcp: q %p", q); + +#define rptr ((uchar_t *)ipha) + + while (mp_chain != NULL) { + mp = mp_chain; + mp_chain = mp_chain->b_next; + mp->b_next = NULL; + + /* + * We do ire caching from one iteration to + * another. In the event the packet chain contains + * all packets from the same dst, this caching saves + * an ire_cache_lookup for each of the succeeding + * packets in a packet chain. + */ + prev_dst = dst; + + ipha = (ipha_t *)mp->b_rptr; + len = mp->b_wptr - rptr; + + ASSERT(!MBLK_RX_FANOUT_SLOWPATH(mp, ipha)); + + /* + * If it is a non TCP packet, or doesn't have H/W cksum, + * or doesn't have min len, reject. + */ + if ((ipha->ipha_protocol != IPPROTO_TCP) || (len < + (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH))) { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + continue; } + + pkt_len = ntohs(ipha->ipha_length); + if (len != pkt_len) { + if (len > pkt_len) { + mp->b_wptr = rptr + pkt_len; + } else { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + continue; + } } - TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, - "ip_input_end: q %p (%S)", q, "end"); + opt_len = ipha->ipha_version_and_hdr_length - + IP_SIMPLE_HDR_VERSION; + dst = ipha->ipha_dst; + + /* IP version bad or there are IP options */ + if (opt_len && (!ip_rput_multimblk_ipoptions(q, ill, + mp, &ipha, &dst, ipst))) + continue; + + if (is_system_labeled() || (ill->ill_dhcpinit != 0) || + (ipst->ips_ip_cgtp_filter && + ipst->ips_ip_cgtp_filter_ops != NULL)) { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + continue; + } + + /* + * Reuse the cached ire only if the ipha_dst of the previous + * packet is the same as the current packet AND it is not + * INADDR_ANY. + */ + if (!(dst == prev_dst && dst != INADDR_ANY) && + (ire != NULL)) { + ire_refrele(ire); + ire = NULL; + } + + if (ire == NULL) + ire = ire_cache_lookup_simple(dst, ipst); + + /* + * Unless forwarding is enabled, dont call + * ip_fast_forward(). 
Incoming packet is for forwarding + */ + if ((ill->ill_flags & ILLF_ROUTER) && + (ire == NULL || (ire->ire_type & IRE_CACHE))) { + + DTRACE_PROBE4(ip4__physical__in__start, + ill_t *, ill, ill_t *, NULL, + ipha_t *, ipha, mblk_t *, mp); + + FW_HOOKS(ipst->ips_ip4_physical_in_event, + ipst->ips_ipv4firewall_physical_in, + ill, NULL, ipha, mp, mp, 0, ipst); + + DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, + pkt_len); + + ire = ip_fast_forward(ire, dst, ill, mp); + continue; + } + + /* incoming packet is for local consumption */ + if ((ire != NULL) && (ire->ire_type & IRE_LOCAL)) + goto local_accept; + + /* + * Disable ire caching for anything more complex + * than the simple fast path case we checked for above. + */ + if (ire != NULL) { + ire_refrele(ire); + ire = NULL; + } + + ire = ire_cache_lookup(dst, ALL_ZONES, MBLK_GETLABEL(mp), + ipst); + if (ire == NULL || ire->ire_type == IRE_BROADCAST || + ire->ire_stq != NULL) { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + if (ire != NULL) { + ire_refrele(ire); + ire = NULL; + } + continue; + } + +local_accept: + + if (ire->ire_rfq != q) { + ADD_TO_CHAIN(uhead, utail, ucnt, mp); + if (ire != NULL) { + ire_refrele(ire); + ire = NULL; + } + continue; + } + + /* + * The event for packets being received from a 'physical' + * interface is placed after validation of the source and/or + * destination address as being local so that packets can be + * redirected to loopback addresses using ipnat. + */ + DTRACE_PROBE4(ip4__physical__in__start, + ill_t *, ill, ill_t *, NULL, + ipha_t *, ipha, mblk_t *, mp); + + FW_HOOKS(ipst->ips_ip4_physical_in_event, + ipst->ips_ipv4firewall_physical_in, + ill, NULL, ipha, mp, mp, 0, ipst); + + DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); + + if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, mp, + 0, q, ip_ring)) != NULL) { + if ((curr_sqp = GET_SQUEUE(mp)) == target_sqp) { + ADD_TO_CHAIN(ahead, atail, acnt, mp); + } else { + SQUEUE_ENTER(curr_sqp, mp, mp, 1, + SQ_FILL, SQTAG_IP_INPUT); + } + } + } + + if (ire != NULL) + ire_refrele(ire); + + if (uhead != NULL) + ip_input(ill, ip_ring, uhead, NULL); + + if (ahead != NULL) { + *last = atail; + *cnt = acnt; + return (ahead); + } + + return (NULL); #undef rptr } @@ -15770,11 +15991,18 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } freemsg(mp); /* Don't want to pass this up */ return; - - case DL_CAPABILITY_REQ: case DL_CONTROL_REQ: + ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " + "DL_CONTROL_REQ\n")); ill_dlpi_done(ill, dlea->dl_error_primitive); - ill->ill_dlpi_capab_state = IDS_FAILED; + freemsg(mp); + return; + case DL_CAPABILITY_REQ: + ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " + "DL_CAPABILITY REQ\n")); + if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) + ill->ill_dlpi_capab_state = IDCS_FAILED; + ill_capability_done(ill); freemsg(mp); return; } @@ -15814,19 +16042,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) dlea->dl_errno, dlea->dl_unix_errno); break; case DL_CAPABILITY_ACK: - /* Call a routine to handle this one. */ - ill_dlpi_done(ill, DL_CAPABILITY_REQ); ill_capability_ack(ill, mp); - /* - * If the ack is due to renegotiation, we will need to send - * a new CAPABILITY_REQ to start the renegotiation. 
+ * The message has been handed off to ill_capability_ack + * and must not be freed below */ - if (ill->ill_capab_reneg) { - ill->ill_capab_reneg = B_FALSE; - ill_capability_probe(ill); - } + mp = NULL; break; + case DL_CONTROL_ACK: /* We treat all of these as "fire and forget" */ ill_dlpi_done(ill, DL_CONTROL_REQ); @@ -16117,10 +16340,9 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * and the renegotiation has not been started yet; * nothing needs to be done in this case. */ - if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) { - ill_capability_reset(ill); - ill->ill_capab_reneg = B_TRUE; - } + ipsq_current_start(ipsq, ill->ill_ipif, 0); + ill_capability_reset(ill, B_TRUE); + ipsq_current_finish(ipsq); break; default: ip0dbg(("ip_rput_dlpi_writer: unknown notification " @@ -16661,7 +16883,8 @@ ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) max_frag -= secopt_size; } - ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, GLOBAL_ZONEID, ipst); + ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, + GLOBAL_ZONEID, ipst, NULL); ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n")); return; } @@ -16677,7 +16900,7 @@ ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) mp->b_prev = (mblk_t *)IPP_FWD_OUT; ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n")); - (void) ip_xmit_v4(mp, ire, NULL, B_FALSE); + (void) ip_xmit_v4(mp, ire, NULL, B_FALSE, NULL); /* ip_xmit_v4 always consumes the packet */ return; @@ -17049,9 +17272,12 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) mp = ip_tcp_input(mp, ipha, ill, B_TRUE, ire, ipsec_mp, 0, ill->ill_rq, NULL); IRE_REFRELE(ire); - if (mp != NULL) - squeue_enter_chain(GET_SQUEUE(mp), mp, - mp, 1, SQTAG_IP_PROTO_AGAIN); + if (mp != NULL) { + + SQUEUE_ENTER(GET_SQUEUE(mp), mp, + mp, 1, SQ_PROCESS, + SQTAG_IP_PROTO_AGAIN); + } break; case IPPROTO_SCTP: if (!ire_need_rele) @@ -21721,7 +21947,7 @@ conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) */ static void ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, - ip_stack_t *ipst) + ip_stack_t *ipst, conn_t *connp) { ipha_t *ipha; mblk_t *mp; @@ -21779,7 +22005,7 @@ ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, - (dont_use ? 0 : frag_flag), zoneid, ipst); + (dont_use ? 
0 : frag_flag), zoneid, ipst, connp); } /* @@ -22502,9 +22728,9 @@ another:; queue_t *dev_q = stq->q_next; /* flow controlled */ - if ((dev_q->q_next || dev_q->q_first) && - !canput(dev_q)) + if (DEV_Q_FLOW_BLOCKED(dev_q)) goto blocked; + if ((PROTO == IPPROTO_UDP) && (ip_hdr_included != IP_HDR_INCLUDED)) { hlen = (V_HLEN & 0xF) << 2; @@ -22685,6 +22911,7 @@ another:; ipst->ips_ipv4firewall_physical_out, NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst); DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + if (mp == NULL) goto release_ire_and_ill; @@ -22703,7 +22930,9 @@ another:; } mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT); DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire); - pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE); + + pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE, connp); + if ((pktxmit_state == SEND_FAILED) || (pktxmit_state == LLHDR_RESLV_FAILED)) { ip2dbg(("ip_wput_ire: ip_xmit_v4 failed" @@ -22976,10 +23205,9 @@ broadcast: #endif sctph->sh_chksum = sctp_cksum(mp, hlen); } else { - queue_t *dev_q = stq->q_next; + queue_t *dev_q = stq->q_next; - if ((dev_q->q_next || dev_q->q_first) && - !canput(dev_q)) { + if (DEV_Q_FLOW_BLOCKED(dev_q)) { blocked: ipha->ipha_ident = ip_hdr_included; /* @@ -23314,7 +23542,7 @@ checksumoptions: DTRACE_PROBE2(ip__xmit__2, mblk_t *, mp, ire_t *, ire); pktxmit_state = ip_xmit_v4(mp, ire, - NULL, B_TRUE); + NULL, B_TRUE, connp); if ((pktxmit_state == SEND_FAILED) || (pktxmit_state == LLHDR_RESLV_FAILED)) { release_ire_and_ill_2: @@ -23471,13 +23699,14 @@ fragmentit: "ip_wput_ire_end: q %p (%S)", q, "last fragmentation"); ip_wput_ire_fragmentit(mp, ire, - zoneid, ipst); + zoneid, ipst, connp); ire_refrele(ire); if (conn_outgoing_ill != NULL) ill_refrele(conn_outgoing_ill); return; } - ip_wput_ire_fragmentit(mp, ire, zoneid, ipst); + ip_wput_ire_fragmentit(mp, ire, + zoneid, ipst, connp); } } } else { @@ -24195,7 +24424,7 @@ pbuf_panic: */ static void ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, - uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst) + uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst, conn_t *connp) { int i1; mblk_t *ll_hdr_mp; @@ -24253,7 +24482,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, */ if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { /* If nce_state is ND_INITIAL, trigger ARP query */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); + (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); ip1dbg(("ip_wput_frag: mac address for ire is unresolved" " - dropping packet\n")); BUMP_MIB(mibptr, ipIfStatsOutFragFails); @@ -24622,7 +24851,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, ipha_t *, ipha, ip6_t *, NULL, int, 0); - putnext(q, xmit_mp); + ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0); BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); UPDATE_MIB(out_ill->ill_ip_mib, @@ -24932,7 +25161,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, __dtrace_ipsr_ill_t *, out_ill, ipha_t *, ipha, ip6_t *, NULL, int, 0); - putnext(q, xmit_mp); + ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0); BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); @@ -26286,7 +26515,8 @@ send: "fragmented accelerated packet!\n")); freemsg(ipsec_mp); } else { - ip_wput_ire_fragmentit(ipsec_mp, ire, zoneid, ipst); + ip_wput_ire_fragmentit(ipsec_mp, ire, + zoneid, ipst, NULL); } if (ire_need_rele) ire_refrele(ire); @@ -26461,7 
+26691,7 @@ send: * Call ip_xmit_v4() to trigger ARP query * in case the nce_state is ND_INITIAL */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); + (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); goto drop_pkt; } @@ -26477,7 +26707,7 @@ send: ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n")); pktxmit_state = ip_xmit_v4(mp, ire, - (io->ipsec_out_accelerated ? io : NULL), B_FALSE); + (io->ipsec_out_accelerated ? io : NULL), B_FALSE, NULL); if ((pktxmit_state == SEND_FAILED) || (pktxmit_state == LLHDR_RESLV_FAILED)) { @@ -27588,9 +27818,9 @@ nak: */ ASSERT(ipsq != NULL); CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mp, + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, ip_resume_tcp_bind, connp, - SQTAG_BIND_RETRY); + SQ_FILL, SQTAG_BIND_RETRY); } else if (IPCL_IS_UDP(connp)) { /* * In the case of UDP endpoint we @@ -28053,7 +28283,7 @@ nak: /* * send out queued packets. */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE); + (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); IRE_REFRELE(ire); return; @@ -28558,6 +28788,25 @@ ip_wsrv(queue_t *q) } /* + * Callback to disable flow control in IP. + * + * This is a mac client callback added when the DLD_CAPAB_DIRECT capability + * is enabled. + * + * When MAC_TX() is not able to send any more packets, dld sets its queue + * to QFULL and enable the STREAMS flow control. Later, when the underlying + * driver is able to continue to send packets, it calls mac_tx_(ring_)update() + * function and wakes up corresponding mac worker threads, which in turn + * calls this callback function, and disables flow control. + */ +/* ARGSUSED */ +void +ill_flow_enable(void *ill, ip_mac_tx_cookie_t cookie) +{ + qenable(((ill_t *)ill)->ill_wq); +} + +/* * Walk the list of all conn's calling the function provided with the * specified argument for each. Note that this only walks conn's that * have been bound. @@ -29280,17 +29529,17 @@ ip_cgtp_filter_is_registered(netstackid_t stackid) return (ret); } -static squeue_func_t +static int ip_squeue_switch(int val) { - squeue_func_t rval = squeue_fill; + int rval = SQ_FILL; switch (val) { case IP_SQUEUE_ENTER_NODRAIN: - rval = squeue_enter_nodrain; + rval = SQ_NODRAIN; break; case IP_SQUEUE_ENTER: - rval = squeue_enter; + rval = SQ_PROCESS; break; default: break; @@ -29312,7 +29561,7 @@ ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, if (ddi_strtol(value, NULL, 10, &new_value) != 0) return (EINVAL); - ip_input_proc = ip_squeue_switch(new_value); + ip_squeue_flag = ip_squeue_switch(new_value); *v = new_value; return (0); } @@ -29983,7 +30232,8 @@ ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4, * ip_wput_frag can call this function. 
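(The conn_t argument being added below is threaded straight through to ILL_SEND_TX() at the transmit site; callers with no sending conn, such as the ARP-trigger and IPsec call sites in this file, pass NULL.)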
*/ ipxmit_state_t -ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, boolean_t flow_ctl_enabled) +ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, + boolean_t flow_ctl_enabled, conn_t *connp) { nce_t *arpce; ipha_t *ipha; @@ -30069,7 +30319,8 @@ ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, boolean_t flow_ctl_enabled) ipha_t *, ipha, ip6_t *, NULL, int, 0); - putnext(q, first_mp); + ILL_SEND_TX(out_ill, + ire, connp, first_mp, 0); } else { BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutDiscards); diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 810cec9e8a..a1d97627b2 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -98,6 +98,7 @@ #include <inet/udp_impl.h> #include <inet/rawip_impl.h> #include <inet/rts_impl.h> +#include <sys/squeue_impl.h> #include <sys/squeue.h> #include <sys/tsol/label.h> @@ -108,7 +109,7 @@ /* Temporary; for CR 6451644 work-around */ #include <sys/ethernet.h> -extern squeue_func_t ip_input_proc; +extern int ip_squeue_flag; /* * Naming conventions: @@ -887,8 +888,8 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, goto drop_pkt; } - squeue_fill(connp->conn_sqp, first_mp, tcp_input, - connp, SQTAG_TCP6_INPUT_ICMP_ERR); + SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp, + SQ_FILL, SQTAG_TCP6_INPUT_ICMP_ERR); return; } @@ -2538,8 +2539,9 @@ ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp, if (mp != NULL) { if (IPCL_IS_TCP(connp)) { CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mp, ip_resume_tcp_bind, - connp, SQTAG_TCP_RPUTOTHER); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + ip_resume_tcp_bind, connp, SQ_FILL, + SQTAG_TCP_RPUTOTHER); } else if (IPCL_IS_UDP(connp)) { udp_resume_bind(connp, mp); } else { @@ -3637,8 +3639,8 @@ ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill, BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); if (IPCL_IS_TCP(connp)) { - (*ip_input_proc)(connp->conn_sqp, first_mp, - connp->conn_recv, connp, SQTAG_IP6_TCP_INPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv, + connp, ip_squeue_flag, SQTAG_IP6_TCP_INPUT); } else { /* SOCK_RAW, IPPROTO_TCP case */ (connp->conn_recv)(connp, first_mp, NULL); @@ -11072,7 +11074,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, /* Driver is flow-controlling? */ if (!IP_FLOW_CONTROLLED_ULP(nexthdr) && - ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) { + DEV_Q_FLOW_BLOCKED(dev_q)) { /* * Queue packet if we have an conn to give back * pressure. We can't queue packets intended for @@ -12140,8 +12142,9 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, "connp %p (ENOMEM)\n", (void *)connp)); } else { CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mdimp, tcp_input, - connp, SQTAG_TCP_INPUT_MCTL); + SQUEUE_ENTER_ONE(connp->conn_sqp, mdimp, + tcp_input, connp, SQ_FILL, + SQTAG_TCP_INPUT_MCTL); } } @@ -12576,34 +12579,8 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, } } else { /* - * Queue packet if we have an conn to give back pressure. - * We can't queue packets intended for hardware acceleration - * since we've tossed that state already. If the packet is - * being fed back from ire_send_v6, we don't know the - * position in the queue to enqueue the packet and we discard - * the packet. 
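(The putbq()-based backpressure machinery is deleted outright here: as the replacement comment below says, the packet is now simply discarded and counted in ipIfStatsOutDiscards, presumably because backpressure is handled at the MAC layer now, cf. the new ill_flow_enable() callback earlier in this diff.)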
- */ - if (ipst->ips_ip_output_queue && (connp != NULL) && - (io == NULL) && (caller != IRE_SEND)) { - if (caller == IP_WSRV) { - connp->conn_did_putbq = 1; - (void) putbq(connp->conn_wq, mp); - conn_drain_insert(connp); - /* - * caller == IP_WSRV implies we are - * the service thread, and the - * queue is already noenabled. - * The check for canput and - * the putbq is not atomic. - * So we need to check again. - */ - if (canput(stq->q_next)) - connp->conn_did_putbq = 0; - } else { - (void) putq(connp->conn_wq, mp); - } - return; - } + * Can't apply backpressure, just discard the packet. + */ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); freemsg(mp); return; diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c index c87267cb29..4fa3c7a74d 100644 --- a/usr/src/uts/common/inet/ip/ip_ftable.c +++ b/usr/src/uts/common/inet/ip/ip_ftable.c @@ -101,6 +101,8 @@ static ire_t *ire_round_robin(irb_t *, zoneid_t, ire_ftable_args_t *, static void ire_del_host_redir(ire_t *, char *); static boolean_t ire_find_best_route(struct radix_node *, void *); static int ip_send_align_hcksum_flags(mblk_t *, ill_t *); +static ire_t *ire_ftable_lookup_simple(ipaddr_t, + ire_t **, zoneid_t, int, ip_stack_t *); /* * Lookup a route in forwarding table. A specific lookup is indicated by @@ -406,6 +408,157 @@ found_ire_held: return (ire); } +/* + * This function is called by + * ip_fast_forward->ire_forward_simple + * The optimizations of this function over ire_ftable_lookup are: + * o removing unnecessary flag matching + * o doing longest prefix match instead of overloading it further + * with the unnecessary "best_prefix_match" + * o Does not do round robin of default route for every packet + * o inlines code of ire_ctable_lookup to look for nexthop cache + * entry before calling ire_route_lookup + */ +static ire_t * +ire_ftable_lookup_simple(ipaddr_t addr, + ire_t **pire, zoneid_t zoneid, int flags, + ip_stack_t *ipst) +{ + ire_t *ire = NULL; + ire_t *tmp_ire = NULL; + struct rt_sockaddr rdst; + struct rt_entry *rt; + irb_t *irb_ptr; + ire_t *save_ire; + int match_flags; + + rdst.rt_sin_len = sizeof (rdst); + rdst.rt_sin_family = AF_INET; + rdst.rt_sin_addr.s_addr = addr; + + /* + * This is basically inlining a simpler version of ire_match_args + */ + RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); + + rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, + ipst->ips_ip_ftable, NULL, NULL); + + if (rt == NULL) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (NULL); + } + irb_ptr = &rt->rt_irb; + if (irb_ptr == NULL || irb_ptr->irb_ire_cnt == 0) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (NULL); + } + + rw_enter(&irb_ptr->irb_lock, RW_READER); + for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { + if (ire->ire_zoneid == zoneid) + break; + } + + if (ire == NULL || (ire->ire_marks & IRE_MARK_CONDEMNED)) { + rw_exit(&irb_ptr->irb_lock); + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (NULL); + } + /* we have a ire that matches */ + if (ire != NULL) + IRE_REFHOLD(ire); + rw_exit(&irb_ptr->irb_lock); + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + + if ((flags & MATCH_IRE_RJ_BHOLE) && + (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) { + return (ire); + } + /* + * At this point, IRE that was found must be an IRE_FORWARDTABLE + * type. If this is a recursive lookup and an IRE_INTERFACE type was + * found, return that. 
If it was some other IRE_FORWARDTABLE type of + * IRE (one of the prefix types), then it is necessary to fill in the + * parent IRE pointed to by pire, and then lookup the gateway address of + * the parent. For backwards compatiblity, if this lookup returns an + * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level + * of lookup is done. + */ + match_flags = MATCH_IRE_DSTONLY; + + if (ire->ire_type & IRE_INTERFACE) + return (ire); + *pire = ire; + /* + * If we can't find an IRE_INTERFACE or the caller has not + * asked for pire, we need to REFRELE the save_ire. + */ + save_ire = ire; + + /* + * Currently MATCH_IRE_ILL is never used with + * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while + * sending out packets as MATCH_IRE_ILL is used only + * for communicating with on-link hosts. We can't assert + * that here as RTM_GET calls this function with + * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. + * We have already used the MATCH_IRE_ILL in determining + * the right prefix route at this point. To match the + * behavior of how we locate routes while sending out + * packets, we don't want to use MATCH_IRE_ILL below + * while locating the interface route. + * + * ire_ftable_lookup may end up with an incomplete IRE_CACHE + * entry for the gateway (i.e., one for which the + * ire_nce->nce_state is not yet ND_REACHABLE). If the caller + * has specified MATCH_IRE_COMPLETE, such entries will not + * be returned; instead, we return the IF_RESOLVER ire. + */ + + if (ire->ire_ipif == NULL) { + tmp_ire = ire; + /* + * Look to see if the nexthop entry is in the + * cachetable (I am inlining a simpler ire_cache_lookup + * here). + */ + ire = ire_cache_lookup_simple(ire->ire_gateway_addr, ipst); + if (ire == NULL) { + /* Try ire_route_lookup */ + ire = tmp_ire; + } else { + goto solved; + } + } + if (ire->ire_ipif != NULL) + match_flags |= MATCH_IRE_ILL_GROUP; + + ire = ire_route_lookup(ire->ire_gateway_addr, 0, + 0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst); +solved: + DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire, + (ire_t *), save_ire); + if (ire == NULL) { + /* + * Do not release the parent ire if MATCH_IRE_PARENT + * is set. Also return it via ire. + */ + ire_refrele(save_ire); + *pire = NULL; + return (ire); + } + if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) { + /* + * If the caller did not ask for pire, release + * it now. + */ + if (pire == NULL) { + ire_refrele(save_ire); + } + } + return (ire); +} /* * Find an IRE_OFFSUBNET IRE entry for the multicast address 'group' @@ -1085,6 +1238,246 @@ icmp_err_ret: ire_refrele(ire); } return (NULL); +} + +/* + * Since caller is ip_fast_forward, there is no CGTP or Tsol test + * Also we dont call ftable lookup with MATCH_IRE_PARENT + */ + +ire_t * +ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, + ip_stack_t *ipst) +{ + ipaddr_t gw = 0; + ire_t *ire = NULL; + ire_t *sire = NULL, *save_ire; + ill_t *dst_ill = NULL; + int error; + zoneid_t zoneid; + ipif_t *src_ipif = NULL; + mblk_t *res_mp; + ushort_t ire_marks = 0; + + zoneid = GLOBAL_ZONEID; + + + ire = ire_ftable_lookup_simple(dst, &sire, zoneid, + MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | + MATCH_IRE_RJ_BHOLE, ipst); + + if (ire == NULL) { + ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); + goto icmp_err_ret; + } + + /* + * Verify that the returned IRE does not have either + * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is + * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 
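(Every failure path from here on jumps to icmp_err_ret at the bottom of the function, which sets *ret_action to Forward_ret_icmp_err, or to Forward_blackhole for RTF_BLACKHOLE routes, and returns NULL; generating the actual ICMP error is left to the caller, ip_fast_forward().)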
+ */ + if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))) { + ASSERT(ire->ire_type & (IRE_CACHE | IRE_INTERFACE)); + ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n", + (void *)ire)); + goto icmp_err_ret; + } + + /* + * If we already have a fully resolved IRE CACHE of the + * nexthop router, just hand over the cache entry + * and we are done. + */ + + if (ire->ire_type & IRE_CACHE) { + + /* + * If we are using this ire cache entry as a + * gateway to forward packets, chances are we + * will be using it again. So turn off + * the temporary flag, thus reducing its + * chances of getting deleted frequently. + */ + if (ire->ire_marks & IRE_MARK_TEMPORARY) { + irb_t *irb = ire->ire_bucket; + rw_enter(&irb->irb_lock, RW_WRITER); + ire->ire_marks &= ~IRE_MARK_TEMPORARY; + irb->irb_tmp_ire_cnt--; + rw_exit(&irb->irb_lock); + } + + if (sire != NULL) { + UPDATE_OB_PKT_COUNT(sire); + ire_refrele(sire); + } + *ret_action = Forward_ok; + return (ire); + } + /* + * Increment the ire_ob_pkt_count field for ire if it is an + * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and + * increment the same for the parent IRE, sire, if it is some + * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST). + */ + if ((ire->ire_type & IRE_INTERFACE) != 0) { + UPDATE_OB_PKT_COUNT(ire); + ire->ire_last_used_time = lbolt; + } + + /* + * sire must be either IRE_CACHETABLE OR IRE_INTERFACE type + */ + if (sire != NULL) { + gw = sire->ire_gateway_addr; + ASSERT((sire->ire_type & + (IRE_CACHETABLE | IRE_INTERFACE)) == 0); + UPDATE_OB_PKT_COUNT(sire); + } + + /* Obtain dst_ill */ + dst_ill = ip_newroute_get_dst_ill(ire->ire_ipif->ipif_ill); + if (dst_ill == NULL) { + ip2dbg(("ire_forward no dst ill; ire 0x%p\n", + (void *)ire)); + goto icmp_err_ret; + } + + ASSERT(src_ipif == NULL); + /* Now obtain the src_ipif */ + src_ipif = ire_forward_src_ipif(dst, sire, ire, dst_ill, + zoneid, &ire_marks); + if (src_ipif == NULL) + goto icmp_err_ret; + + switch (ire->ire_type) { + case IRE_IF_NORESOLVER: + /* create ire_cache for ire_addr endpoint */ + case IRE_IF_RESOLVER: + /* + * We have the IRE_IF_RESOLVER of the nexthop gateway + * and now need to build a IRE_CACHE for it. + * In this case, we have the following : + * + * 1) src_ipif - used for getting a source address. + * + * 2) dst_ill - from which we derive ire_stq/ire_rfq. This + * means packets using the IRE_CACHE that we will build + * here will go out on dst_ill. + * + * 3) sire may or may not be NULL. But, the IRE_CACHE that is + * to be created will only be tied to the IRE_INTERFACE + * that was derived from the ire_ihandle field. + * + * If sire is non-NULL, it means the destination is + * off-link and we will first create the IRE_CACHE for the + * gateway. + */ + res_mp = dst_ill->ill_resolver_mp; + if (ire->ire_type == IRE_IF_RESOLVER && + (!OK_RESOLVER_MP(res_mp))) { + ire_refrele(ire); + ire = NULL; + goto out; + } + /* + * To be at this point in the code with a non-zero gw + * means that dst is reachable through a gateway that + * we have never resolved. By changing dst to the gw + * addr we resolve the gateway first. + */ + if (gw != INADDR_ANY) { + /* + * The source ipif that was determined above was + * relative to the destination address, not the + * gateway's. If src_ipif was not taken out of + * the IRE_IF_RESOLVER entry, we'll need to call + * ipif_select_source() again. 
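(That is, re-select the source address relative to gw rather than dst, then resolve the gateway itself by substituting it for dst, which is exactly what the dst = gw; gw = INADDR_ANY; lines below do.)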
+ */ + if (src_ipif != ire->ire_ipif) { + ipif_refrele(src_ipif); + src_ipif = ipif_select_source(dst_ill, + gw, zoneid); + if (src_ipif == NULL) + goto icmp_err_ret; + } + dst = gw; + gw = INADDR_ANY; + } + + if (ire->ire_type == IRE_IF_NORESOLVER) + dst = ire->ire_addr; /* ire_cache for tunnel endpoint */ + + save_ire = ire; + /* + * create an incomplete IRE_CACHE. + * An areq_mp will be generated in ire_arpresolve() for + * RESOLVER interfaces. + */ + ire = ire_create( + (uchar_t *)&dst, /* dest address */ + (uchar_t *)&ip_g_all_ones, /* mask */ + (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ + (uchar_t *)&gw, /* gateway address */ + (save_ire->ire_type == IRE_IF_RESOLVER ? NULL: + &save_ire->ire_max_frag), + NULL, + dst_ill->ill_rq, /* recv-from queue */ + dst_ill->ill_wq, /* send-to queue */ + IRE_CACHE, /* IRE type */ + src_ipif, + ire->ire_mask, /* Parent mask */ + 0, + ire->ire_ihandle, /* Interface handle */ + 0, + &(ire->ire_uinfo), + NULL, + NULL, + ipst); + ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire)); + if (ire != NULL) { + ire->ire_marks |= ire_marks; + /* add the incomplete ire: */ + error = ire_add(&ire, NULL, NULL, NULL, B_TRUE); + if (error == 0 && ire != NULL) { + ire->ire_max_frag = save_ire->ire_max_frag; + ip1dbg(("setting max_frag to %d in ire 0x%p\n", + ire->ire_max_frag, (void *)ire)); + } else { + ire_refrele(save_ire); + goto icmp_err_ret; + } + } + + ire_refrele(save_ire); + break; + default: + break; + } + +out: + *ret_action = Forward_ok; + if (sire != NULL) + ire_refrele(sire); + if (dst_ill != NULL) + ill_refrele(dst_ill); + if (src_ipif != NULL) + ipif_refrele(src_ipif); + return (ire); +icmp_err_ret: + *ret_action = Forward_ret_icmp_err; + if (src_ipif != NULL) + ipif_refrele(src_ipif); + if (dst_ill != NULL) + ill_refrele(dst_ill); + if (sire != NULL) + ire_refrele(sire); + if (ire != NULL) { + if (ire->ire_flags & RTF_BLACKHOLE) + *ret_action = Forward_blackhole; + ire_refrele(ire); + } + /* caller needs to send icmp error message */ + return (NULL); } @@ -1439,7 +1832,7 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, * if necessary and send it once ready. 
*/ - value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE); + value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE, NULL); cleanup: ire_refrele(ire_cache); /* diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 3b8ff6b5d9..d767b25a76 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -44,6 +44,8 @@ #include <sys/sunldi.h> #include <sys/file.h> #include <sys/bitmap.h> +#include <sys/cpuvar.h> +#include <sys/time.h> #include <sys/kmem.h> #include <sys/systm.h> #include <sys/param.h> @@ -62,6 +64,7 @@ #include <sys/strsun.h> #include <sys/policy.h> #include <sys/ethernet.h> +#include <sys/callb.h> #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ #include <inet/mi.h> @@ -94,7 +97,8 @@ #include <netinet/igmp.h> #include <inet/ip_listutils.h> #include <inet/ipclassifier.h> -#include <sys/mac.h> +#include <sys/mac_client.h> +#include <sys/dld.h> #include <sys/systeminfo.h> #include <sys/bootconf.h> @@ -224,25 +228,27 @@ static void ill_ipsec_capab_free(ill_ipsec_capab_t *); static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); static void ill_ipsec_capab_delete(ill_t *, uint_t); static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); -static void ill_capability_proto(ill_t *, int, mblk_t *); static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, boolean_t); static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_mdt_reset(ill_t *, mblk_t **); +static void ill_capability_mdt_reset_fill(ill_t *, mblk_t *); static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_ipsec_reset(ill_t *, mblk_t **); +static void ill_capability_ipsec_reset_fill(ill_t *, mblk_t *); static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_hcksum_reset(ill_t *, mblk_t **); +static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *); static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_zerocopy_reset(ill_t *, mblk_t **); -static void ill_capability_lso_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_lso_reset(ill_t *, mblk_t **); -static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *); -static void ill_capability_dls_reset(ill_t *, mblk_t **); -static void ill_capability_dls_disable(ill_t *); +static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *); +static int ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *, + int *); +static void ill_capability_dld_reset_fill(ill_t *, mblk_t *); +static void ill_capability_dld_ack(ill_t *, mblk_t *, + dl_capability_sub_t *); +static void ill_capability_dld_enable(ill_t *); +static void ill_capability_ack_thr(void *); +static void ill_capability_lso_enable(ill_t *); +static void ill_capability_send(ill_t *, mblk_t *); static void illgrp_cache_delete(ire_t *, char *); static void illgrp_delete(ill_t *ill); @@ -523,16 +529,6 @@ static ipif_t ipif_zero; */ uint_t ill_no_arena = 12; /* Setable in /etc/system */ -/* - * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout - * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is - * set through platform specific code (Niagara/Ontario). 
- */ -#define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \ - (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE) - -#define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL) - static uint_t ipif_rand(ip_stack_t *ipst) { @@ -824,12 +820,8 @@ ill_delete_tail(ill_t *ill) while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS) cv_wait(&ill->ill_cv, &ill->ill_lock); mutex_exit(&ill->ill_lock); - - /* - * Clean up polling and soft ring capabilities - */ - if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) - ill_capability_dls_disable(ill); + ASSERT(!(ill->ill_capabilities & + (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT))); if (ill->ill_net_type != IRE_LOOPBACK) qprocsoff(ill->ill_rq); @@ -879,17 +871,11 @@ ill_delete_tail(ill_t *ill) ill->ill_lso_capab = NULL; } - if (ill->ill_dls_capab != NULL) { - CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn); - ill->ill_dls_capab->ill_unbind_conn = NULL; - kmem_free(ill->ill_dls_capab, - sizeof (ill_dls_capab_t) + - (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS)); - ill->ill_dls_capab = NULL; + if (ill->ill_dld_capab != NULL) { + kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t)); + ill->ill_dld_capab = NULL; } - ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL)); - while (ill->ill_ipif != NULL) ipif_free_tail(ill->ill_ipif); @@ -1478,7 +1464,7 @@ conn_ioctl_cleanup(conn_t *connp) refheld = ill_waiter_inc(ill); mutex_exit(&connp->conn_lock); if (refheld) { - if (ipsq_enter(ill, B_TRUE)) { + if (ipsq_enter(ill, B_TRUE, NEW_OP)) { ill_waiter_dcr(ill); /* * Check whether this ioctl has started and is @@ -1742,104 +1728,114 @@ ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) void ill_capability_probe(ill_t *ill) { + mblk_t *mp; + + ASSERT(IAM_WRITER_ILL(ill)); + + if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN && + ill->ill_dlpi_capab_state != IDCS_FAILED) + return; + /* - * Do so only if capabilities are still unknown. + * We are starting a new cycle of capability negotiation. + * Free up the capab reset messages of any previous incarnation. + * We will do a fresh allocation when we get the response to our probe */ - if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) - return; + if (ill->ill_capab_reset_mp != NULL) { + freemsg(ill->ill_capab_reset_mp); + ill->ill_capab_reset_mp = NULL; + } - ill->ill_dlpi_capab_state = IDS_INPROGRESS; ip1dbg(("ill_capability_probe: starting capability negotiation\n")); - ill_capability_proto(ill, DL_CAPABILITY_REQ, NULL); + + mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); + if (mp == NULL) + return; + + ill_capability_send(ill, mp); + ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; } void -ill_capability_reset(ill_t *ill) -{ - mblk_t *sc_mp = NULL; - mblk_t *tmp; - - /* - * Note here that we reset the state to UNKNOWN, and later send - * down the DL_CAPABILITY_REQ without first setting the state to - * INPROGRESS. We do this in order to distinguish the - * DL_CAPABILITY_ACK response which may come back in response to - * a "reset" apart from the "probe" DL_CAPABILITY_REQ. This would - * also handle the case where the driver doesn't send us back - * a DL_CAPABILITY_ACK in response, since the "probe" routine - * requires the state to be in UNKNOWN anyway. In any case, all - * features are turned off until the state reaches IDS_OK. - */ - ill->ill_dlpi_capab_state = IDS_UNKNOWN; - ill->ill_capab_reneg = B_FALSE; - - /* - * Disable sub-capabilities and request a list of sub-capability - * messages which will be sent down to the driver. 
Each handler - * allocates the corresponding dl_capability_sub_t inside an - * mblk, and links it to the existing sc_mp mblk, or return it - * as sc_mp if it's the first sub-capability (the passed in - * sc_mp is NULL). Upon returning from all capability handlers, - * sc_mp will be pulled-up, before passing it downstream. - */ - ill_capability_mdt_reset(ill, &sc_mp); - ill_capability_hcksum_reset(ill, &sc_mp); - ill_capability_zerocopy_reset(ill, &sc_mp); - ill_capability_ipsec_reset(ill, &sc_mp); - ill_capability_dls_reset(ill, &sc_mp); - ill_capability_lso_reset(ill, &sc_mp); - - /* Nothing to send down in order to disable the capabilities? */ - if (sc_mp == NULL) - return; +ill_capability_reset(ill_t *ill, boolean_t reneg) +{ + ASSERT(IAM_WRITER_ILL(ill)); - tmp = msgpullup(sc_mp, -1); - freemsg(sc_mp); - if ((sc_mp = tmp) == NULL) { - cmn_err(CE_WARN, "ill_capability_reset: unable to send down " - "DL_CAPABILITY_REQ (ENOMEM)\n"); + if (ill->ill_dlpi_capab_state != IDCS_OK) return; - } - ip1dbg(("ill_capability_reset: resetting negotiated capabilities\n")); - ill_capability_proto(ill, DL_CAPABILITY_REQ, sc_mp); + ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; + + ill_capability_send(ill, ill->ill_capab_reset_mp); + ill->ill_capab_reset_mp = NULL; + /* + * We turn off all capabilities except those pertaining to + * direct function call capabilities viz. ILL_CAPAB_DLD* + * which will be turned off by the corresponding reset functions. + */ + ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM | + ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP); } -/* - * Request or set new-style hardware capabilities supported by DLS provider. - */ static void -ill_capability_proto(ill_t *ill, int type, mblk_t *reqp) +ill_capability_reset_alloc(ill_t *ill) { mblk_t *mp; - dl_capability_req_t *capb; - size_t size = 0; - uint8_t *ptr; + size_t size = 0; + int err; + dl_capability_req_t *capb; - if (reqp != NULL) - size = MBLKL(reqp); + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(ill->ill_capab_reset_mp == NULL); - mp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + size, type); - if (mp == NULL) { - freemsg(reqp); - return; + if (ILL_MDT_CAPABLE(ill)) + size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); + + if (ILL_HCKSUM_CAPABLE(ill)) { + size += sizeof (dl_capability_sub_t) + + sizeof (dl_capab_hcksum_t); } - ptr = mp->b_rptr; - capb = (dl_capability_req_t *)ptr; - ptr += sizeof (dl_capability_req_t); + if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) { + size += sizeof (dl_capability_sub_t) + + sizeof (dl_capab_zerocopy_t); + } - if (reqp != NULL) { - capb->dl_sub_offset = sizeof (dl_capability_req_t); - capb->dl_sub_length = size; - bcopy(reqp->b_rptr, ptr, size); - ptr += size; - mp->b_cont = reqp->b_cont; - freeb(reqp); + if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) { + size += sizeof (dl_capability_sub_t); + size += ill_capability_ipsec_reset_size(ill, NULL, NULL, + NULL, NULL); } - ASSERT(ptr == mp->b_wptr); - ill_dlpi_send(ill, mp); + if (ill->ill_capabilities & ILL_CAPAB_DLD) { + size += sizeof (dl_capability_sub_t) + + sizeof (dl_capab_dld_t); + } + + mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED, + STR_NOSIG, &err); + + mp->b_datap->db_type = M_PROTO; + bzero(mp->b_rptr, size + sizeof (dl_capability_req_t)); + + capb = (dl_capability_req_t *)mp->b_rptr; + capb->dl_primitive = DL_CAPABILITY_REQ; + capb->dl_sub_offset = sizeof (dl_capability_req_t); + capb->dl_sub_length = size; + + mp->b_wptr += sizeof (dl_capability_req_t); + + /* 
+ * Each handler fills in the corresponding dl_capability_sub_t + * inside the mblk, + */ + ill_capability_mdt_reset_fill(ill, mp); + ill_capability_hcksum_reset_fill(ill, mp); + ill_capability_zerocopy_reset_fill(ill, mp); + ill_capability_ipsec_reset_fill(ill, mp); + ill_capability_dld_reset_fill(ill, mp); + + ill->ill_capab_reset_mp = mp; } static void @@ -1944,7 +1940,6 @@ ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) if (*ill_mdt_capab == NULL) { *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), KM_NOSLEEP); - if (*ill_mdt_capab == NULL) { cmn_err(CE_WARN, "ill_capability_mdt_ack: " "could not enable MDT version %d " @@ -2017,42 +2012,22 @@ ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ - ill_dlpi_send(ill, nmp); + ill_capability_send(ill, nmp); } } static void -ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) +ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp) { - mblk_t *mp; dl_capab_mdt_t *mdt_subcap; dl_capability_sub_t *dl_subcap; - int size; if (!ILL_MDT_CAPABLE(ill)) return; ASSERT(ill->ill_mdt_capab != NULL); - /* - * Clear the capability flag for MDT but retain the ill_mdt_capab - * structure since it's possible that another thread is still - * referring to it. The structure only gets deallocated when - * we destroy the ill. - */ - ill->ill_capabilities &= ~ILL_CAPAB_MDT; - - size = sizeof (*dl_subcap) + sizeof (*mdt_subcap); - - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_mdt_reset: unable to allocate " - "request to disable MDT\n")); - return; - } - mp->b_wptr = mp->b_rptr + size; - - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; + dl_subcap = (dl_capability_sub_t *)mp->b_wptr; dl_subcap->dl_cap = DL_CAPAB_MDT; dl_subcap->dl_length = sizeof (*mdt_subcap); @@ -2062,10 +2037,26 @@ ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) mdt_subcap->mdt_hdr_head = 0; mdt_subcap->mdt_hdr_tail = 0; - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; + mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap); +} + +static void +ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) +{ + dl_capability_sub_t *dl_subcap; + + if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) + return; + + /* + * The dl_capab_dld_t that follows the dl_capability_sub_t is not + * initialized below since it is not used by DLD. + */ + dl_subcap = (dl_capability_sub_t *)mp->b_wptr; + dl_subcap->dl_cap = DL_CAPAB_DLD; + dl_subcap->dl_length = sizeof (dl_capab_dld_t); + + mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); } /* @@ -2371,7 +2362,7 @@ ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) * nmp points to a DL_CAPABILITY_REQ message to enable * IPsec hardware acceleration. 
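	 * (ill_capability_send() counts each such outstanding request in
	 * ill_capab_pending_cnt, so that ill_capability_done() can tell
	 * when the exchange has finished.)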
*/ - ill_dlpi_send(ill, nmp); + ill_capability_send(ill, nmp); if (need_sadb_dump) /* @@ -2457,10 +2448,10 @@ ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, } /* ARGSUSED */ -static void -ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) +static int +ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp, + int *esp_cntp, int *esp_lenp) { - mblk_t *mp; ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; uint64_t ill_capabilities = ill->ill_capabilities; @@ -2469,7 +2460,7 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) int i, size = 0; if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) - return; + return (0); ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); @@ -2504,18 +2495,32 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) } } - if (size == 0) { - ip1dbg(("ill_capability_ipsec_reset: capabilities exist but " - "there's nothing to reset\n")); - return; - } + if (ah_cntp != NULL) + *ah_cntp = ah_cnt; + if (ah_lenp != NULL) + *ah_lenp = ah_len; + if (esp_cntp != NULL) + *esp_cntp = esp_cnt; + if (esp_lenp != NULL) + *esp_lenp = esp_len; - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_ipsec_reset: unable to allocate " - "request to disable IPSEC Hardware Acceleration\n")); + return (size); +} + +/* ARGSUSED */ +static void +ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp) +{ + ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; + ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; + int ah_cnt = 0, esp_cnt = 0; + int ah_len = 0, esp_len = 0; + int size; + + size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len, + &esp_cnt, &esp_len); + if (size == 0) return; - } /* * Clear the capability flags for IPsec HA but retain the ill @@ -2527,20 +2532,17 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) * hardware acceleration, and by clearing them we ensure that new * outbound IPsec packets are sent down encrypted. */ - ill->ill_capabilities &= ~(ILL_CAPAB_AH | ILL_CAPAB_ESP); /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ if (ah_cnt > 0) { ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, cap_ah, mp); - ASSERT(mp->b_rptr + size >= mp->b_wptr); } /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ if (esp_cnt > 0) { ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, cap_esp, mp); - ASSERT(mp->b_rptr + size >= mp->b_wptr); } /* @@ -2550,11 +2552,6 @@ ill_capability_ipsec_reset(ill_t *ill, mblk_t **sc_mp) * must stop inbound decryption (by destroying all inbound SAs) * and let the corresponding packets come in encrypted. */ - - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; } static void @@ -2564,15 +2561,6 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, boolean_t legacy = B_FALSE; /* - * If this DL_CAPABILITY_ACK came in as a response to our "reset" - * DL_CAPABILITY_REQ, ignore it during this cycle. We've just - * instructed the driver to disable its advertised capabilities, - * so there's no point in accepting any response at this moment. 
- */ - if (ill->ill_dlpi_capab_state == IDS_UNKNOWN) - return; - - /* * Note that only the following two sub-capabilities may be * considered as "legacy", since their original definitions * do not incorporate the dl_mid_t module ID token, and hence @@ -2611,16 +2599,8 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, case DL_CAPAB_ZEROCOPY: ill_capability_zerocopy_ack(ill, mp, subp); break; - case DL_CAPAB_POLL: - if (!SOFT_RINGS_ENABLED()) - ill_capability_dls_ack(ill, mp, subp); - break; - case DL_CAPAB_SOFT_RING: - if (SOFT_RINGS_ENABLED()) - ill_capability_dls_ack(ill, mp, subp); - break; - case DL_CAPAB_LSO: - ill_capability_lso_ack(ill, mp, subp); + case DL_CAPAB_DLD: + ill_capability_dld_ack(ill, mp, subp); break; default: ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", @@ -2629,407 +2609,6 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, } /* - * As part of negotiating polling capability, the driver tells us - * the default (or normal) blanking interval and packet threshold - * (the receive timer fires if blanking interval is reached or - * the packet threshold is reached). - * - * As part of manipulating the polling interval, we always use our - * estimated interval (avg service time * number of packets queued - * on the squeue) but we try to blank for a minimum of - * rr_normal_blank_time * rr_max_blank_ratio. We disable the - * packet threshold during this time. When we are not in polling mode - * we set the blank interval typically lower, rr_normal_pkt_cnt * - * rr_min_blank_ratio but up the packet cnt by a ratio of - * rr_min_pkt_cnt_ratio so that we are still getting chains if - * possible although for a shorter interval. - */ -#define RR_MAX_BLANK_RATIO 20 -#define RR_MIN_BLANK_RATIO 10 -#define RR_MAX_PKT_CNT_RATIO 3 -#define RR_MIN_PKT_CNT_RATIO 3 - -/* - * These can be tuned via /etc/system. 
- */ -int rr_max_blank_ratio = RR_MAX_BLANK_RATIO; -int rr_min_blank_ratio = RR_MIN_BLANK_RATIO; -int rr_max_pkt_cnt_ratio = RR_MAX_PKT_CNT_RATIO; -int rr_min_pkt_cnt_ratio = RR_MIN_PKT_CNT_RATIO; - -static mac_resource_handle_t -ill_ring_add(void *arg, mac_resource_t *mrp) -{ - ill_t *ill = (ill_t *)arg; - mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; - ill_rx_ring_t *rx_ring; - int ip_rx_index; - - ASSERT(mrp != NULL); - if (mrp->mr_type != MAC_RX_FIFO) { - return (NULL); - } - ASSERT(ill != NULL); - ASSERT(ill->ill_dls_capab != NULL); - - mutex_enter(&ill->ill_lock); - for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { - rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; - ASSERT(rx_ring != NULL); - - if (rx_ring->rr_ring_state == ILL_RING_FREE) { - time_t normal_blank_time = - mrfp->mrf_normal_blank_time; - uint_t normal_pkt_cnt = - mrfp->mrf_normal_pkt_count; - - bzero(rx_ring, sizeof (ill_rx_ring_t)); - - rx_ring->rr_blank = mrfp->mrf_blank; - rx_ring->rr_handle = mrfp->mrf_arg; - rx_ring->rr_ill = ill; - rx_ring->rr_normal_blank_time = normal_blank_time; - rx_ring->rr_normal_pkt_cnt = normal_pkt_cnt; - - rx_ring->rr_max_blank_time = - normal_blank_time * rr_max_blank_ratio; - rx_ring->rr_min_blank_time = - normal_blank_time * rr_min_blank_ratio; - rx_ring->rr_max_pkt_cnt = - normal_pkt_cnt * rr_max_pkt_cnt_ratio; - rx_ring->rr_min_pkt_cnt = - normal_pkt_cnt * rr_min_pkt_cnt_ratio; - - rx_ring->rr_ring_state = ILL_RING_INUSE; - mutex_exit(&ill->ill_lock); - - DTRACE_PROBE2(ill__ring__add, (void *), ill, - (int), ip_rx_index); - return ((mac_resource_handle_t)rx_ring); - } - } - - /* - * We ran out of ILL_MAX_RINGS worth rx_ring structures. If - * we have devices which can overwhelm this limit, ILL_MAX_RING - * should be made configurable. Meanwhile it cause no panic because - * driver will pass ip_input a NULL handle which will make - * IP allocate the default squeue and Polling mode will not - * be used for this ring. 
- */ - cmn_err(CE_NOTE, "Reached maximum number of receiving rings (%d) " - "for %s\n", ILL_MAX_RINGS, ill->ill_name); - - mutex_exit(&ill->ill_lock); - return (NULL); -} - -static boolean_t -ill_capability_dls_init(ill_t *ill) -{ - ill_dls_capab_t *ill_dls = ill->ill_dls_capab; - conn_t *connp; - size_t sz; - ip_stack_t *ipst = ill->ill_ipst; - - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { - if (ill_dls == NULL) { - cmn_err(CE_PANIC, "ill_capability_dls_init: " - "soft_ring enabled for ill=%s (%p) but data " - "structs uninitialized\n", ill->ill_name, - (void *)ill); - } - return (B_TRUE); - } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { - if (ill_dls == NULL) { - cmn_err(CE_PANIC, "ill_capability_dls_init: " - "polling enabled for ill=%s (%p) but data " - "structs uninitialized\n", ill->ill_name, - (void *)ill); - } - return (B_TRUE); - } - - if (ill_dls != NULL) { - ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; - /* Soft_Ring or polling is being re-enabled */ - - connp = ill_dls->ill_unbind_conn; - ASSERT(rx_ring != NULL); - bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); - bzero((void *)rx_ring, - sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); - ill_dls->ill_ring_tbl = rx_ring; - ill_dls->ill_unbind_conn = connp; - return (B_TRUE); - } - - if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, - ipst->ips_netstack)) == NULL) - return (B_FALSE); - - sz = sizeof (ill_dls_capab_t); - sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; - - ill_dls = kmem_zalloc(sz, KM_NOSLEEP); - if (ill_dls == NULL) { - cmn_err(CE_WARN, "ill_capability_dls_init: could not " - "allocate dls_capab for %s (%p)\n", ill->ill_name, - (void *)ill); - CONN_DEC_REF(connp); - return (B_FALSE); - } - - /* Allocate space to hold ring table */ - ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; - ill->ill_dls_capab = ill_dls; - ill_dls->ill_unbind_conn = connp; - return (B_TRUE); -} - -/* - * ill_capability_dls_disable: disable soft_ring and/or polling - * capability. Since any of the rings might already be in use, need - * to call ip_squeue_clean_all() which gets behind the squeue to disable - * direct calls if necessary. 
- */ -static void -ill_capability_dls_disable(ill_t *ill) -{ - ill_dls_capab_t *ill_dls = ill->ill_dls_capab; - - if (ill->ill_capabilities & ILL_CAPAB_DLS) { - ip_squeue_clean_all(ill); - ill_dls->ill_tx = NULL; - ill_dls->ill_tx_handle = NULL; - ill_dls->ill_dls_change_status = NULL; - ill_dls->ill_dls_bind = NULL; - ill_dls->ill_dls_unbind = NULL; - } - - ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); -} - -static void -ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, - dl_capability_sub_t *isub) -{ - uint_t size; - uchar_t *rptr; - dl_capab_dls_t dls, *odls; - ill_dls_capab_t *ill_dls; - mblk_t *nmp = NULL; - dl_capability_req_t *ocap; - uint_t sub_dl_cap = isub->dl_cap; - - if (!ill_capability_dls_init(ill)) - return; - ill_dls = ill->ill_dls_capab; - - /* Copy locally to get the members aligned */ - bcopy((void *)idls, (void *)&dls, - sizeof (dl_capab_dls_t)); - - /* Get the tx function and handle from dld */ - ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; - ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; - - if (sub_dl_cap == DL_CAPAB_SOFT_RING) { - ill_dls->ill_dls_change_status = - (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; - ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; - ill_dls->ill_dls_unbind = - (ip_dls_unbind_t)dls.dls_ring_unbind; - ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; - } - - size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + - isub->dl_length; - - if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { - cmn_err(CE_WARN, "ill_capability_dls_capable: could " - "not allocate memory for CAPAB_REQ for %s (%p)\n", - ill->ill_name, (void *)ill); - return; - } - - /* initialize dl_capability_req_t */ - rptr = nmp->b_rptr; - ocap = (dl_capability_req_t *)rptr; - ocap->dl_sub_offset = sizeof (dl_capability_req_t); - ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; - rptr += sizeof (dl_capability_req_t); - - /* initialize dl_capability_sub_t */ - bcopy(isub, rptr, sizeof (*isub)); - rptr += sizeof (*isub); - - odls = (dl_capab_dls_t *)rptr; - rptr += sizeof (dl_capab_dls_t); - - /* initialize dl_capab_dls_t to be sent down */ - dls.dls_rx_handle = (uintptr_t)ill; - dls.dls_rx = (uintptr_t)ip_input; - dls.dls_ring_add = (uintptr_t)ill_ring_add; - - if (sub_dl_cap == DL_CAPAB_SOFT_RING) { - dls.dls_ring_cnt = ip_soft_rings_cnt; - dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; - dls.dls_flags = SOFT_RING_ENABLE; - } else { - dls.dls_flags = POLL_ENABLE; - ip1dbg(("ill_capability_dls_capable: asking interface %s " - "to enable polling\n", ill->ill_name)); - } - bcopy((void *)&dls, (void *)odls, - sizeof (dl_capab_dls_t)); - ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); - /* - * nmp points to a DL_CAPABILITY_REQ message to - * enable either soft_ring or polling - */ - ill_dlpi_send(ill, nmp); -} - -static void -ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) -{ - mblk_t *mp; - dl_capab_dls_t *idls; - dl_capability_sub_t *dl_subcap; - int size; - - if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) - return; - - ASSERT(ill->ill_dls_capab != NULL); - - size = sizeof (*dl_subcap) + sizeof (*idls); - - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_dls_reset: unable to allocate " - "request to disable soft_ring\n")); - return; - } - - mp->b_wptr = mp->b_rptr + size; - - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; - dl_subcap->dl_length = sizeof (*idls); - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) - dl_subcap->dl_cap = DL_CAPAB_SOFT_RING; - else - 
dl_subcap->dl_cap = DL_CAPAB_POLL; - - idls = (dl_capab_dls_t *)(dl_subcap + 1); - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) - idls->dls_flags = SOFT_RING_DISABLE; - else - idls->dls_flags = POLL_DISABLE; - - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; -} - -/* - * Process a soft_ring/poll capability negotiation ack received - * from a DLS Provider.isub must point to the sub-capability - * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message. - */ -static void -ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) -{ - dl_capab_dls_t *idls; - uint_t sub_dl_cap = isub->dl_cap; - uint8_t *capend; - - ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING || - sub_dl_cap == DL_CAPAB_POLL); - - if (ill->ill_isv6) - return; - - /* - * Note: range checks here are not absolutely sufficient to - * make us robust against malformed messages sent by drivers; - * this is in keeping with the rest of IP's dlpi handling. - * (Remember, it's coming from something else in the kernel - * address space) - */ - capend = (uint8_t *)(isub + 1) + isub->dl_length; - if (capend > mp->b_wptr) { - cmn_err(CE_WARN, "ill_capability_dls_ack: " - "malformed sub-capability too long for mblk"); - return; - } - - /* - * There are two types of acks we process here: - * 1. acks in reply to a (first form) generic capability req - * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE) - * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE - * capability req. - */ - idls = (dl_capab_dls_t *)(isub + 1); - - if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) { - ip1dbg(("ill_capability_dls_ack: mid token for dls " - "capability isn't as expected; pass-thru " - "module(s) detected, discarding capability\n")); - if (ill->ill_capabilities & ILL_CAPAB_DLS) { - /* - * This is a capability renegotitation case. - * The interface better be unusable at this - * point other wise bad things will happen - * if we disable direct calls on a running - * and up interface. - */ - ill_capability_dls_disable(ill); - } - return; - } - - switch (idls->dls_flags) { - default: - /* Disable if unknown flag */ - case SOFT_RING_DISABLE: - case POLL_DISABLE: - ill_capability_dls_disable(ill); - break; - case SOFT_RING_CAPABLE: - case POLL_CAPABLE: - /* - * If the capability was already enabled, its safe - * to disable it first to get rid of stale information - * and then start enabling it again. - */ - ill_capability_dls_disable(ill); - ill_capability_dls_capable(ill, idls, isub); - break; - case SOFT_RING_ENABLE: - case POLL_ENABLE: - mutex_enter(&ill->ill_lock); - if (sub_dl_cap == DL_CAPAB_SOFT_RING && - !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { - ASSERT(ill->ill_dls_capab != NULL); - ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; - } - if (sub_dl_cap == DL_CAPAB_POLL && - !(ill->ill_capabilities & ILL_CAPAB_POLL)) { - ASSERT(ill->ill_dls_capab != NULL); - ill->ill_capabilities |= ILL_CAPAB_POLL; - ip1dbg(("ill_capability_dls_ack: interface %s " - "has enabled polling\n", ill->ill_name)); - } - mutex_exit(&ill->ill_lock); - break; - } -} - -/* * Process a hardware checksum offload capability negotiation ack received * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) * of a DL_CAPABILITY_ACK message. @@ -3164,7 +2743,7 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) * nmp points to a DL_CAPABILITY_REQ message to enable * hardware checksum acceleration. 
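	 * (As with the other sub-capabilities, the driver's response to
	 * this enable request arrives as another DL_CAPABILITY_ACK and is
	 * routed back through ill_capability_dispatch().)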
*/ - ill_dlpi_send(ill, nmp); + ill_capability_send(ill, nmp); } else { ip1dbg(("ill_capability_hcksum_ack: interface %s has " "advertised %x hardware checksum capability flags\n", @@ -3173,37 +2752,17 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) } static void -ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) +ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp) { - mblk_t *mp; dl_capab_hcksum_t *hck_subcap; dl_capability_sub_t *dl_subcap; - int size; if (!ILL_HCKSUM_CAPABLE(ill)) return; ASSERT(ill->ill_hcksum_capab != NULL); - /* - * Clear the capability flag for hardware checksum offload but - * retain the ill_hcksum_capab structure since it's possible that - * another thread is still referring to it. The structure only - * gets deallocated when we destroy the ill. - */ - ill->ill_capabilities &= ~ILL_CAPAB_HCKSUM; - size = sizeof (*dl_subcap) + sizeof (*hck_subcap); - - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_hcksum_reset: unable to allocate " - "request to disable hardware checksum offload\n")); - return; - } - - mp->b_wptr = mp->b_rptr + size; - - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; + dl_subcap = (dl_capability_sub_t *)mp->b_wptr; dl_subcap->dl_cap = DL_CAPAB_HCKSUM; dl_subcap->dl_length = sizeof (*hck_subcap); @@ -3211,10 +2770,7 @@ ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version; hck_subcap->hcksum_txflags = 0; - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; + mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap); } static void @@ -3325,42 +2881,22 @@ ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ - ill_dlpi_send(ill, nmp); + ill_capability_send(ill, nmp); } } static void -ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) +ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) { - mblk_t *mp; dl_capab_zerocopy_t *zerocopy_subcap; dl_capability_sub_t *dl_subcap; - int size; if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) return; ASSERT(ill->ill_zerocopy_capab != NULL); - /* - * Clear the capability flag for Zero-copy but retain the - * ill_zerocopy_capab structure since it's possible that another - * thread is still referring to it. The structure only gets - * deallocated when we destroy the ill. - */ - ill->ill_capabilities &= ~ILL_CAPAB_ZEROCOPY; - - size = sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); - - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_zerocopy_reset: unable to allocate " - "request to disable Zero-copy\n")); - return; - } - mp->b_wptr = mp->b_rptr + size; - - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; + dl_subcap = (dl_capability_sub_t *)mp->b_wptr; dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; dl_subcap->dl_length = sizeof (*zerocopy_subcap); @@ -3369,30 +2905,24 @@ ill_capability_zerocopy_reset(ill_t *ill, mblk_t **sc_mp) ill->ill_zerocopy_capab->ill_zerocopy_version; zerocopy_subcap->zerocopy_flags = 0; - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; + mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); } /* - * Process Large Segment Offload capability negotiation ack received from a - * DLS Provider. isub must point to the sub-capability (DL_CAPAB_LSO) of a - * DL_CAPABILITY_ACK message. 
+ * DLD capability + * Refer to dld.h for more information regarding the purpose and usage + * of this capability. */ static void -ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) +ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) { - mblk_t *nmp = NULL; - dl_capability_req_t *oc; - dl_capab_lso_t *lso_ic, *lso_oc; - ill_lso_capab_t **ill_lso_capab; - uint_t sub_dl_cap = isub->dl_cap; - uint8_t *capend; - - ASSERT(sub_dl_cap == DL_CAPAB_LSO); + dl_capab_dld_t *dld_ic, dld; + uint_t sub_dl_cap = isub->dl_cap; + uint8_t *capend; + ill_dld_capab_t *idc; - ill_lso_capab = (ill_lso_capab_t **)&ill->ill_lso_capab; + ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(sub_dl_cap == DL_CAPAB_DLD); /* * Note: range checks here are not absolutely sufficient to @@ -3403,165 +2933,395 @@ ill_capability_lso_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) */ capend = (uint8_t *)(isub + 1) + isub->dl_length; if (capend > mp->b_wptr) { - cmn_err(CE_WARN, "ill_capability_lso_ack: " + cmn_err(CE_WARN, "ill_capability_dld_ack: " "malformed sub-capability too long for mblk"); return; } - - lso_ic = (dl_capab_lso_t *)(isub + 1); - - if (lso_ic->lso_version != LSO_VERSION_1) { - cmn_err(CE_CONT, "ill_capability_lso_ack: " - "unsupported LSO sub-capability (version %d, expected %d)", - lso_ic->lso_version, LSO_VERSION_1); + dld_ic = (dl_capab_dld_t *)(isub + 1); + if (dld_ic->dld_version != DLD_CURRENT_VERSION) { + cmn_err(CE_CONT, "ill_capability_dld_ack: " + "unsupported DLD sub-capability (version %d, " + "expected %d)", dld_ic->dld_version, + DLD_CURRENT_VERSION); return; } - - if (!dlcapabcheckqid(&lso_ic->lso_mid, ill->ill_lmod_rq)) { - ip1dbg(("ill_capability_lso_ack: mid token for LSO " + if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { + ip1dbg(("ill_capability_dld_ack: mid token for dld " "capability isn't as expected; pass-thru module(s) " "detected, discarding capability\n")); return; } - if ((lso_ic->lso_flags & LSO_TX_ENABLE) && - (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4)) { - if (*ill_lso_capab == NULL) { - *ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), - KM_NOSLEEP); + /* + * Copy locally to ensure alignment. + */ + bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t)); - if (*ill_lso_capab == NULL) { - cmn_err(CE_WARN, "ill_capability_lso_ack: " - "could not enable LSO version %d " - "for %s (ENOMEM)\n", LSO_VERSION_1, - ill->ill_name); - return; - } + if ((idc = ill->ill_dld_capab) == NULL) { + idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP); + if (idc == NULL) { + cmn_err(CE_WARN, "ill_capability_dld_ack: " + "could not enable DLD version %d " + "for %s (ENOMEM)\n", DLD_CURRENT_VERSION, + ill->ill_name); + return; } + idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab; + idc->idc_capab_dh = (void *)dld.dld_capab_handle; + ill->ill_dld_capab = idc; + } + ip1dbg(("ill_capability_dld_ack: interface %s " + "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION)); - (*ill_lso_capab)->ill_lso_version = lso_ic->lso_version; - (*ill_lso_capab)->ill_lso_flags = lso_ic->lso_flags; - (*ill_lso_capab)->ill_lso_max = lso_ic->lso_max; - ill->ill_capabilities |= ILL_CAPAB_LSO; + ill_capability_dld_enable(ill); +} - ip1dbg(("ill_capability_lso_ack: interface %s " - "has enabled LSO\n ", ill->ill_name)); - } else if (lso_ic->lso_flags & LSO_TX_BASIC_TCP_IPV4) { - uint_t size; - uchar_t *rptr; +/* + * Typically capability negotiation between IP and the driver happens via + * DLPI message exchange. 
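+ * (A DL_CAPABILITY_REQ is sent down the stream and the driver answers
+ * with a DL_CAPABILITY_ACK; see ill_capability_probe() and
+ * ill_capability_ack() elsewhere in this file.)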
However GLD also offers a direct function call + * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities, + * But arbitrary function calls into IP or GLD are not permitted, since both + * of them are protected by their own perimeter mechanism. The perimeter can + * be viewed as a coarse lock or serialization mechanism. The hierarchy of + * these perimeters is IP -> MAC. Thus for example to enable the squeue + * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter + * to enter the mac perimeter and then do the direct function calls into + * GLD to enable squeue polling. The ring related callbacks from the mac into + * the stack to add, bind, quiesce, restart or cleanup a ring are all + * protected by the mac perimeter. + */ +static void +ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + int err; - size = sizeof (dl_capability_req_t) + - sizeof (dl_capability_sub_t) + sizeof (dl_capab_lso_t); + err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp, + DLD_ENABLE); + ASSERT(err == 0); +} - if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { - cmn_err(CE_WARN, "ill_capability_lso_ack: " +static void +ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + int err; + + err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph, + DLD_DISABLE); + ASSERT(err == 0); +} + +boolean_t +ill_mac_perim_held(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + + return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL, + DLD_QUERY)); +} + +static void +ill_capability_direct_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + ill_dld_direct_t *idd = &idc->idc_direct; + dld_capab_direct_t direct; + int rc; + + ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); + + bzero(&direct, sizeof (direct)); + direct.di_rx_cf = (uintptr_t)ip_input; + direct.di_rx_ch = ill; + + rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct, + DLD_ENABLE); + if (rc == 0) { + idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df; + idd->idd_tx_dh = direct.di_tx_dh; + idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; + idd->idd_tx_cb_dh = direct.di_tx_cb_dh; + /* + * One time registration of flow enable callback function + */ + ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh, + ill_flow_enable, ill); + ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT; + DTRACE_PROBE1(direct_on, (ill_t *), ill); + } else { + cmn_err(CE_WARN, "warning: could not enable DIRECT " + "capability, rc = %d\n", rc); + DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); + } +} + +static void +ill_capability_poll_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + dld_capab_poll_t poll; + int rc; + + ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); + + bzero(&poll, sizeof (poll)); + poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; + poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; + poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; + poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; + poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; + poll.poll_ring_ch = ill; + rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, + DLD_ENABLE); + if (rc == 0) { + ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; + DTRACE_PROBE1(poll_on, (ill_t *), ill); + } else { + ip1dbg(("warning: could not enable POLL " + "capability, rc = %d\n", rc)); + DTRACE_PROBE2(poll_off, (ill_t *), 
ill, (int), rc); + } +} + +/* + * Enable the LSO capability. + */ +static void +ill_capability_lso_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + dld_capab_lso_t lso; + int rc; + + ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); + + if (ill->ill_lso_capab == NULL) { + ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), + KM_NOSLEEP); + if (ill->ill_lso_capab == NULL) { + cmn_err(CE_WARN, "ill_capability_lso_enable: " "could not enable LSO for %s (ENOMEM)\n", ill->ill_name); return; } + } - rptr = nmp->b_rptr; - /* initialize dl_capability_req_t */ - oc = (dl_capability_req_t *)nmp->b_rptr; - oc->dl_sub_offset = sizeof (dl_capability_req_t); - oc->dl_sub_length = sizeof (dl_capability_sub_t) + - sizeof (dl_capab_lso_t); - nmp->b_rptr += sizeof (dl_capability_req_t); - - /* initialize dl_capability_sub_t */ - bcopy(isub, nmp->b_rptr, sizeof (*isub)); - nmp->b_rptr += sizeof (*isub); + bzero(&lso, sizeof (lso)); + if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, + DLD_ENABLE)) == 0) { + ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; + ill->ill_lso_capab->ill_lso_max = lso.lso_max; + ill->ill_capabilities |= ILL_CAPAB_DLD_LSO; + ip1dbg(("ill_capability_lso_enable: interface %s " + "has enabled LSO\n ", ill->ill_name)); + } else { + kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); + ill->ill_lso_capab = NULL; + DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); + } +} - /* initialize dl_capab_lso_t */ - lso_oc = (dl_capab_lso_t *)nmp->b_rptr; - bcopy(lso_ic, lso_oc, sizeof (*lso_ic)); +static void +ill_capability_dld_enable(ill_t *ill) +{ + mac_perim_handle_t mph; - nmp->b_rptr = rptr; - ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); + ASSERT(IAM_WRITER_ILL(ill)); - /* set ENABLE flag */ - lso_oc->lso_flags |= LSO_TX_ENABLE; + if (ill->ill_isv6) + return; - /* nmp points to a DL_CAPABILITY_REQ message to enable LSO */ - ill_dlpi_send(ill, nmp); - } else { - ip1dbg(("ill_capability_lso_ack: interface %s has " - "advertised %x LSO capability flags\n", - ill->ill_name, lso_ic->lso_flags)); + ill_mac_perim_enter(ill, &mph); + if (!ill->ill_isv6) { + ill_capability_direct_enable(ill); + ill_capability_poll_enable(ill); + ill_capability_lso_enable(ill); } + ill->ill_capabilities |= ILL_CAPAB_DLD; + ill_mac_perim_exit(ill, mph); } static void -ill_capability_lso_reset(ill_t *ill, mblk_t **sc_mp) +ill_capability_dld_disable(ill_t *ill) { - mblk_t *mp; - dl_capab_lso_t *lso_subcap; - dl_capability_sub_t *dl_subcap; - int size; + ill_dld_capab_t *idc; + ill_dld_direct_t *idd; + mac_perim_handle_t mph; - if (!(ill->ill_capabilities & ILL_CAPAB_LSO)) + ASSERT(IAM_WRITER_ILL(ill)); + + if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) return; - ASSERT(ill->ill_lso_capab != NULL); - /* - * Clear the capability flag for LSO but retain the - * ill_lso_capab structure since it's possible that another - * thread is still referring to it. The structure only gets - * deallocated when we destroy the ill. - */ - ill->ill_capabilities &= ~ILL_CAPAB_LSO; + ill_mac_perim_enter(ill, &mph); + + idc = ill->ill_dld_capab; + if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { + /* + * For performance we avoid locks in the transmit data path + * and don't maintain a count of the number of threads using + * direct calls. Thus some threads could be using direct + * transmit calls to GLD, even after the capability mechanism + * turns it off. This is still safe since the handles used in + * the direct calls continue to be valid until the unplumb is + * completed. 
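+		 * (Hence ILL_CAPAB_DLD_DIRECT is cleared under ill_lock
+		 * below, before GLD is told to disable the capability.)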
Remove the callback that was added (1-time) at + * capab enable time. + */ + mutex_enter(&ill->ill_lock); + ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT; + mutex_exit(&ill->ill_lock); + if (ill->ill_flownotify_mh != NULL) { + idd = &idc->idc_direct; + idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL, + ill->ill_flownotify_mh); + ill->ill_flownotify_mh = NULL; + } + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, + NULL, DLD_DISABLE); + } - size = sizeof (*dl_subcap) + sizeof (*lso_subcap); + if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) { + ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL; + ip_squeue_clean_all(ill); + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, + NULL, DLD_DISABLE); + } - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_lso_reset: unable to allocate " - "request to disable LSO\n")); - return; + if ((ill->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) { + ASSERT(ill->ill_lso_capab != NULL); + /* + * Clear the capability flag for LSO but retain the + * ill_lso_capab structure since it's possible that another + * thread is still referring to it. The structure only gets + * deallocated when we destroy the ill. + */ + + ill->ill_capabilities &= ~ILL_CAPAB_DLD_LSO; + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, + NULL, DLD_DISABLE); } - mp->b_wptr = mp->b_rptr + size; + ill->ill_capabilities &= ~ILL_CAPAB_DLD; + ill_mac_perim_exit(ill, mph); +} - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; - dl_subcap->dl_cap = DL_CAPAB_LSO; - dl_subcap->dl_length = sizeof (*lso_subcap); +/* + * Capability Negotiation protocol + * + * We don't wait for DLPI capability operations to finish during interface + * bringup or teardown. Doing so would introduce more asynchrony and the + * interface up/down operations will need multiple return and restarts. + * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as + * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next + * exclusive operation won't start until the DLPI operations of the previous + * exclusive operation complete. + * + * The capability state machine is shown below. + * + * state next state event, action + * + * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe + * IDCS_PROBE_SENT IDCS_OK ill_capability_ack + * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack) + * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG + * IDCS_OK IDCS_RESET_SENT ill_capability_reset + * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr + * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr -> + * ill_capability_probe. + */ + +/* + * Dedicated thread started from ip_stack_init that handles capability + * disable. This thread ensures the taskq dispatch does not fail by waiting + * for resources using TQ_SLEEP. The taskq mechanism is used to ensure + * that direct calls to DLD are done in a cv_waitable context. 
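+ *
+ * Roughly, an ack reaches ill_capability_ack_thr() one of two ways:
+ *
+ *	ill_capability_ack() -> taskq_dispatch(TQ_NOSLEEP)	usual case
+ *
+ *	ill_capability_ack() -> list_insert_tail(ips_capab_taskq_list),
+ *	cv_signal() -> ill_taskq_dispatch() -> taskq_dispatch(TQ_SLEEP)
+ *	when the TQ_NOSLEEP dispatch fails under memory pressure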
+ */ +void +ill_taskq_dispatch(ip_stack_t *ipst) +{ + callb_cpr_t cprinfo; + char name[64]; + mblk_t *mp; - lso_subcap = (dl_capab_lso_t *)(dl_subcap + 1); - lso_subcap->lso_version = ill->ill_lso_capab->ill_lso_version; - lso_subcap->lso_flags = 0; + (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d", + ipst->ips_netstack->netstack_stackid); + CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr, + name); + mutex_enter(&ipst->ips_capab_taskq_lock); - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; + for (;;) { + mp = list_head(&ipst->ips_capab_taskq_list); + while (mp != NULL) { + list_remove(&ipst->ips_capab_taskq_list, mp); + mutex_exit(&ipst->ips_capab_taskq_lock); + VERIFY(taskq_dispatch(system_taskq, + ill_capability_ack_thr, mp, TQ_SLEEP) != 0); + mutex_enter(&ipst->ips_capab_taskq_lock); + mp = list_head(&ipst->ips_capab_taskq_list); + } + + if (ipst->ips_capab_taskq_quit) + break; + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock); + CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock); + } + VERIFY(list_head(&ipst->ips_capab_taskq_list) == NULL); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); } /* * Consume a new-style hardware capabilities negotiation ack. - * Called from ip_rput_dlpi_writer(). + * Called via taskq on receipt of DL_CAPABBILITY_ACK. */ -void -ill_capability_ack(ill_t *ill, mblk_t *mp) +static void +ill_capability_ack_thr(void *arg) { + mblk_t *mp = arg; dl_capability_ack_t *capp; dl_capability_sub_t *subp, *endp; + ill_t *ill; + boolean_t reneg; - if (ill->ill_dlpi_capab_state == IDS_INPROGRESS) - ill->ill_dlpi_capab_state = IDS_OK; + ill = (ill_t *)mp->b_prev; + VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE); + + if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT || + ill->ill_dlpi_capab_state == IDCS_RENEG) { + /* + * We have received the ack for our DL_CAPAB reset request. + * There isnt' anything in the message that needs processing. + * All message based capabilities have been disabled, now + * do the function call based capability disable. + */ + reneg = ill->ill_dlpi_capab_state == IDCS_RENEG; + ill_capability_dld_disable(ill); + ill->ill_dlpi_capab_state = IDCS_UNKNOWN; + if (reneg) + ill_capability_probe(ill); + goto done; + } + + if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT) + ill->ill_dlpi_capab_state = IDCS_OK; capp = (dl_capability_ack_t *)mp->b_rptr; - if (capp->dl_sub_length == 0) + if (capp->dl_sub_length == 0) { /* no new-style capabilities */ - return; + goto done; + } /* make sure the driver supplied correct dl_sub_length */ if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) { ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, " "invalid dl_sub_length (%d)\n", capp->dl_sub_length)); - return; + goto done; } + #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset)) /* * There are sub-capabilities. Process the ones we know about. @@ -3582,6 +3342,34 @@ ill_capability_ack(ill_t *ill, mblk_t *mp) } } #undef SC +done: + inet_freemsg(mp); + ill_capability_done(ill); + ipsq_exit(ill->ill_phyint->phyint_ipsq); +} + +/* + * This needs to be started in a taskq thread to provide a cv_waitable + * context. + */ +void +ill_capability_ack(ill_t *ill, mblk_t *mp) +{ + ip_stack_t *ipst = ill->ill_ipst; + + mp->b_prev = (mblk_t *)ill; + if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp, + TQ_NOSLEEP) != 0) + return; + + /* + * The taskq dispatch failed. 
Signal the ill_taskq_dispatch thread + * which will do the dispatch using TQ_SLEEP to guarantee success. + */ + mutex_enter(&ipst->ips_capab_taskq_lock); + list_insert_tail(&ipst->ips_capab_taskq_list, mp); + cv_signal(&ipst->ips_capab_taskq_cv); + mutex_exit(&ipst->ips_capab_taskq_lock); } /* @@ -7609,7 +7397,7 @@ ipsq_dq(ipsq_t *ipsq) */ #define ENTER_SQ_WAIT_TICKS 100 boolean_t -ipsq_enter(ill_t *ill, boolean_t force) +ipsq_enter(ill_t *ill, boolean_t force, int type) { ipsq_t *ipsq; boolean_t waited_enough = B_FALSE; @@ -7630,7 +7418,8 @@ ipsq_enter(ill_t *ill, boolean_t force) ipsq = ill->ill_phyint->phyint_ipsq; mutex_enter(&ipsq->ipsq_lock); if (ipsq->ipsq_writer == NULL && - (ipsq->ipsq_current_ipif == NULL || waited_enough)) { + (type == CUR_OP || ipsq->ipsq_current_ipif == NULL || + waited_enough)) { break; } else if (ipsq->ipsq_writer != NULL) { mutex_exit(&ipsq->ipsq_lock); @@ -7661,6 +7450,18 @@ ipsq_enter(ill_t *ill, boolean_t force) return (B_TRUE); } +boolean_t +ill_perim_enter(ill_t *ill) +{ + return (ipsq_enter(ill, B_FALSE, CUR_OP)); +} + +void +ill_perim_exit(ill_t *ill) +{ + ipsq_exit(ill->ill_phyint->phyint_ipsq); +} + /* * The ipsq_t (ipsq) is the synchronization data structure used to serialize * certain critical operations like plumbing (i.e. most set ioctls), @@ -9984,6 +9785,13 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, ill->ill_ip_muxid = islink ? li->l_index : 0; /* + * Mark the ipsq busy until the capability operations initiated below + * complete. The PLINK/UNLINK ioctl itself completes when our caller + * returns, but the capability operation may complete asynchronously + * much later. + */ + ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); + /* * If there's at least one up ipif on this ill, then we're bound to * the underlying driver via DLPI. In that case, renegotiate * capabilities to account for any possible change in modules @@ -9993,8 +9801,9 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, if (islink) ill_capability_probe(ill); else - ill_capability_reset(ill); + ill_capability_reset(ill, B_FALSE); } + ipsq_current_finish(ipsq); if (entered_ipsq) ipsq_exit(ipsq); @@ -18244,19 +18053,19 @@ ill_dl_down(ill_t *ill) ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS; mutex_exit(&ill->ill_lock); /* - * Reset the capabilities if the negotiation is done or is - * still in progress. Note that ill_capability_reset() will - * set ill_dlpi_capab_state to IDS_UNKNOWN, so the subsequent - * DL_CAPABILITY_ACK and DL_NOTE_CAPAB_RENEG will be ignored. - * - * Further, reset ill_capab_reneg to be B_FALSE so that the - * subsequent DL_CAPABILITY_ACK can be ignored, to prevent - * the capabilities renegotiation from happening. + * ip_rput does not pass up normal (M_PROTO) DLPI messages + * after ILL_CONDEMNED is set. So in the unplumb case, we call + * ill_capability_dld_disable disable rightaway. If this is not + * an unplumb operation then the disable happens on receipt of + * the capab ack via ip_rput_dlpi_writer -> + * ill_capability_ack_thr. In both cases the order of + * the operations seen by DLD is capability disable followed + * by DL_UNBIND. Also the DLD capability disable needs a + * cv_wait'able context. 
*/ - if (ill->ill_dlpi_capab_state != IDS_UNKNOWN) - ill_capability_reset(ill); - ill->ill_capab_reneg = B_FALSE; - + if (ill->ill_state_flags & ILL_CONDEMNED) + ill_capability_dld_disable(ill); + ill_capability_reset(ill, B_FALSE); ill_dlpi_send(ill, mp); } @@ -18314,7 +18123,6 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) ill->ill_dlpi_pending = prim; } mutex_exit(&ill->ill_lock); - putnext(ill->ill_wq, mp); } @@ -18372,6 +18180,26 @@ ill_dlpi_send(ill_t *ill, mblk_t *mp) ill_dlpi_dispatch(ill, mp); } +static void +ill_capability_send(ill_t *ill, mblk_t *mp) +{ + ill->ill_capab_pending_cnt++; + ill_dlpi_send(ill, mp); +} + +void +ill_capability_done(ill_t *ill) +{ + ASSERT(ill->ill_capab_pending_cnt != 0); + + ill_dlpi_done(ill, DL_CAPABILITY_REQ); + + ill->ill_capab_pending_cnt--; + if (ill->ill_capab_pending_cnt == 0 && + ill->ill_dlpi_capab_state == IDCS_OK) + ill_capability_reset_alloc(ill); +} + /* * Send all deferred DLPI messages without waiting for their ACKs. */ diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c index 2e940057f0..405cb653d5 100644 --- a/usr/src/uts/common/inet/ip/ip_ire.c +++ b/usr/src/uts/common/inet/ip/ip_ire.c @@ -4277,6 +4277,37 @@ ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, return (NULL); } +ire_t * +ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) +{ + irb_t *irb_ptr; + ire_t *ire; + + /* + * Lets look for an ire in the cachetable whose + * ire_addr matches the destination. + * Since we are being called by forwarding fastpath + * no need to check for Trusted Solaris label. + */ + irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( + dst, ipst->ips_ip_cache_table_size)]; + rw_enter(&irb_ptr->irb_lock, RW_READER); + for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { + if (ire->ire_marks & (IRE_MARK_CONDEMNED | + IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) { + continue; + } + if (ire->ire_addr == dst) { + IRE_REFHOLD(ire); + rw_exit(&irb_ptr->irb_lock); + return (ire); + } + } + rw_exit(&irb_ptr->irb_lock); + return (NULL); +} + + /* * Locate the interface ire that is tied to the cache ire 'cire' via * cire->ire_ihandle. diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c index 34fd3cd765..ac14adf00d 100644 --- a/usr/src/uts/common/inet/ip/ip_mroute.c +++ b/usr/src/uts/common/inet/ip/ip_mroute.c @@ -28,8 +28,6 @@ */ /* Copyright (c) 1990 Mentat Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Procedures for the kernel part of DVMRP, * a Distance-Vector Multicast Routing Protocol. @@ -683,7 +681,7 @@ ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) vifp->v_marks &= ~VIF_MARK_GOOD; vifp->v_marks |= VIF_MARK_CONDEMNED; mutex_exit(&(vifp)->v_lock); - suc = ipsq_enter(ill, B_FALSE); + suc = ipsq_enter(ill, B_FALSE, NEW_OP); ipsq = ill->ill_phyint->phyint_ipsq; } else { ipsq = ipsq_try_enter(ipif, NULL, diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c index 7a036a34d9..f3c95ae362 100644 --- a/usr/src/uts/common/inet/ip/ip_multi.c +++ b/usr/src/uts/common/inet/ip/ip_multi.c @@ -1201,7 +1201,7 @@ ipsq_enter_byifindex(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) return (NULL); } ill_refrele(ill); - in_ipsq = ipsq_enter(ill, B_FALSE); + in_ipsq = ipsq_enter(ill, B_FALSE, NEW_OP); ill_waiter_dcr(ill); if (!in_ipsq) ill = NULL; @@ -3912,7 +3912,7 @@ retry: * be refheld for cleanup by those routines and it would be * a mutual deadlock. 
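 * (The third argument distinguishes NEW_OP, which starts a fresh
 * exclusive operation, from CUR_OP, which joins the operation already
 * current on the ipsq, as ill_capability_ack_thr() does.)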
*/ - success = ipsq_enter(ill, B_FALSE); + success = ipsq_enter(ill, B_FALSE, NEW_OP); ipsq = ill->ill_phyint->phyint_ipsq; ill_waiter_dcr(ill); mutex_enter(&connp->conn_lock); diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c index a34b55693e..53665593be 100644 --- a/usr/src/uts/common/inet/ip/ip_netinfo.c +++ b/usr/src/uts/common/inet/ip/ip_netinfo.c @@ -1546,7 +1546,7 @@ ip_ni_queue_func_impl(injection_t *inject, boolean_t out) if (inject->inj_isv6) { ip_rput_v6(ill->ill_rq, packet->ni_packet); } else { - ip_input(ill, NULL, packet->ni_packet, 0); + ip_input(ill, NULL, packet->ni_packet, NULL); } kmem_free(inject, sizeof (*inject)); ill_refrele(ill); diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index c2b22ab956..9d677c3157 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -26,16 +26,36 @@ /* * IP interface to squeues. * - * IP creates an squeue instance for each CPU. The squeue pointer is saved in - * cpu_squeue field of the cpu structure. Each squeue is associated with a - * connection instance (conn_t). + * IP uses squeues to force serialization of packets, both incoming and + * outgoing. Each squeue is associated with a connection instance (conn_t) + * above, and a soft ring (if enabled) below. Each CPU will have a default + * squeue for outbound connections, and each soft ring of an interface will + * have an squeue to which it sends incoming packets. squeues are never + * destroyed, and if they become unused they are kept around against future + * needs. * - * For CPUs available at system startup time the squeue creation and association - * with CPU happens at MP initialization time. For CPUs added during dynamic - * reconfiguration, the initialization happens when the new CPU is configured in - * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either - * return per-CPU squeue or random squeue based on the ip_squeue_fanout - * variable. + * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU + * in the system there will be one squeue set, all of whose squeues will be + * bound to that CPU, plus one additional set known as the unbound set. Sets + * associated with CPUs will have one default squeue, for outbound + * connections, and a linked list of squeues used by various NICs for inbound + * packets. The unbound set also has a linked list of squeues, but no default + * squeue. + * + * When a CPU goes offline its squeue set is destroyed, and all its squeues + * are moved to the unbound set. When a CPU comes online, a new squeue set is + * created and the default set is searched for a default squeue formerly bound + * to this CPU. If no default squeue is found, a new one is created. + * + * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP + * and not the squeue code. squeue.c will not touch them, and we can modify + * them without holding the squeue lock because of the guarantee that squeues + * are never destroyed. ip_squeue locks must be held, however. + * + * All the squeue sets are protected by a single lock, the sqset_lock. This + * is also used to protect the sq_next and sq_set fields of an squeue_t. + * + * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock * * There are two modes of associating connection with squeues. 
The first mode * associates each connection with the CPU that creates the connection (either @@ -50,18 +70,13 @@ * may process the connection on whatever CPU it is scheduled. The squeue to CPU * binding is only relevant for the worker thread. * - * The list of all created squeues is kept in squeue_set structure. This list is - * used when ip_squeue_fanout is set and the load is distributed across all - * squeues. - * * INTERFACE: * - * squeue_t *ip_squeue_get(hint) + * squeue_t *ip_squeue_get(ill_rx_ring_t) * - * Find an squeue based on the 'hint' value. The hint is used as an index - * in the array of IP squeues available. The way hint is computed may - * affect the effectiveness of the squeue distribution. Currently squeues - * are assigned in round-robin fashion using lbolt as a hint. + * Returns the squeue associated with an ill receive ring. If the ring is + * not bound to a CPU, and we're currently servicing the interrupt which + * generated the packet, then bind the squeue to CPU. * * * DR Notes @@ -78,36 +93,31 @@ * o When the CPU is going online, it creates a new squeue for this CPU if * necessary and binds the squeue worker thread to this CPU. * - * TUNEBALES: - * - * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU - * associated with an squeue instance. + * TUNABLES: * - * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c - * should be compiled with SQUEUE_PROFILE enabled for this variable to have - * an impact. + * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then + * pick the default squeue from a random CPU, otherwise use our CPU's default + * squeue. * - * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue, - * otherwise get it from CPU->cpu_squeue. + * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or + * /dev/ip. * - * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and - * changed using ndd on /dev/tcp or /dev/ip. - * - * ip_squeue_worker_wait: global value for the sq_wait field for all squeues - * created. This is the time squeue code waits before waking up the worker - * thread after queuing a request. + * ip_squeue_worker_wait: global value for the sq_wait field for all squeues * + * created. This is the time squeue code waits before waking up the worker + * thread after queuing a request. */ #include <sys/types.h> #include <sys/debug.h> #include <sys/kmem.h> #include <sys/cpuvar.h> - #include <sys/cmn_err.h> #include <inet/common.h> #include <inet/ip.h> +#include <netinet/ip6.h> #include <inet/ip_if.h> +#include <inet/ip_ire.h> #include <inet/nd.h> #include <inet/ipclassifier.h> #include <sys/types.h> @@ -115,31 +125,21 @@ #include <sys/sunddi.h> #include <sys/dlpi.h> #include <sys/squeue_impl.h> +#include <sys/tihdr.h> +#include <inet/udp_impl.h> +#include <sys/strsubr.h> +#include <sys/zone.h> +#include <sys/dld.h> #include <sys/atomic.h> /* - * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1 - * mapping between squeue and NIC (or Rx ring) for performance reasons so - * each squeue can uniquely own a NIC or a Rx ring and do polling - * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU. - * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues - * can be created dynamically as needed. + * List of all created squeue sets. The list and its size are protected by + * sqset_lock. 
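+ * (Slot 0 is the unbound set, created once by ip_squeue_set_create(-1)
+ * from ip_squeue_init(); the per-CPU sets follow it.)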
*/ -#define MAX_SQUEUES_PER_CPU 32 -#define MIN_SQUEUES_PER_CPU 1 -uint_t ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU; +static squeue_set_t **sqset_global_list; /* list 0 is the unbound list */ +static uint_t sqset_global_size; +kmutex_t sqset_lock; -#define IP_NUM_SOFT_RINGS 2 -uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS; - -/* - * List of all created squeue sets. The size is protected by cpu_lock - */ -squeue_set_t **sqset_global_list; -uint_t sqset_global_size; - -int ip_squeue_bind = B_TRUE; -int ip_squeue_profile = B_TRUE; static void (*ip_squeue_create_callback)(squeue_t *) = NULL; /* @@ -149,82 +149,153 @@ static void (*ip_squeue_create_callback)(squeue_t *) = NULL; */ uint_t ip_squeue_worker_wait = 10; -static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t); +static squeue_t *ip_squeue_create(pri_t); +static squeue_set_t *ip_squeue_set_create(processorid_t); static int ip_squeue_cpu_setup(cpu_setup_t, int, void *); - -static void ip_squeue_set_bind(squeue_set_t *); -static void ip_squeue_set_unbind(squeue_set_t *); -static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t); +static void ip_squeue_set_move(squeue_t *, squeue_set_t *); +static void ip_squeue_set_destroy(cpu_t *); static void ip_squeue_clean(void *, mblk_t *, void *); -static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *); #define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS)) +static squeue_t * +ip_squeue_create(pri_t pri) +{ + squeue_t *sqp; + + sqp = squeue_create(ip_squeue_worker_wait, pri); + ASSERT(sqp != NULL); + if (ip_squeue_create_callback != NULL) + ip_squeue_create_callback(sqp); + return (sqp); +} + /* - * Create squeue set containing ip_squeues_per_cpu number of squeues - * for this CPU and bind them all to the CPU. + * Create a new squeue_set. If id == -1, then we're creating the unbound set, + * which should only happen once when we are first initialized. Otherwise id + * is the id of the CPU that needs a set, either because we are initializing + * or because the CPU has come online. + * + * If id != -1, then we need at a minimum to provide a default squeue for the + * new set. We search the unbound set for candidates, and if none are found we + * create a new one. */ static squeue_set_t * -ip_squeue_set_create(cpu_t *cp, boolean_t reuse) +ip_squeue_set_create(processorid_t id) { - int i; squeue_set_t *sqs; - squeue_t *sqp; - char sqname[64]; - processorid_t id = cp->cpu_id; + squeue_set_t *src = sqset_global_list[0]; + squeue_t **lastsqp, *sq; + squeue_t **defaultq_lastp = NULL; + + sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP); + sqs->sqs_cpuid = id; + + if (id == -1) { + ASSERT(sqset_global_size == 0); + sqset_global_list[0] = sqs; + sqset_global_size = 1; + return (sqs); + } - if (reuse) { - int i; + /* + * When we create an squeue set id != -1, we need to give it a + * default squeue, in order to support fanout of conns across + * CPUs. Try to find a former default squeue that matches this + * cpu id on the unbound squeue set. If no such squeue is found, + * find some non-default TCP squeue and steal it. If still no such + * candidate is found, create a new squeue. + */ - /* - * We may already have an squeue created for this CPU. Try to - * find one and reuse it if possible. 
- */ - for (i = 0; i < sqset_global_size; i++) { - sqs = sqset_global_list[i]; - if (id == sqs->sqs_bind) - return (sqs); + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&sqset_lock); + lastsqp = &src->sqs_head; + + while (*lastsqp) { + if ((*lastsqp)->sq_bind == id && + (*lastsqp)->sq_state & SQS_DEFAULT) { + defaultq_lastp = lastsqp; + break; + } + if (defaultq_lastp == NULL && + !((*lastsqp)->sq_state & SQS_DEFAULT)) { + defaultq_lastp = lastsqp; } + lastsqp = &(*lastsqp)->sq_next; + + } + if (defaultq_lastp) { + /* Remove from src set and set SQS_DEFAULT */ + sq = *defaultq_lastp; + *defaultq_lastp = sq->sq_next; + sq->sq_next = NULL; + if (!(sq->sq_state & SQS_DEFAULT)) { + mutex_enter(&sq->sq_lock); + sq->sq_state |= SQS_DEFAULT; + mutex_exit(&sq->sq_lock); + } + } else { + sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY); + sq->sq_state |= SQS_DEFAULT; } - sqs = kmem_zalloc(sizeof (squeue_set_t) + - (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP); - mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL); - sqs->sqs_list = (squeue_t **)&sqs[1]; - sqs->sqs_max_size = MAX_SQUEUES_PER_CPU; - sqs->sqs_bind = id; + sq->sq_set = sqs; + sqs->sqs_default = sq; + squeue_bind(sq, id); /* this locks squeue mutex */ - for (i = 0; i < ip_squeues_per_cpu; i++) { - bzero(sqname, sizeof (sqname)); + ASSERT(sqset_global_size <= NCPU); + sqset_global_list[sqset_global_size++] = sqs; + mutex_exit(&sqset_lock); + return (sqs); +} - (void) snprintf(sqname, sizeof (sqname), - "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid, - cp->cpu_id, i); +/* + * Called by ill_ring_add() to find an squeue to associate with a new ring. + */ - sqp = squeue_create(sqname, id, ip_squeue_worker_wait, - minclsyspri); +squeue_t * +ip_squeue_getfree(pri_t pri) +{ + squeue_set_t *sqs = sqset_global_list[0]; + squeue_t *sq; + mutex_enter(&sqset_lock); + for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) { /* - * The first squeue in each squeue_set is the DEFAULT - * squeue. 
+ * Select a non-default squeue */ - sqp->sq_state |= SQS_DEFAULT; + if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND))) + break; + } - ASSERT(sqp != NULL); + if (sq == NULL) { + sq = ip_squeue_create(pri); + sq->sq_set = sqs; + sq->sq_next = sqs->sqs_head; + sqs->sqs_head = sq; + } - squeue_profile_enable(sqp); - sqs->sqs_list[sqs->sqs_size++] = sqp; + ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL | + SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE | + SQS_POLL_THR_QUIESCED))); - if (ip_squeue_create_callback != NULL) - ip_squeue_create_callback(sqp); - } + mutex_enter(&sq->sq_lock); + sq->sq_state |= SQS_ILL_BOUND; + mutex_exit(&sq->sq_lock); + mutex_exit(&sqset_lock); - if (ip_squeue_bind && cpu_is_online(cp)) - ip_squeue_set_bind(sqs); + if (sq->sq_priority != pri) { + thread_lock(sq->sq_worker); + (void) thread_change_pri(sq->sq_worker, pri, 0); + thread_unlock(sq->sq_worker); - sqset_global_list[sqset_global_size++] = sqs; - ASSERT(sqset_global_size <= NCPU); - return (sqs); + thread_lock(sq->sq_poll_thr); + (void) thread_change_pri(sq->sq_poll_thr, pri, 0); + thread_unlock(sq->sq_poll_thr); + + sq->sq_priority = pri; + } + return (sq); } /* @@ -234,876 +305,450 @@ void ip_squeue_init(void (*callback)(squeue_t *)) { int i; + squeue_set_t *sqs; ASSERT(sqset_global_list == NULL); - if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU) - ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU; - else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU) - ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU; - ip_squeue_create_callback = callback; squeue_init(); + mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL); sqset_global_list = - kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP); + kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP); sqset_global_size = 0; - mutex_enter(&cpu_lock); + /* + * We are called at system boot time and we don't + * expect memory allocation failure. + */ + sqs = ip_squeue_set_create(-1); + ASSERT(sqs != NULL); + mutex_enter(&cpu_lock); /* Create squeue for each active CPU available */ for (i = 0; i < NCPU; i++) { - cpu_t *cp = cpu[i]; + cpu_t *cp = cpu_get(i); if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) { - cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE); + /* + * We are called at system boot time and we don't + * expect memory allocation failure then + */ + cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id); + ASSERT(cp->cpu_squeue_set != NULL); } } register_cpu_setup_func(ip_squeue_cpu_setup, NULL); - mutex_exit(&cpu_lock); - - if (ip_squeue_profile) - squeue_profile_start(); } /* - * Get squeue_t structure based on index. - * Since the squeue list can only grow, no need to grab any lock. + * Get a default squeue, either from the current CPU or a CPU derived by hash + * from the index argument, depending upon the setting of ip_squeue_fanout. */ squeue_t * ip_squeue_random(uint_t index) { - squeue_set_t *sqs; - - sqs = sqset_global_list[index % sqset_global_size]; - return (sqs->sqs_list[index % sqs->sqs_size]); -} - -/* ARGSUSED */ -static void -ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2) -{ - squeue_t *sqp = arg2; - ill_rx_ring_t *ring = (ill_rx_ring_t *)mp->b_wptr; - ill_t *ill; - - ASSERT(sqp != NULL); - mp->b_wptr = NULL; - - if (ring == NULL) { - return; - } + squeue_set_t *sqs = NULL; + squeue_t *sq; /* - * Clean up squeue + * The minimum value of sqset_global_size is 2, one for the unbound + * squeue set and another for the squeue set of the zeroth CPU. 
+ * Even though the value could be changing, it can never go below 2, + * so the assert does not need the lock protection. */ - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB); - sqp->sq_rx_ring = NULL; - mutex_exit(&sqp->sq_lock); + ASSERT(sqset_global_size > 1); - ill = ring->rr_ill; - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { - ASSERT(ring->rr_handle != NULL); - ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle); - } + /* Protect against changes to sqset_global_list */ + mutex_enter(&sqset_lock); - /* - * Cleanup the ring - */ - - ring->rr_blank = NULL; - ring->rr_handle = NULL; - ring->rr_sqp = NULL; + if (!ip_squeue_fanout) + sqs = CPU->cpu_squeue_set; /* - * Signal ill that cleanup is done + * sqset_global_list[0] corresponds to the unbound squeue set. + * The computation below picks a set other than the unbound set. */ - mutex_enter(&ill->ill_lock); - ring->rr_ring_state = ILL_RING_FREE; - cv_signal(&ill->ill_cv); - mutex_exit(&ill->ill_lock); + if (sqs == NULL) + sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1]; + sq = sqs->sqs_default; + + mutex_exit(&sqset_lock); + ASSERT(sq); + return (sq); } /* - * Clean up one squeue element. ill_inuse_ref is protected by ill_lock. - * The real cleanup happens behind the squeue via ip_squeue_clean function but - * we need to protect ourselves from 2 threads trying to cleanup at the same - * time (possible with one port going down for aggr and someone tearing down the - * entire aggr simultaneously). So we use ill_inuse_ref protected by ill_lock - * to indicate when the cleanup has started (1 ref) and when the cleanup - * is done (0 ref). When a new ring gets assigned to squeue, we start by - * putting 2 ref on ill_inuse_ref. + * Move squeue from its current set to newset. Not used for default squeues. + * Bind or unbind the worker thread as appropriate. */ + static void -ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) +ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset) { - conn_t *connp; - squeue_t *sqp; - mblk_t *mp; - - ASSERT(rx_ring != NULL); + squeue_set_t *set; + squeue_t **lastsqp; + processorid_t cpuid = newset->sqs_cpuid; - /* Just clean one squeue */ - mutex_enter(&ill->ill_lock); - /* - * Reset the ILL_SOFT_RING_ASSIGN bit so that - * ip_squeue_soft_ring_affinty() will not go - * ahead with assigning rings. - */ - ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; - while (rx_ring->rr_ring_state == ILL_RING_INPROC) - /* Some operations pending on the ring. Wait */ - cv_wait(&ill->ill_cv, &ill->ill_lock); - - if (rx_ring->rr_ring_state != ILL_RING_INUSE) { - /* - * Someone already trying to clean - * this squeue or it's already been cleaned. - */ - mutex_exit(&ill->ill_lock); - return; - } - sqp = rx_ring->rr_sqp; + ASSERT(!(sq->sq_state & SQS_DEFAULT)); + ASSERT(!MUTEX_HELD(&sq->sq_lock)); + ASSERT(MUTEX_HELD(&sqset_lock)); - if (sqp == NULL) { - /* - * The rx_ring never had a squeue assigned to it. - * We are under ill_lock so we can clean it up - * here itself since no one can get to it. 
- */ - rx_ring->rr_blank = NULL; - rx_ring->rr_handle = NULL; - rx_ring->rr_sqp = NULL; - rx_ring->rr_ring_state = ILL_RING_FREE; - mutex_exit(&ill->ill_lock); + set = sq->sq_set; + if (set == newset) return; - } - - /* Indicate that it's being cleaned */ - rx_ring->rr_ring_state = ILL_RING_BEING_FREED; - ASSERT(sqp != NULL); - mutex_exit(&ill->ill_lock); - /* - * Use the preallocated ill_unbind_conn for this purpose - */ - connp = ill->ill_dls_capab->ill_unbind_conn; - - if (connp->conn_tcp->tcp_closemp.b_prev == NULL) { - connp->conn_tcp->tcp_closemp_used = B_TRUE; - } else { - cmn_err(CE_PANIC, "ip_squeue_clean_ring: " - "concurrent use of tcp_closemp_used: connp %p tcp %p\n", - (void *)connp, (void *)connp->conn_tcp); - } - - TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15); - mp = &connp->conn_tcp->tcp_closemp; - CONN_INC_REF(connp); - - /* - * Since the field sq_rx_ring for default squeue is NULL, - * ip_squeue_clean() will have no way to get the ring if we - * don't pass the pointer to it. We use b_wptr to do so - * as use of b_wptr for any other purpose is not expected. - */ - - ASSERT(mp->b_wptr == NULL); - mp->b_wptr = (unsigned char *)rx_ring; - squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL); - - mutex_enter(&ill->ill_lock); - while (rx_ring->rr_ring_state != ILL_RING_FREE) - cv_wait(&ill->ill_cv, &ill->ill_lock); - mutex_exit(&ill->ill_lock); + lastsqp = &set->sqs_head; + while (*lastsqp != sq) + lastsqp = &(*lastsqp)->sq_next; + + *lastsqp = sq->sq_next; + sq->sq_next = newset->sqs_head; + newset->sqs_head = sq; + sq->sq_set = newset; + if (cpuid == -1) + squeue_unbind(sq); + else + squeue_bind(sq, cpuid); } -void -ip_squeue_clean_all(ill_t *ill) +/* + * Move squeue from its current set to cpuid's set and bind to cpuid. + */ + +int +ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid) { - int idx; + cpu_t *cpu; + squeue_set_t *set; - /* - * No need to clean if poll_capab isn't set for this ill - */ - if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))) - return; + if (sq->sq_state & SQS_DEFAULT) + return (-1); - for (idx = 0; idx < ILL_MAX_RINGS; idx++) { - ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx]; + ASSERT(MUTEX_HELD(&cpu_lock)); - ip_squeue_clean_ring(ill, ipr); - } + cpu = cpu_get(cpuid); + if (!CPU_ISON(cpu)) + return (-1); - ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING); + mutex_enter(&sqset_lock); + set = cpu->cpu_squeue_set; + if (set != NULL) + ip_squeue_set_move(sq, set); + mutex_exit(&sqset_lock); + return ((set == NULL) ? -1 : 0); } -typedef struct ip_taskq_arg { - ill_t *ip_taskq_ill; - ill_rx_ring_t *ip_taskq_ill_rx_ring; - cpu_t *ip_taskq_cpu; -} ip_taskq_arg_t; - /* - * Do a Rx ring to squeue binding. Find a unique squeue that is not - * managing a receive ring. If no such squeue exists, dynamically - * create a new one in the squeue set. - * - * The function runs via the system taskq. The ill passed as an - * argument can't go away since we hold a ref. The lock order is - * ill_lock -> sqs_lock -> sq_lock. - * - * If we are binding a Rx ring to a squeue attached to the offline CPU, - * no need to check that because squeues are never destroyed once - * created. + * The mac layer is calling, asking us to move an squeue to a + * new CPU. This routine is called with cpu_lock held. 
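+ *
+ * A minimal sketch of the expected calling pattern (illustrative only;
+ * it mirrors the call made from ip_squeue_add_ring() below, where the
+ * cpuid comes from the mac layer):
+ *
+ *	mutex_enter(&cpu_lock);
+ *	(void) ip_squeue_bind_ring(ill, rx_ring, cpuid);
+ *	mutex_exit(&cpu_lock);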
*/ -/* ARGSUSED */ -static void -ip_squeue_extend(void *arg) +void +ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid) { - ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg; - ill_t *ill = sq_arg->ip_taskq_ill; - ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring; - cpu_t *intr_cpu = sq_arg->ip_taskq_cpu; - squeue_set_t *sqs; - squeue_t *sqp = NULL; - - ASSERT(ill != NULL); - ASSERT(ill_rx_ring != NULL); - kmem_free(arg, sizeof (ip_taskq_arg_t)); + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(rx_ring->rr_ill == ill); - /* - * Make sure the CPU that originally took the interrupt still - * exists. - */ - if (!CPU_ISON(intr_cpu)) - intr_cpu = CPU; - - sqs = intr_cpu->cpu_squeue_set; - - /* - * If this ill represents link aggregation, then there might be - * multiple NICs trying to register them selves at the same time - * and in order to ensure that test and assignment of free rings - * is sequential, we need to hold the ill_lock. - */ mutex_enter(&ill->ill_lock); - sqp = ip_find_unused_squeue(sqs, B_FALSE); - if (sqp == NULL) { - /* - * We hit the max limit of squeues allowed per CPU. - * Assign this rx_ring to DEFAULT squeue of the - * interrupted CPU but the squeue will not manage - * the ring. Also print a warning. - */ - cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already " - "has max number of squeues. System performance might " - "become suboptimal\n", sqs->sqs_bind, (void *)sqs); - - /* the first squeue in the list is the default squeue */ - sqp = sqs->sqs_list[0]; - ASSERT(sqp != NULL); - ill_rx_ring->rr_sqp = sqp; - ill_rx_ring->rr_ring_state = ILL_RING_INUSE; - + if (rx_ring->rr_ring_state == RR_FREE || + rx_ring->rr_ring_state == RR_FREE_INPROG) { mutex_exit(&ill->ill_lock); - ill_waiter_dcr(ill); return; } - ASSERT(MUTEX_HELD(&sqp->sq_lock)); - sqp->sq_rx_ring = ill_rx_ring; - ill_rx_ring->rr_sqp = sqp; - ill_rx_ring->rr_ring_state = ILL_RING_INUSE; - - sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB); - mutex_exit(&sqp->sq_lock); + if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1) + rx_ring->rr_ring_state = RR_SQUEUE_BOUND; mutex_exit(&ill->ill_lock); - - /* ill_waiter_dcr will also signal any waiters on ill_ring_state */ - ill_waiter_dcr(ill); } -/* - * Do a Rx ring to squeue binding. Find a unique squeue that is not - * managing a receive ring. If no such squeue exists, dynamically - * create a new one in the squeue set. - * - * The function runs via the system taskq. The ill passed as an - * argument can't go away since we hold a ref. The lock order is - * ill_lock -> sqs_lock -> sq_lock. - * - * If we are binding a Rx ring to a squeue attached to the offline CPU, - * no need to check that because squeues are never destroyed once - * created. 
- */ -/* ARGSUSED */ -static void -ip_squeue_soft_ring_affinity(void *arg) +void * +ip_squeue_add_ring(ill_t *ill, void *mrp) { - ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg; - ill_t *ill = sq_arg->ip_taskq_ill; - ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab; - ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring; - cpu_t *intr_cpu = sq_arg->ip_taskq_cpu; - cpu_t *bind_cpu; - int cpu_id = intr_cpu->cpu_id; - int min_cpu_id, max_cpu_id; - boolean_t enough_uniq_cpus = B_FALSE; - boolean_t enough_cpus = B_FALSE; - squeue_set_t *sqs, *last_sqs; - squeue_t *sqp = NULL; - int i, j; - - ASSERT(ill != NULL); - kmem_free(arg, sizeof (ip_taskq_arg_t)); + mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; + ill_rx_ring_t *rx_ring, *ring_tbl; + int ip_rx_index; + squeue_t *sq = NULL; + pri_t pri; - /* - * Make sure the CPU that originally took the interrupt still - * exists. - */ - if (!CPU_ISON(intr_cpu)) { - intr_cpu = CPU; - cpu_id = intr_cpu->cpu_id; - } + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(mrfp->mrf_type == MAC_RX_FIFO); + ASSERT(ill->ill_dld_capab != NULL); - /* - * If this ill represents link aggregation, then there might be - * multiple NICs trying to register them selves at the same time - * and in order to ensure that test and assignment of free rings - * is sequential, we need to hold the ill_lock. - */ - mutex_enter(&ill->ill_lock); + ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl; - if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) { - mutex_exit(&ill->ill_lock); - return; + mutex_enter(&ill->ill_lock); + for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { + rx_ring = &ring_tbl[ip_rx_index]; + if (rx_ring->rr_ring_state == RR_FREE) + break; } - /* - * We need to fanout the interrupts from the NIC. We do that by - * telling the driver underneath to create soft rings and use - * worker threads (if the driver advertized SOFT_RING capability) - * Its still a big performance win to if we can fanout to the - * threads on the same core that is taking interrupts. - * - * Since we don't know the interrupt to CPU binding, we don't - * assign any squeues or affinity to worker threads in the NIC. - * At the time of the first interrupt, we know which CPU is - * taking interrupts and try to find other threads on the same - * core. Assuming, ip_threads_per_cpu is correct and cpus are - * numbered sequentially for each core (XXX need something better - * than this in future), find the lowest number and highest - * number thread for that core. - * - * If we have one more thread per core than number of soft rings, - * then don't assign any worker threads to the H/W thread (cpu) - * taking interrupts (capability negotiation tries to ensure this) - * - * If the number of threads per core are same as the number of - * soft rings, then assign the worker affinity and squeue to - * the same cpu. - * - * Otherwise, just fanout to higher number CPUs starting from - * the interrupted CPU. - */ - min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu; - max_cpu_id = min_cpu_id + ip_threads_per_cpu; - - /* - * Quickly check if there are enough CPUs present for fanout - * and also max_cpu_id is less than the id of the active CPU. - * We use the cpu_id stored in the last squeue_set to get - * an idea. The scheme is by no means perfect since it doesn't - * take into account CPU DR operations and the fact that - * interrupts themselves might change. 
An ideal scenario
- * would be to ensure that interrupts run cpus by themselves
- * and worker threads never have affinity to those CPUs. If
- * the interrupts move to CPU which had a worker thread, it
- * should be changed. Probably callbacks similar to CPU offline
- * are needed to make it work perfectly.
- */
- last_sqs = sqset_global_list[sqset_global_size - 1];
- if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
- if ((max_cpu_id - min_cpu_id) >
- ill_soft_ring->ill_dls_soft_ring_cnt)
- enough_uniq_cpus = B_TRUE;
- else if ((max_cpu_id - min_cpu_id) >=
- ill_soft_ring->ill_dls_soft_ring_cnt)
- enough_cpus = B_TRUE;
+ if (ip_rx_index == ILL_MAX_RINGS) {
+ /*
+ * We ran out of ILL_MAX_RINGS worth of rx_ring structures. If
+ * we have devices which can overwhelm this limit,
+ * ILL_MAX_RINGS should be made configurable. Meanwhile it
+ * causes no panic, because the driver will pass ip_input a
+ * NULL handle, which will make IP allocate the default squeue,
+ * and polling mode will not be used for this ring.
+ */
+ cmn_err(CE_NOTE,
+ "Reached maximum number of receiving rings (%d) for %s\n",
+ ILL_MAX_RINGS, ill->ill_name);
+ mutex_exit(&ill->ill_lock);
+ return (NULL);
 }
- j = 0;
- for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
- if (enough_uniq_cpus) {
- if ((min_cpu_id + i) == cpu_id) {
- j++;
- continue;
- }
- bind_cpu = cpu[min_cpu_id + i];
- } else if (enough_cpus) {
- bind_cpu = cpu[min_cpu_id + i];
- } else {
- /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
- bind_cpu = cpu[(cpu_id + i) % ncpus];
- }
+ bzero(rx_ring, sizeof (ill_rx_ring_t));
+ rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
+ /* XXX: Hard code it to tcp accept for now */
+ rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;
- /*
- * Check if the CPU actually exist and active. If not,
- * use the interrupted CPU. ip_find_unused_squeue() will
- * find the right CPU to fanout anyway.
- */
- if (!CPU_ISON(bind_cpu))
- bind_cpu = intr_cpu;
+ rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
+ rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
+ rx_ring->rr_intr_disable =
+ (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
+ rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
+ rx_ring->rr_ill = ill;
- sqs = bind_cpu->cpu_squeue_set;
- ASSERT(sqs != NULL);
- ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
+ pri = mrfp->mrf_flow_priority;
- sqp = ip_find_unused_squeue(sqs, B_TRUE);
- if (sqp == NULL) {
- /*
- * We hit the max limit of squeues allowed per CPU.
- * Assign this rx_ring to DEFAULT squeue of the
- * interrupted CPU but thesqueue will not manage
- * the ring. Also print a warning.
- */
- cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
- "%d/%p already has max number of squeues.
System " - "performance might become suboptimal\n", - sqs->sqs_bind, (void *)sqs); + sq = ip_squeue_getfree(pri); - /* the first squeue in the list is the default squeue */ - sqp = intr_cpu->cpu_squeue_set->sqs_list[0]; - ASSERT(sqp != NULL); + mutex_enter(&sq->sq_lock); + sq->sq_rx_ring = rx_ring; + rx_ring->rr_sqp = sq; - ill_rx_ring->rr_sqp = sqp; - ill_rx_ring->rr_ring_state = ILL_RING_INUSE; - continue; + sq->sq_state |= SQS_POLL_CAPAB; - } - ASSERT(MUTEX_HELD(&sqp->sq_lock)); - ill_rx_ring->rr_sqp = sqp; - sqp->sq_rx_ring = ill_rx_ring; - ill_rx_ring->rr_ring_state = ILL_RING_INUSE; - sqp->sq_state |= SQS_ILL_BOUND; - - /* assign affinity to soft ring */ - if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) { - ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle, - sqp->sq_bind); - } - mutex_exit(&sqp->sq_lock); - } + rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND; + sq->sq_ill = ill; + mutex_exit(&sq->sq_lock); mutex_exit(&ill->ill_lock); - ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle, - SOFT_RING_FANOUT); + DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int, + ip_rx_index, void *, mrfp->mrf_rx_arg); - mutex_enter(&ill->ill_lock); - ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; - mutex_exit(&ill->ill_lock); + /* Assign the squeue to the specified CPU as well */ + mutex_enter(&cpu_lock); + (void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id); + mutex_exit(&cpu_lock); - /* ill_waiter_dcr will also signal any waiters on ill_ring_state */ - ill_waiter_dcr(ill); + return (rx_ring); } -/* ARGSUSED */ +/* + * sanitize the squeue etc. Some of the processing + * needs to be done from inside the perimeter. + */ void -ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring, - mblk_t *mp_chain, struct mac_header_info_s *mhip) +ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) { - ip_taskq_arg_t *taskq_arg; - boolean_t refheld; - - mutex_enter(&ill->ill_lock); - if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) { - taskq_arg = (ip_taskq_arg_t *) - kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP); - - if (taskq_arg == NULL) - goto out; + squeue_t *sqp; - taskq_arg->ip_taskq_ill = ill; - taskq_arg->ip_taskq_ill_rx_ring = NULL; - taskq_arg->ip_taskq_cpu = CPU; + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(rx_ring != NULL); - /* - * Set ILL_SOFT_RING_ASSIGN flag. We don't want - * the next interrupt to schedule a task for calling - * ip_squeue_soft_ring_affinity(); - */ - ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN; - } else { + /* Just clean one squeue */ + mutex_enter(&ill->ill_lock); + if (rx_ring->rr_ring_state == RR_FREE) { mutex_exit(&ill->ill_lock); - goto out; + return; } + rx_ring->rr_ring_state = RR_FREE_INPROG; + sqp = rx_ring->rr_sqp; + + mutex_enter(&sqp->sq_lock); + sqp->sq_state |= SQS_POLL_CLEANUP; + cv_signal(&sqp->sq_worker_cv); mutex_exit(&ill->ill_lock); - refheld = ill_waiter_inc(ill); - if (refheld) { - if (taskq_dispatch(system_taskq, - ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP)) - goto out; - - /* release ref on ill if taskq dispatch fails */ - ill_waiter_dcr(ill); - } + while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE)) + cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock); + sqp->sq_state &= ~(SQS_POLL_CLEANUP_DONE | SQS_ILL_BOUND); + + ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL | + SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE | + SQS_POLL_THR_QUIESCED))); + + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + /* - * Turn on CAPAB_SOFT_RING so that affinity assignment - * can be tried again later. 
+ * Logically free the squeue. It goes back to the set of unused + * squeues */ + mutex_enter(&sqset_lock); + ip_squeue_set_move(sqp, sqset_global_list[0]); + mutex_exit(&sqset_lock); + mutex_enter(&ill->ill_lock); - ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; + rx_ring->rr_ring_state = RR_FREE; mutex_exit(&ill->ill_lock); - kmem_free(taskq_arg, sizeof (ip_taskq_arg_t)); - -out: - ip_input(ill, NULL, mp_chain, mhip); } -static squeue_t * -ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout) +/* + * Stop the squeue from polling. This needs to be done + * from inside the perimeter. + */ +void +ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring) { - int i; - squeue_set_t *best_sqs = NULL; - squeue_set_t *curr_sqs = NULL; - int min_sq = 0; - squeue_t *sqp = NULL; - char sqname[64]; - cpu_t *bind_cpu; - - /* - * If fanout is set and the passed squeue_set already has some - * squeues which are managing the NICs, try to find squeues on - * unused CPU. - */ - if (sqs->sqs_size > 1 && fanout) { - /* - * First check to see if any squeue on the CPU passed - * is managing a NIC. - */ - mutex_enter(&sqs->sqs_lock); - for (i = 0; i < sqs->sqs_size; i++) { - mutex_enter(&sqs->sqs_list[i]->sq_lock); - if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) && - !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) { - mutex_exit(&sqs->sqs_list[i]->sq_lock); - break; - } - mutex_exit(&sqs->sqs_list[i]->sq_lock); - } - mutex_exit(&sqs->sqs_lock); - if (i != sqs->sqs_size) { - best_sqs = NULL; - - for (i = sqset_global_size - 1; i >= 0; i--) { - curr_sqs = sqset_global_list[i]; - /* - * Check and make sure the CPU that sqs - * is bound to is valid. There could be - * sqs's around whose CPUs could have - * been DR'd out. - */ - mutex_enter(&cpu_lock); - if (cpu_get(curr_sqs->sqs_bind) != NULL) { - if (best_sqs == NULL) { - best_sqs = curr_sqs; - min_sq = curr_sqs->sqs_size; - } else if (curr_sqs->sqs_size < - min_sq) { - best_sqs = curr_sqs; - min_sq = curr_sqs->sqs_size; - } - } - mutex_exit(&cpu_lock); - } - - ASSERT(best_sqs != NULL); - sqs = best_sqs; - } - } + squeue_t *sqp; - mutex_enter(&sqs->sqs_lock); + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(rx_ring != NULL); - for (i = 0; i < sqs->sqs_size; i++) { - mutex_enter(&sqs->sqs_list[i]->sq_lock); - if ((sqs->sqs_list[i]->sq_state & - (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) { - sqp = sqs->sqs_list[i]; - break; - } - mutex_exit(&sqs->sqs_list[i]->sq_lock); - } + sqp = rx_ring->rr_sqp; + mutex_enter(&sqp->sq_lock); + sqp->sq_state |= SQS_POLL_QUIESCE; + cv_signal(&sqp->sq_worker_cv); + while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) + cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock); - if (sqp == NULL) { - /* Need to create a new squeue */ - if (sqs->sqs_size == sqs->sqs_max_size) { - /* - * Reached the max limit for squeue - * we can allocate on this CPU. - */ - mutex_exit(&sqs->sqs_lock); - return (NULL); - } + mutex_exit(&sqp->sq_lock); +} - mutex_enter(&cpu_lock); - if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) { - /* Too bad, CPU got DR'd out, return NULL */ - mutex_exit(&cpu_lock); - mutex_exit(&sqs->sqs_lock); - return (NULL); - } +/* + * Restart polling etc. Needs to be inside the perimeter to + * prevent races. 
+ */ +void +ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring) +{ + squeue_t *sqp; - bzero(sqname, sizeof (sqname)); - (void) snprintf(sqname, sizeof (sqname), - "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid, - bind_cpu->cpu_id, sqs->sqs_size); - mutex_exit(&cpu_lock); + ASSERT(ILL_MAC_PERIM_HELD(ill)); + ASSERT(rx_ring != NULL); - sqp = squeue_create(sqname, sqs->sqs_bind, - ip_squeue_worker_wait, minclsyspri); + sqp = rx_ring->rr_sqp; + mutex_enter(&sqp->sq_lock); + /* + * Handle change in number of rings between the quiesce and + * restart operations by checking for a previous quiesce before + * attempting a restart. + */ + if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) { + mutex_exit(&sqp->sq_lock); + return; + } + sqp->sq_state |= SQS_POLL_RESTART; + cv_signal(&sqp->sq_worker_cv); + while (!(sqp->sq_state & SQS_POLL_RESTART_DONE)) + cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock); + sqp->sq_state &= ~SQS_POLL_RESTART_DONE; + mutex_exit(&sqp->sq_lock); +} - ASSERT(sqp != NULL); +/* + * sanitize all squeues associated with the ill. + */ +void +ip_squeue_clean_all(ill_t *ill) +{ + int idx; + ill_rx_ring_t *rx_ring; - squeue_profile_enable(sqp); - /* - * Other functions scanning sqs_list don't take sqs_lock. - * Once sqp is stored in sqs_list[] global visibility is - * ensured before incrementing the sqs_size counter. - */ - sqs->sqs_list[sqs->sqs_size] = sqp; - membar_producer(); - sqs->sqs_size++; - - if (ip_squeue_create_callback != NULL) - ip_squeue_create_callback(sqp); - - if (ip_squeue_bind) { - mutex_enter(&cpu_lock); - bind_cpu = cpu_get(sqs->sqs_bind); - if (bind_cpu != NULL && cpu_is_online(bind_cpu)) { - squeue_bind(sqp, -1); - } - mutex_exit(&cpu_lock); - } - mutex_enter(&sqp->sq_lock); + for (idx = 0; idx < ILL_MAX_RINGS; idx++) { + rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx]; + ip_squeue_clean_ring(ill, rx_ring); } - - mutex_exit(&sqs->sqs_lock); - ASSERT(sqp != NULL); - return (sqp); } /* - * Find the squeue assigned to manage this Rx ring. If the Rx ring is not - * owned by a squeue yet, do the assignment. When the NIC registers it - * Rx rings with IP, we don't know where the interrupts will land and - * hence we need to wait till this point to do the assignment. + * Used by IP to get the squeue associated with a ring. If the squeue isn't + * yet bound to a CPU, and we're being called directly from the NIC's + * interrupt, then we know what CPU we want to assign the squeue to, so + * dispatch that task to a taskq. */ squeue_t * ip_squeue_get(ill_rx_ring_t *ill_rx_ring) { squeue_t *sqp; - ill_t *ill; - int interrupt; - ip_taskq_arg_t *taskq_arg; - boolean_t refheld; - - if (ill_rx_ring == NULL) - return (IP_SQUEUE_GET(lbolt)); - - sqp = ill_rx_ring->rr_sqp; - /* - * Do a quick check. If it's not NULL, we are done. - * Squeues are never destroyed so worse we will bind - * this connection to a suboptimal squeue. - * - * This is the fast path case. - */ - if (sqp != NULL) - return (sqp); - - ill = ill_rx_ring->rr_ill; - ASSERT(ill != NULL); - - interrupt = servicing_interrupt(); - taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t), - KM_NOSLEEP); - mutex_enter(&ill->ill_lock); - /* - * Check sqp under the lock again for atomicity. Possible race with - * a previously scheduled ip_squeue_get -> ip_squeue_extend. - * Do the ring to squeue binding only if we are in interrupt context - * AND the ring is not already bound AND there is no one else trying - * the bind already. 
- */
- sqp = ill_rx_ring->rr_sqp;
- if (sqp != NULL || !interrupt ||
- ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
- /*
- * Note that the ring might get bound once we drop the lock
- * below, if a previous request is in progress i.e. if the ring
- * state is ILL_RING_INPROC. The incoming connection on whose
- * behalf we are currently here might get a suboptimal squeue
- * via the call to IP_SQUEUE_GET below, but there is no
- * correctness issue.
- */
- mutex_exit(&ill->ill_lock);
- if (taskq_arg != NULL)
- kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
- if (sqp != NULL)
- return (sqp);
+ if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
 return (IP_SQUEUE_GET(lbolt));
- }
-
- /*
- * No sqp assigned yet. Can't really do that in interrupt
- * context. Assign the default sqp to this connection and
- * trigger creation of new sqp and binding it to this ring
- * via taskq. Need to make sure ill stays around.
- */
- taskq_arg->ip_taskq_ill = ill;
- taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
- taskq_arg->ip_taskq_cpu = CPU;
- ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
- mutex_exit(&ill->ill_lock);
- refheld = ill_waiter_inc(ill);
- if (refheld) {
- if (taskq_dispatch(system_taskq, ip_squeue_extend,
- taskq_arg, TQ_NOSLEEP) != NULL) {
- return (IP_SQUEUE_GET(lbolt));
- }
- }
- /*
- * The ill is closing and we could not get a reference on the ill OR
- * taskq_dispatch failed probably due to memory allocation failure.
- * We will try again next time.
- */
- mutex_enter(&ill->ill_lock);
- ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
- mutex_exit(&ill->ill_lock);
- kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
- if (refheld)
- ill_waiter_dcr(ill);
- return (IP_SQUEUE_GET(lbolt));
+ return (sqp);
 }

 /*
- * NDD hooks for setting ip_squeue_xxx tuneables.
+ * Called when a CPU goes offline. Its squeue_set_t is destroyed, and all
+ * squeues are unbound and moved to the unbound set.
 */
-
-/* ARGSUSED */
-int
-ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
- caddr_t addr, cred_t *cr)
+static void
+ip_squeue_set_destroy(cpu_t *cpu)
 {
- int *bind_enabled = (int *)addr;
- long new_value;
 int i;
+ squeue_t *sqp, *lastsqp = NULL;
+ squeue_set_t *sqs, *unbound = sqset_global_list[0];

- if (ddi_strtol(value, NULL, 10, &new_value) != 0)
- return (EINVAL);
+ mutex_enter(&sqset_lock);
+ if ((sqs = cpu->cpu_squeue_set) == NULL) {
+ mutex_exit(&sqset_lock);
+ return;
+ }

- if (ip_squeue_bind == new_value)
- return (0);
+ /* Move all squeues to unbound set */

- *bind_enabled = new_value;
- mutex_enter(&cpu_lock);
- if (new_value == 0) {
- for (i = 0; i < sqset_global_size; i++)
- ip_squeue_set_unbind(sqset_global_list[i]);
- } else {
- for (i = 0; i < sqset_global_size; i++)
- ip_squeue_set_bind(sqset_global_list[i]);
+ for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
+ squeue_unbind(sqp);
+ sqp->sq_set = unbound;
+ }
+ if (sqs->sqs_head) {
+ lastsqp->sq_next = unbound->sqs_head;
+ unbound->sqs_head = sqs->sqs_head;
 }
- mutex_exit(&cpu_lock);
- return (0);
-}
+ /* Also move default squeue to unbound set */

-/*
- * Set squeue profiling.
- * 0 means "disable" - * 1 means "enable" - * 2 means "enable and reset" - */ -/* ARGSUSED */ -int -ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, - cred_t *cr) -{ - int *profile_enabled = (int *)cp; - long new_value; - squeue_set_t *sqs; - - if (ddi_strtol(value, NULL, 10, &new_value) != 0) - return (EINVAL); - - if (new_value == 0) - squeue_profile_stop(); - else if (new_value == 1) - squeue_profile_start(); - else if (new_value == 2) { - int i, j; - - squeue_profile_stop(); - mutex_enter(&cpu_lock); - for (i = 0; i < sqset_global_size; i++) { - sqs = sqset_global_list[i]; - for (j = 0; j < sqs->sqs_size; j++) { - squeue_profile_reset(sqs->sqs_list[j]); - } - } - mutex_exit(&cpu_lock); + sqp = sqs->sqs_default; + ASSERT(sqp); + ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT); - new_value = 1; - squeue_profile_start(); - } - *profile_enabled = new_value; + sqp->sq_next = unbound->sqs_head; + unbound->sqs_head = sqp; + squeue_unbind(sqp); + sqp->sq_set = unbound; - return (0); + for (i = 1; i < sqset_global_size; i++) + if (sqset_global_list[i] == sqs) + break; + + ASSERT(i < sqset_global_size); + sqset_global_list[i] = sqset_global_list[sqset_global_size - 1]; + sqset_global_list[sqset_global_size - 1] = NULL; + sqset_global_size--; + + mutex_exit(&sqset_lock); + kmem_free(sqs, sizeof (*sqs)); } /* * Reconfiguration callback */ - /* ARGSUSED */ static int ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg) { - cpu_t *cp = cpu[id]; + cpu_t *cp = cpu_get(id); ASSERT(MUTEX_HELD(&cpu_lock)); switch (what) { case CPU_CONFIG: - /* - * A new CPU is added. Create an squeue for it but do not bind - * it yet. - */ - if (cp->cpu_squeue_set == NULL) - cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE); - break; case CPU_ON: case CPU_INIT: case CPU_CPUPART_IN: - if (cp->cpu_squeue_set == NULL) { - cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE); - } - if (ip_squeue_bind) - ip_squeue_set_bind(cp->cpu_squeue_set); + if (cp->cpu_squeue_set == NULL) + cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id); break; case CPU_UNCONFIG: case CPU_OFF: case CPU_CPUPART_OUT: ASSERT((cp->cpu_squeue_set != NULL) || (cp->cpu_flags & CPU_OFFLINE)); - if (cp->cpu_squeue_set != NULL) { - ip_squeue_set_unbind(cp->cpu_squeue_set); + ip_squeue_set_destroy(cp); + cp->cpu_squeue_set = NULL; } break; default: @@ -1111,54 +756,3 @@ ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg) } return (0); } - -/* ARGSUSED */ -static void -ip_squeue_set_bind(squeue_set_t *sqs) -{ - int i; - squeue_t *sqp; - - if (!ip_squeue_bind) - return; - - mutex_enter(&sqs->sqs_lock); - for (i = 0; i < sqs->sqs_size; i++) { - sqp = sqs->sqs_list[i]; - if (sqp->sq_state & SQS_BOUND) - continue; - squeue_bind(sqp, -1); - } - mutex_exit(&sqs->sqs_lock); -} - -static void -ip_squeue_set_unbind(squeue_set_t *sqs) -{ - int i; - squeue_t *sqp; - - mutex_enter(&sqs->sqs_lock); - for (i = 0; i < sqs->sqs_size; i++) { - sqp = sqs->sqs_list[i]; - - /* - * CPU is going offline. Remove the thread affinity - * for any soft ring threads the squeue is managing. 
- */ - if (sqp->sq_state & SQS_ILL_BOUND) { - ill_rx_ring_t *ring = sqp->sq_rx_ring; - ill_t *ill = ring->rr_ill; - - if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { - ASSERT(ring->rr_handle != NULL); - ill->ill_dls_capab->ill_dls_unbind( - ring->rr_handle); - } - } - if (!(sqp->sq_state & SQS_BOUND)) - continue; - squeue_unbind(sqp); - } - mutex_exit(&sqs->sqs_lock); -} diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c index 7274576285..f785d8a3f6 100644 --- a/usr/src/uts/common/inet/ip/spd.c +++ b/usr/src/uts/common/inet/ip/spd.c @@ -176,13 +176,6 @@ int ipsec_weird_null_inbound_policy = 0; (((sa1)->ipsa_dst_cid == (sa2)->ipsa_dst_cid)))) /* - * IPv4 Fragments - */ -#define IS_V4_FRAGMENT(ipha_fragment_offset_and_flags) \ - (((ntohs(ipha_fragment_offset_and_flags) & IPH_OFFSET) != 0) || \ - ((ntohs(ipha_fragment_offset_and_flags) & IPH_MF) != 0)) - -/* * IPv6 Fragments */ #define IS_V6_FRAGMENT(ipp) (ipp.ipp_fields & IPPF_FRAGHDR) diff --git a/usr/src/uts/common/inet/ip/tun.c b/usr/src/uts/common/inet/ip/tun.c index 24af532b77..632601b5f1 100644 --- a/usr/src/uts/common/inet/ip/tun.c +++ b/usr/src/uts/common/inet/ip/tun.c @@ -3202,7 +3202,7 @@ tun_rdata_v4(queue_t *q, mblk_t *ipsec_mp, mblk_t *data_mp, tun_t *atp) */ pullup_len = hdrlen + (inner_v4 ? sizeof (ipha_t) : sizeof (ip6_t)) + 4; if ((data_mp->b_wptr - data_mp->b_rptr) < pullup_len) { - if (!pullupmsg(data_mp, hdrlen + pullup_len)) { + if (!pullupmsg(data_mp, pullup_len)) { atomic_add_32(&atp->tun_InErrors, 1); atomic_add_32(&atp->tun_InDiscard, 1); if (ipsec_mp != NULL) diff --git a/usr/src/uts/common/inet/ip_ftable.h b/usr/src/uts/common/inet/ip_ftable.h index e729761147..6a3a05183b 100644 --- a/usr/src/uts/common/inet/ip_ftable.h +++ b/usr/src/uts/common/inet/ip_ftable.h @@ -27,8 +27,6 @@ #ifndef _INET_IP_FTABLE_H #define _INET_IP_FTABLE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -94,6 +92,8 @@ extern void ire_delete_host_redirects(ipaddr_t, ip_stack_t *); extern ire_t *ire_ihandle_lookup_onlink(ire_t *); extern ire_t *ire_forward(ipaddr_t, enum ire_forward_action *, ire_t *, ire_t *, const struct ts_label_s *, ip_stack_t *); +extern ire_t *ire_forward_simple(ipaddr_t, enum ire_forward_action *, + ip_stack_t *); extern irb_t *ire_get_bucket(ire_t *); extern uint_t ifindex_lookup(const struct sockaddr *, zoneid_t); extern int ipfil_sendpkt(const struct sockaddr *, mblk_t *, uint_t, zoneid_t); diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h index 1bd5b47a9f..c0a6c51696 100644 --- a/usr/src/uts/common/inet/ip_if.h +++ b/usr/src/uts/common/inet/ip_if.h @@ -142,6 +142,12 @@ extern "C" { #define RESTRICT_TO_GROUP 0x1 /* Restrict to IPMP group */ #define RESTRICT_TO_ILL 0x2 /* Restrict to ILL */ +#ifdef DEBUG +#define ILL_MAC_PERIM_HELD(ill) ill_mac_perim_held(ill) +#else +#define ILL_MAC_PERIM_HELD(ill) +#endif + /* for ipif_resolver_up */ enum ip_resolver_action { Res_act_initial, /* initial address establishment */ @@ -158,6 +164,7 @@ extern void ill_dlpi_done(ill_t *, t_uscalar_t); extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t); extern void ill_dlpi_send(ill_t *, mblk_t *); extern void ill_dlpi_send_deferred(ill_t *); +extern void ill_capability_done(ill_t *); extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t); extern ill_t *ill_group_lookup_on_ifindex(uint_t, boolean_t, ip_stack_t *); @@ -208,9 +215,12 @@ extern void ill_untrace_ref(ill_t *); extern boolean_t ill_down_start(queue_t *, mblk_t *); 
extern ill_t *ill_lookup_group_v6(const in6_addr_t *, zoneid_t, ip_stack_t *); + extern void ill_capability_ack(ill_t *, mblk_t *); extern void ill_capability_probe(ill_t *); -extern void ill_capability_reset(ill_t *); +extern void ill_capability_reset(ill_t *, boolean_t); +extern void ill_taskq_dispatch(ip_stack_t *); + extern void ill_mtu_change(ire_t *, char *); extern void ill_group_cleanup(ill_t *); extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); @@ -281,10 +291,11 @@ extern void ipsq_current_start(ipsq_t *, ipif_t *, int); extern void ipsq_current_finish(ipsq_t *); extern void ipsq_enq(ipsq_t *, queue_t *, mblk_t *, ipsq_func_t, int, ill_t *); -extern boolean_t ipsq_enter(ill_t *, boolean_t); +extern boolean_t ipsq_enter(ill_t *, boolean_t, int); extern ipsq_t *ipsq_try_enter(ipif_t *, ill_t *, queue_t *, mblk_t *, ipsq_func_t, int, boolean_t); extern void ipsq_exit(ipsq_t *); +extern boolean_t ill_mac_perim_held(ill_t *); extern mblk_t *ipsq_pending_mp_get(ipsq_t *, conn_t **); extern boolean_t ipsq_pending_mp_add(conn_t *, ipif_t *, queue_t *, mblk_t *, int); diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index d993e5f6b4..f7a9b8ff58 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -40,6 +40,7 @@ extern "C" { #ifdef _KERNEL #include <sys/sdt.h> +#include <sys/dld.h> #define IP_MOD_ID 5701 @@ -359,7 +360,7 @@ typedef struct ip_mdt_info_s { ill->ill_mdt_capab->ill_mdt_on != 0) #define ILL_LSO_CAPABLE(ill) \ - (((ill)->ill_capabilities & ILL_CAPAB_LSO) != 0) + (((ill)->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) /* * ioctl identifier and structure for Large Segment Offload @@ -378,12 +379,11 @@ typedef struct ip_lso_info_s { #define ILL_LSO_USABLE(ill) \ (ILL_LSO_CAPABLE(ill) && \ ill->ill_lso_capab != NULL && \ - ill->ill_lso_capab->ill_lso_version == LSO_VERSION_1 && \ ill->ill_lso_capab->ill_lso_on != 0) #define ILL_LSO_TCP_USABLE(ill) \ (ILL_LSO_USABLE(ill) && \ - ill->ill_lso_capab->ill_lso_flags & LSO_TX_BASIC_TCP_IPV4) + ill->ill_lso_capab->ill_lso_flags & DLD_LSO_TX_BASIC_TCP_IPV4) /* * Macro that determines whether or not a given CONN may be considered @@ -497,43 +497,36 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; (connp)->conn_udp->udp_drain_qfull : \ !canputnext((connp)->conn_rq)) -#define ILL_DLS_CAPABLE(ill) \ - (((ill)->ill_capabilities & \ - (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) != 0) - -/* - * Macro that hands off one or more messages directly to DLD - * when the interface is marked with ILL_CAPAB_POLL. 
- */
-#define IP_DLS_ILL_TX(ill, ipha, mp, ipst, hlen) { \
- ill_dls_capab_t *ill_dls = ill->ill_dls_capab; \
- ASSERT(ILL_DLS_CAPABLE(ill)); \
- ASSERT(ill_dls != NULL); \
- ASSERT(ill_dls->ill_tx != NULL); \
- ASSERT(ill_dls->ill_tx_handle != NULL); \
- DTRACE_PROBE4(ip4__physical__out__start, \
- ill_t *, NULL, ill_t *, ill, \
- ipha_t *, ipha, mblk_t *, mp); \
- FW_HOOKS(ipst->ips_ip4_physical_out_event, \
- ipst->ips_ipv4firewall_physical_out, \
- NULL, ill, ipha, mp, mp, 0, ipst); \
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); \
- if (mp != NULL) { \
- if (ipst->ips_ipobs_enabled) { \
- zoneid_t szone; \
- \
- szone = ip_get_zoneid_v4(ipha->ipha_src, mp, \
- ipst, ALL_ZONES); \
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, \
- ALL_ZONES, ill, IPV4_VERSION, hlen, ipst); \
- } \
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, \
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, \
- ipha_t *, ipha, ip6_t *, NULL, int, 0); \
- ill_dls->ill_tx(ill_dls->ill_tx_handle, mp); \
- } \
+/* Macro that follows definitions of flags for mac_tx() (see mac_client.h) */
+#define IP_DROP_ON_NO_DESC 0x01 /* Equivalent to MAC_DROP_ON_NO_DESC */
+
+#define ILL_DIRECT_CAPABLE(ill) \
+ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0)
+
+#define ILL_SEND_TX(ill, ire, hint, mp, flag) { \
+ if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \
+ ill_dld_direct_t *idd; \
+ \
+ idd = &(ill)->ill_dld_capab->idc_direct; \
+ /* \
+ * Send the packet directly to DLD, where it \
+ * may be queued depending on the availability \
+ * of transmit resources at the media layer. \
+ * Ignore the returned value for the time being. \
+ * In the future, we may want to take this into \
+ * account and flow-control the TCP. \
+ */ \
+ (void) idd->idd_tx_df(idd->idd_tx_dh, mp, \
+ (uintptr_t)(hint), flag); \
+ } else { \
+ putnext((ire)->ire_stq, mp); \
+ } \
 }

+#define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \
+ (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \
+ (((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr))
+
 /*
 * In non-global zone exclusive IP stacks, data structures such as IRE
 * entries pretend that they're in the global zone.
The following @@ -548,6 +541,7 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; extern int ip_wput_frag_mdt_min; extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t); extern mblk_t *ip_prepend_zoneid(mblk_t *, zoneid_t, ip_stack_t *); +extern void ill_flow_enable(void *, ip_mac_tx_cookie_t); extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_stack_t *, zoneid_t); extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *, ip_stack_t *, zoneid_t); diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h index c9a0e12ea1..7accbbcfa3 100644 --- a/usr/src/uts/common/inet/ip_ire.h +++ b/usr/src/uts/common/inet/ip_ire.h @@ -235,6 +235,7 @@ extern void ire_atomic_end(irb_t *irb_ptr, ire_t *ire); extern void ire_cache_count(ire_t *, char *); extern ire_t *ire_cache_lookup(ipaddr_t, zoneid_t, const struct ts_label_s *, ip_stack_t *); +extern ire_t *ire_cache_lookup_simple(ipaddr_t, ip_stack_t *); extern ire_t *ire_cache_lookup_v6(const in6_addr_t *, zoneid_t, const struct ts_label_s *, ip_stack_t *); extern void ire_cache_reclaim(ire_t *, char *); diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index b788b95fa0..d0c3953374 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -35,7 +35,7 @@ extern "C" { #include <netinet/igmp_var.h> #ifdef _KERNEL - +#include <sys/list.h> /* * IP statistics. @@ -175,6 +175,13 @@ struct ip_stack { struct ill_group *ips_illgrp_head_v4; /* Head of IPv4 ill groups */ struct ill_group *ips_illgrp_head_v6; /* Head of IPv6 ill groups */ + /* Taskq dispatcher for capability operations */ + kmutex_t ips_capab_taskq_lock; + kcondvar_t ips_capab_taskq_cv; + list_t ips_capab_taskq_list; + kthread_t *ips_capab_taskq_thread; + boolean_t ips_capab_taskq_quit; + /* ipclassifier.c - keep in ip_stack_t */ /* ipclassifier hash tables */ struct connf_s *ips_rts_clients; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index dac6d023f7..4665549c69 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -26,8 +26,6 @@ #ifndef _INET_IPCLASSIFIER_H #define _INET_IPCLASSIFIER_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -222,10 +220,13 @@ struct conn_s { conn_recvslla : 1, /* IP_RECVSLLA option */ conn_mdt_ok : 1, /* MDT is permitted */ conn_nexthop_set : 1, - conn_allzones : 1, /* SO_ALLZONES */ + conn_allzones : 1; /* SO_ALLZONES */ + unsigned int conn_lso_ok : 1; /* LSO is usable */ + squeue_t *conn_initial_sqp; /* Squeue at open time */ + squeue_t *conn_final_sqp; /* Squeue after connect */ ill_t *conn_nofailover_ill; /* Failover ill */ ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */ ipsec_latch_t *conn_latch; /* latched state */ @@ -286,8 +287,8 @@ struct conn_s { int conn_orig_bound_ifindex; /* BOUND_IF before MOVE */ int conn_orig_multicast_ifindex; /* IPv6 MC IF before MOVE */ - struct conn_s *conn_drain_next; /* Next conn in drain list */ - struct conn_s *conn_drain_prev; /* Prev conn in drain list */ + struct conn_s *conn_drain_next; /* Next conn in drain list */ + struct conn_s *conn_drain_prev; /* Prev conn in drain list */ idl_t *conn_idl; /* Ptr to the drain list head */ mblk_t *conn_ipsec_opt_mp; /* ipsec option mblk */ uint32_t conn_src_preferences; /* prefs for src addr select */ @@ -499,6 +500,7 @@ struct connf_s { (connp)->conn_ports = ports; \ (connp)->conn_send = ip_output; \ (connp)->conn_sqp = 
IP_SQUEUE_GET(lbolt); \ + (connp)->conn_initial_sqp = (connp)->conn_sqp; \ } #define IPCL_TCP_EAGER_INIT_V6(connp, protocol, src, rem, ports) { \ @@ -508,6 +510,7 @@ struct connf_s { (connp)->conn_ports = ports; \ (connp)->conn_send = ip_output_v6; \ (connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \ + (connp)->conn_initial_sqp = (connp)->conn_sqp; \ } #define IPCL_UDP_HASH(lport, ipst) \ diff --git a/usr/src/uts/common/inet/ipdrop.h b/usr/src/uts/common/inet/ipdrop.h index 88dcda264c..9fe672434e 100644 --- a/usr/src/uts/common/inet/ipdrop.h +++ b/usr/src/uts/common/inet/ipdrop.h @@ -124,7 +124,6 @@ struct ip_dropstats { }; #endif /* _KERNEL */ - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 4895e2249e..559abd9178 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -19,144 +19,95 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* - * Squeues - TCP/IP serialization mechanism. - * - * This is a general purpose high-performance serialization mechanism. It is - * similar to a taskq with a single worker thread, the difference is that it - * does not imply a context switch - the thread placing a request may actually - * process it. It is also biased for processing requests in interrupt context. - * - * Each squeue has a worker thread which may optionally be bound to a CPU. - * - * Only one thread may process requests from a given squeue at any time. This is - * called "entering" squeue. - * - * Each dispatched request is processed either by - * - * a) Dispatching thread or - * b) Some other thread that is currently processing squeue at the time of - * request or - * c) worker thread. - * - * INTERFACES: - * - * squeue_t *squeue_create(name, bind, wait, pri) - * - * name: symbolic name for squeue. - * wait: time to wait before waiking the worker thread after queueing - * request. - * bind: preferred CPU binding for the worker thread. - * pri: thread priority for the worker thread. - * - * This function never fails and may sleep. It returns a transparent pointer - * to the squeue_t structure that is passed to all other squeue operations. - * - * void squeue_bind(sqp, bind) - * - * Bind squeue worker thread to a CPU specified by the 'bind' argument. The - * 'bind' value of -1 binds to the preferred thread specified for - * squeue_create. - * - * NOTE: Any value of 'bind' other then -1 is not supported currently, but the - * API is present - in the future it may be useful to specify different - * binding. - * - * void squeue_unbind(sqp) - * - * Unbind the worker thread from its preferred CPU. - * - * void squeue_enter(*sqp, *mp, proc, arg, tag) - * - * Post a single request for processing. Each request consists of mblock 'mp', - * function 'proc' to execute and an argument 'arg' to pass to this - * function. The function is called as (*proc)(arg, mp, sqp); The tag is an - * arbitrary number from 0 to 255 which will be stored in mp to track exact - * caller of squeue_enter. The combination of function name and the tag should - * provide enough information to identify the caller. - * - * If no one is processing the squeue, squeue_enter() will call the function - * immediately. Otherwise it will add the request to the queue for later - * processing. 
Once the function is executed, the thread may continue
- * executing all other requests pending on the queue.
+ * Squeues: General purpose serialization mechanism
+ * ------------------------------------------------
 *
- * NOTE: The tagging information is only used when SQUEUE_DEBUG is set to 1.
- * NOTE: The argument can be conn_t only. Ideally we'd like to have generic
- * argument, but we want to drop connection reference count here - this
- * improves tail-call optimizations.
- * XXX: The arg should have type conn_t.
+ * Background:
+ * -----------
 *
- * void squeue_enter_nodrain(*sqp, *mp, proc, arg, tag)
+ * This is a general purpose high-performance serialization mechanism
+ * currently used by TCP/IP. It is implemented by means of a per-CPU queue,
+ * a worker thread and a polling thread which are bound to the CPU
+ * associated with the squeue. The squeue is strictly FIFO for both the read
+ * and write side and only one thread can process it at any given time.
+ * The design goal of the squeue was to offer a very high degree of
+ * parallelization (on a per H/W execution pipeline basis) with at
+ * most one queuing.
 *
- * Same as squeue_enter(), but the entering thread will only try to execute a
- * single request. It will not continue executing any pending requests.
+ * Modules needing protection typically call the squeue_enter() or
+ * squeue_enter_chain() routine as soon as a thread enters the module
+ * from either direction. For each packet, the processing function
+ * and argument are stored in the mblk itself. When the packet is ready
+ * to be processed, the squeue retrieves the stored function and calls
+ * it with the supplied argument and the pointer to the packet itself.
+ * The called function can assume that no other thread is processing
+ * the squeue when it is executing.
 *
- * void squeue_fill(*sqp, *mp, proc, arg, tag)
+ * Squeue/connection binding:
+ * --------------------------
 *
- * Just place the request on the queue without trying to execute it. Arrange
- * for the worker thread to process the request.
+ * TCP/IP uses an IP classifier in conjunction with squeues, where specific
+ * connections are assigned to a specific squeue (based on various policies)
+ * at connection creation time. Once assigned, the connection to
+ * squeue mapping is never changed and all future packets for that
+ * connection are processed on that squeue. The connection ("conn") to
+ * squeue mapping is stored in the "conn_t" member "conn_sqp".
 *
- * void squeue_profile_enable(sqp)
- * void squeue_profile_disable(sqp)
+ * Since the processing of the connection cuts across multiple layers
+ * but still allows packets for different connections to be processed on
+ * other CPUs/squeues, squeues are also termed "Vertical Perimeter" or
+ * "Per Connection Vertical Perimeter".
 *
- * Enable or disable profiling for specified 'sqp'. Profiling is only
- * available when SQUEUE_PROFILE is set.
+ * Processing Model:
+ * -----------------
 *
- * void squeue_profile_reset(sqp)
+ * The squeue doesn't necessarily process packets with its own worker thread.
+ * Callers can choose to just queue the packet, to process their packet
+ * if nothing is queued, or to drain and process. The first two
+ * modes are typically employed when the packet was generated while
+ * already doing the processing behind the squeue, and the last mode (drain
+ * and process) is typically employed when the thread is entering the squeue
+ * for the first time. The squeue still imposes a finite time limit
+ * for which an external thread can do processing, after which it switches
+ * processing to its own worker thread.
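+ *
+ * An abstract sketch of the queue-or-process entry (an illustrative
+ * sketch only; the exact squeue_enter() argument list is defined in
+ * squeue.h and is reworked by this change, with tcp_input/connp
+ * standing in for any proc/arg pair):
+ *
+ *	squeue_enter(connp->conn_sqp, mp, tcp_input, connp, tag);
+ *
+ * If no thread currently owns the squeue, the entering thread itself
+ * may run tcp_input() as (*proc)(arg, mp, sqp); otherwise the packet is
+ * queued and drained later, possibly by the worker thread.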
 *
- * Reset all profiling information to zero. Profiling is only
- * available when SQUEUE_PROFILE is set.
+ * Once created, squeues are never deleted. Hence squeue pointers are
+ * always valid. This means that functions outside the squeue can still
+ * refer safely to conn_sqp and there is no need for ref counts.
 *
- * void squeue_profile_start()
- * void squeue_profile_stop()
+ * Only a thread executing in the squeue can change the squeue of the
+ * connection. It does so by calling a squeue framework function to do this.
+ * After changing the squeue, the thread must leave the squeue. It must not
+ * continue to execute any code that needs squeue protection.
 *
- * Globally enable or disabled profiling for all squeues.
+ * The squeue framework, after entering the squeue, checks if the current
+ * squeue matches the conn_sqp. If the check fails, the packet is delivered
+ * to the right squeue.
 *
- * uintptr_t *squeue_getprivate(sqp, p)
+ * Polling Model:
+ * --------------
 *
- * Each squeue keeps small amount of private data space available for various
- * consumers. Current consumers include TCP and NCA. Other consumers need to
- * add their private tag to the sqprivate_t enum. The private information is
- * limited to an uintptr_t value. The squeue has no knowledge of its content
- * and does not manage it in any way.
+ * Squeues can control their rate of packet arrival from the NIC
+ * or a specific Rx ring within a NIC. As part of capability negotiation
+ * between the IP and MAC layers, squeues are created for each TCP soft ring
+ * (or TCP Rx ring - to be implemented in future). As part of this
+ * negotiation, squeues get a cookie for the underlying soft ring or Rx
+ * ring, a function to turn off incoming packets and a function to call
+ * to poll for packets. This helps schedule the receive side packet
+ * processing so that queue backlog doesn't build up and packet processing
+ * doesn't keep getting disturbed by high priority interrupts. As part
+ * of this mode, as soon as a backlog starts building, the squeue turns off
+ * the interrupts and switches to poll mode. In poll mode, when the poll
+ * thread goes down to retrieve packets, it retrieves them in the form of
+ * a chain, which improves performance even more. As the squeue/softring
+ * system gets more packets, it gets more efficient by switching to
+ * polling more often and dealing with larger packet chains.
+ *
- * The typical use may be a breakdown of data structures per CPU (since
- * squeues are usually per CPU). See NCA for examples of use.
- * Currently 'p' may have one legal value SQPRIVATE_TCP.
- *
- * processorid_t squeue_binding(sqp)
- *
- * Returns the CPU binding for a given squeue.
- *
- * TUNABALES:
- *
- * squeue_intrdrain_ms: Maximum time in ms interrupts spend draining any
- * squeue. Note that this is approximation - squeues have no control on the
- * time it takes to process each request. This limit is only checked
- * between processing individual messages.
- * Default: 20 ms.
- *
- * squeue_writerdrain_ms: Maximum time in ms non-interrupts spend draining any
- * squeue. Note that this is approximation - squeues have no control on the
- * time it takes to process each request. This limit is only checked
- * between processing individual messages.
- * Default: 10 ms.
- * - * squeue_workerdrain_ms: Maximum time in ms worker thread spends draining any - * squeue. Note that this is approximation - squeues have no control on the - * time it takes to process each request. This limit is only checked - * between processing individual messages. - * Default: 10 ms. - * - * squeue_workerwait_ms: When worker thread is interrupted because workerdrain - * expired, how much time to wait before waking worker thread again. - * Default: 10 ms. */ #include <sys/types.h> @@ -169,208 +120,30 @@ #include <sys/callb.h> #include <sys/sdt.h> #include <sys/ddi.h> +#include <sys/sunddi.h> #include <inet/ipclassifier.h> #include <inet/udp_impl.h> -/* - * State flags. - * Note: The MDB IP module depends on the values of these flags. - */ -#define SQS_PROC 0x0001 /* being processed */ -#define SQS_WORKER 0x0002 /* worker thread */ -#define SQS_ENTER 0x0004 /* enter thread */ -#define SQS_FAST 0x0008 /* enter-fast thread */ -#define SQS_USER 0x0010 /* A non interrupt user */ -#define SQS_BOUND 0x0020 /* Worker thread is bound */ -#define SQS_PROFILE 0x0040 /* Enable profiling */ -#define SQS_REENTER 0x0080 /* Re entered thread */ -#define SQS_TMO_PROG 0x0100 /* Timeout is being set */ - #include <sys/squeue_impl.h> static void squeue_fire(void *); static void squeue_drain(squeue_t *, uint_t, hrtime_t); static void squeue_worker(squeue_t *sqp); - -#if SQUEUE_PROFILE -static kmutex_t squeue_kstat_lock; -static int squeue_kstat_update(kstat_t *, int); -#endif +static void squeue_polling_thread(squeue_t *sqp); kmem_cache_t *squeue_cache; #define SQUEUE_MSEC_TO_NSEC 1000000 -int squeue_intrdrain_ms = 20; -int squeue_writerdrain_ms = 10; -int squeue_workerdrain_ms = 10; -int squeue_workerwait_ms = 10; +int squeue_drain_ms = 20; +int squeue_workerwait_ms = 0; /* The values above converted to ticks or nano seconds */ -static int squeue_intrdrain_ns = 0; -static int squeue_writerdrain_ns = 0; -static int squeue_workerdrain_ns = 0; +static int squeue_drain_ns = 0; static int squeue_workerwait_tick = 0; -/* - * The minimum packet queued when worker thread doing the drain triggers - * polling (if squeue allows it). The choice of 3 is arbitrary. You - * definitely don't want it to be 1 since that will trigger polling - * on very low loads as well (ssh seems to do be one such example - * where packet flow was very low yet somehow 1 packet ended up getting - * queued and worker thread fires every 10ms and blanking also gets - * triggered. - */ -int squeue_worker_poll_min = 3; - -#if SQUEUE_PROFILE -/* - * Set to B_TRUE to enable profiling. 
- */ -static int squeue_profile = B_FALSE; -#define SQ_PROFILING(sqp) (squeue_profile && ((sqp)->sq_state & SQS_PROFILE)) - -#define SQSTAT(sqp, x) ((sqp)->sq_stats.x++) -#define SQDELTA(sqp, x, d) ((sqp)->sq_stats.x += (d)) - -struct squeue_kstat { - kstat_named_t sq_count; - kstat_named_t sq_max_qlen; - kstat_named_t sq_npackets_worker; - kstat_named_t sq_npackets_intr; - kstat_named_t sq_npackets_other; - kstat_named_t sq_nqueued_intr; - kstat_named_t sq_nqueued_other; - kstat_named_t sq_ndrains_worker; - kstat_named_t sq_ndrains_intr; - kstat_named_t sq_ndrains_other; - kstat_named_t sq_time_worker; - kstat_named_t sq_time_intr; - kstat_named_t sq_time_other; -} squeue_kstat = { - { "count", KSTAT_DATA_UINT64 }, - { "max_qlen", KSTAT_DATA_UINT64 }, - { "packets_worker", KSTAT_DATA_UINT64 }, - { "packets_intr", KSTAT_DATA_UINT64 }, - { "packets_other", KSTAT_DATA_UINT64 }, - { "queued_intr", KSTAT_DATA_UINT64 }, - { "queued_other", KSTAT_DATA_UINT64 }, - { "ndrains_worker", KSTAT_DATA_UINT64 }, - { "ndrains_intr", KSTAT_DATA_UINT64 }, - { "ndrains_other", KSTAT_DATA_UINT64 }, - { "time_worker", KSTAT_DATA_UINT64 }, - { "time_intr", KSTAT_DATA_UINT64 }, - { "time_other", KSTAT_DATA_UINT64 }, -}; -#endif - -#define SQUEUE_WORKER_WAKEUP(sqp) { \ - timeout_id_t tid = (sqp)->sq_tid; \ - \ - ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ - /* \ - * Queue isn't being processed, so take \ - * any post enqueue actions needed before leaving. \ - */ \ - if (tid != 0) { \ - /* \ - * Waiting for an enter() to process mblk(s). \ - */ \ - clock_t waited = lbolt - (sqp)->sq_awaken; \ - \ - if (TICK_TO_MSEC(waited) >= (sqp)->sq_wait) { \ - /* \ - * Times up and have a worker thread \ - * waiting for work, so schedule it. \ - */ \ - (sqp)->sq_tid = 0; \ - (sqp)->sq_awaken = lbolt; \ - cv_signal(&(sqp)->sq_async); \ - mutex_exit(&(sqp)->sq_lock); \ - (void) untimeout(tid); \ - return; \ - } \ - mutex_exit(&(sqp)->sq_lock); \ - return; \ - } else if ((sqp)->sq_state & SQS_TMO_PROG) { \ - mutex_exit(&(sqp)->sq_lock); \ - return; \ - } else if ((sqp)->sq_wait != 0) { \ - clock_t wait = (sqp)->sq_wait; \ - /* \ - * Wait up to sqp->sq_wait ms for an \ - * enter() to process this queue. We \ - * don't want to contend on timeout locks \ - * with sq_lock held for performance reasons, \ - * so drop the sq_lock before calling timeout \ - * but we need to check if timeout is required \ - * after re acquiring the sq_lock. Once \ - * the sq_lock is dropped, someone else could \ - * have processed the packet or the timeout could \ - * have already fired. \ - */ \ - (sqp)->sq_state |= SQS_TMO_PROG; \ - mutex_exit(&(sqp)->sq_lock); \ - tid = timeout(squeue_fire, (sqp), wait); \ - mutex_enter(&(sqp)->sq_lock); \ - /* Check again if we still need the timeout */ \ - if ((((sqp)->sq_state & (SQS_PROC|SQS_TMO_PROG)) == \ - SQS_TMO_PROG) && ((sqp)->sq_tid == 0) && \ - ((sqp)->sq_first != NULL)) { \ - (sqp)->sq_state &= ~SQS_TMO_PROG; \ - (sqp)->sq_awaken = lbolt; \ - (sqp)->sq_tid = tid; \ - mutex_exit(&(sqp)->sq_lock); \ - return; \ - } else { \ - if ((sqp)->sq_state & SQS_TMO_PROG) { \ - (sqp)->sq_state &= ~SQS_TMO_PROG; \ - mutex_exit(&(sqp)->sq_lock); \ - (void) untimeout(tid); \ - } else { \ - /* \ - * The timer fired before we could \ - * reacquire the sq_lock. squeue_fire \ - * removes the SQS_TMO_PROG flag \ - * and we don't need to do anything \ - * else. \ - */ \ - mutex_exit(&(sqp)->sq_lock); \ - } \ - } \ - } else { \ - /* \ - * Schedule the worker thread. 
\ - */ \ - (sqp)->sq_awaken = lbolt; \ - cv_signal(&(sqp)->sq_async); \ - mutex_exit(&(sqp)->sq_lock); \ - } \ - ASSERT(MUTEX_NOT_HELD(&(sqp)->sq_lock)); \ -} - -#define ENQUEUE_MP(sqp, mp, proc, arg) { \ - /* \ - * Enque our mblk. \ - */ \ - (mp)->b_queue = NULL; \ - ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ - ASSERT((mp)->b_prev == NULL && (mp)->b_next == NULL); \ - (mp)->b_queue = (queue_t *)(proc); \ - (mp)->b_prev = (mblk_t *)(arg); \ - \ - if ((sqp)->sq_last != NULL) \ - (sqp)->sq_last->b_next = (mp); \ - else \ - (sqp)->sq_first = (mp); \ - (sqp)->sq_last = (mp); \ - (sqp)->sq_count++; \ - ASSERT((sqp)->sq_count > 0); \ - DTRACE_PROBE2(squeue__enqueue, squeue_t *, sqp, \ - mblk_t *, mp); \ -} - +#define MAX_BYTES_TO_PICKUP 150000 #define ENQUEUE_CHAIN(sqp, mp, tail, cnt) { \ /* \ @@ -390,89 +163,120 @@ struct squeue_kstat { \ } -#define SQS_POLLING_ON(sqp, rx_ring) { \ - ASSERT(rx_ring != NULL); \ +#define SQS_POLLING_ON(sqp, sq_poll_capable, rx_ring) { \ ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ - rx_ring->rr_blank(rx_ring->rr_handle, \ - MIN((sqp->sq_avg_drain_time * sqp->sq_count), \ - rx_ring->rr_max_blank_time), \ - rx_ring->rr_max_pkt_cnt); \ - rx_ring->rr_poll_state |= ILL_POLLING; \ - rx_ring->rr_poll_time = lbolt; \ + if (sq_poll_capable) { \ + ASSERT(rx_ring != NULL); \ + ASSERT(sqp->sq_state & SQS_POLL_CAPAB); \ + if (!(sqp->sq_state & SQS_POLLING)) { \ + sqp->sq_state |= SQS_POLLING; \ + rx_ring->rr_intr_disable(rx_ring->rr_intr_handle); \ + } \ + } \ } +#define SQS_POLLING_OFF(sqp, sq_poll_capable, rx_ring) { \ + ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ + if (sq_poll_capable) { \ + ASSERT(rx_ring != NULL); \ + ASSERT(sqp->sq_state & SQS_POLL_CAPAB); \ + if (sqp->sq_state & SQS_POLLING) { \ + sqp->sq_state &= ~SQS_POLLING; \ + rx_ring->rr_intr_enable(rx_ring->rr_intr_handle); \ + } \ + } \ +} -#define SQS_POLLING_OFF(sqp, rx_ring) { \ - ASSERT(rx_ring != NULL); \ +#define SQS_POLL_RING(sqp, sq_poll_capable) { \ ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); \ - rx_ring->rr_blank(rx_ring->rr_handle, \ - rx_ring->rr_min_blank_time, \ - rx_ring->rr_min_pkt_cnt); \ + if (sq_poll_capable) { \ + ASSERT(sqp->sq_state & SQS_POLL_CAPAB); \ + if (!(sqp->sq_state & SQS_GET_PKTS)) { \ + sqp->sq_state |= SQS_GET_PKTS; \ + cv_signal(&sqp->sq_poll_cv); \ + } \ + } \ } +#ifdef DEBUG +#define SQUEUE_DBG_SET(sqp, mp, proc, connp, tag) { \ + (sqp)->sq_curmp = (mp); \ + (sqp)->sq_curproc = (proc); \ + (sqp)->sq_connp = (connp); \ + (mp)->b_tag = (sqp)->sq_tag = (tag); \ +} + +#define SQUEUE_DBG_CLEAR(sqp) { \ + (sqp)->sq_curmp = NULL; \ + (sqp)->sq_curproc = NULL; \ + (sqp)->sq_connp = NULL; \ +} +#else +#define SQUEUE_DBG_SET(sqp, mp, proc, connp, tag) +#define SQUEUE_DBG_CLEAR(sqp) +#endif + void squeue_init(void) { squeue_cache = kmem_cache_create("squeue_cache", sizeof (squeue_t), 64, NULL, NULL, NULL, NULL, NULL, 0); - squeue_intrdrain_ns = squeue_intrdrain_ms * SQUEUE_MSEC_TO_NSEC; - squeue_writerdrain_ns = squeue_writerdrain_ms * SQUEUE_MSEC_TO_NSEC; - squeue_workerdrain_ns = squeue_workerdrain_ms * SQUEUE_MSEC_TO_NSEC; + squeue_drain_ns = squeue_drain_ms * SQUEUE_MSEC_TO_NSEC; squeue_workerwait_tick = MSEC_TO_TICK_ROUNDUP(squeue_workerwait_ms); } /* ARGSUSED */ squeue_t * -squeue_create(char *name, processorid_t bind, clock_t wait, pri_t pri) +squeue_create(clock_t wait, pri_t pri) { squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP); bzero(sqp, sizeof (squeue_t)); - (void) strncpy(sqp->sq_name, name, SQ_NAMELEN + 1); - sqp->sq_name[SQ_NAMELEN] = '\0'; - - sqp->sq_bind = bind; + sqp->sq_bind = 
PBIND_NONE; + sqp->sq_priority = pri; sqp->sq_wait = MSEC_TO_TICK(wait); - sqp->sq_avg_drain_time = - drv_hztousec(NSEC_TO_TICK_ROUNDUP(squeue_intrdrain_ns)) / - NSEC_TO_TICK_ROUNDUP(squeue_intrdrain_ns); - -#if SQUEUE_PROFILE - if ((sqp->sq_kstat = kstat_create("ip", bind, name, - "net", KSTAT_TYPE_NAMED, - sizeof (squeue_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL)) != NULL) { - sqp->sq_kstat->ks_lock = &squeue_kstat_lock; - sqp->sq_kstat->ks_data = &squeue_kstat; - sqp->sq_kstat->ks_update = squeue_kstat_update; - sqp->sq_kstat->ks_private = sqp; - kstat_install(sqp->sq_kstat); - } -#endif - sqp->sq_worker = thread_create(NULL, 0, squeue_worker, sqp, 0, &p0, TS_RUN, pri); + sqp->sq_poll_thr = thread_create(NULL, 0, squeue_polling_thread, + sqp, 0, &p0, TS_RUN, pri); + + sqp->sq_enter = squeue_enter; + sqp->sq_drain = squeue_drain; + return (sqp); } -/* ARGSUSED */ +/* + * Bind squeue worker thread to the specified CPU, given by CPU id. + * If the CPU id value is -1, bind the worker thread to the value + * specified in sq_bind field. If a thread is already bound to a + * different CPU, unbind it from the old CPU and bind to the new one. + */ + void squeue_bind(squeue_t *sqp, processorid_t bind) { - ASSERT(bind == -1); - mutex_enter(&sqp->sq_lock); + ASSERT(sqp->sq_bind != PBIND_NONE || bind != PBIND_NONE); + ASSERT(MUTEX_HELD(&cpu_lock)); + if (sqp->sq_state & SQS_BOUND) { - mutex_exit(&sqp->sq_lock); - return; + if (sqp->sq_bind == bind) { + mutex_exit(&sqp->sq_lock); + return; + } + thread_affinity_clear(sqp->sq_worker); + } else { + sqp->sq_state |= SQS_BOUND; } - sqp->sq_state |= SQS_BOUND; - mutex_exit(&sqp->sq_lock); + if (bind != PBIND_NONE) + sqp->sq_bind = bind; thread_affinity_set(sqp->sq_worker, sqp->sq_bind); + mutex_exit(&sqp->sq_lock); } void @@ -485,9 +289,98 @@ squeue_unbind(squeue_t *sqp) } sqp->sq_state &= ~SQS_BOUND; + thread_affinity_clear(sqp->sq_worker); mutex_exit(&sqp->sq_lock); +} - thread_affinity_clear(sqp->sq_worker); +void +squeue_worker_wakeup(squeue_t *sqp) +{ + timeout_id_t tid = (sqp)->sq_tid; + + ASSERT(MUTEX_HELD(&(sqp)->sq_lock)); + + if (sqp->sq_wait == 0) { + ASSERT(tid == 0); + ASSERT(!(sqp->sq_state & SQS_TMO_PROG)); + sqp->sq_awaken = lbolt; + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + return; + } + + /* + * Queue isn't being processed, so take + * any post enqueue actions needed before leaving. + */ + if (tid != 0) { + /* + * Waiting for an enter() to process mblk(s). + */ + clock_t waited = lbolt - sqp->sq_awaken; + + if (TICK_TO_MSEC(waited) >= sqp->sq_wait) { + /* + * Times up and have a worker thread + * waiting for work, so schedule it. + */ + sqp->sq_tid = 0; + sqp->sq_awaken = lbolt; + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + (void) untimeout(tid); + return; + } + mutex_exit(&sqp->sq_lock); + return; + } else if (sqp->sq_state & SQS_TMO_PROG) { + mutex_exit(&sqp->sq_lock); + return; + } else { + clock_t wait = sqp->sq_wait; + /* + * Wait up to sqp->sq_wait ms for an + * enter() to process this queue. We + * don't want to contend on timeout locks + * with sq_lock held for performance reasons, + * so drop the sq_lock before calling timeout + * but we need to check if timeout is required + * after re acquiring the sq_lock. Once + * the sq_lock is dropped, someone else could + * have processed the packet or the timeout could + * have already fired. 
+ */ + sqp->sq_state |= SQS_TMO_PROG; + mutex_exit(&sqp->sq_lock); + tid = timeout(squeue_fire, sqp, wait); + mutex_enter(&sqp->sq_lock); + /* Check again if we still need the timeout */ + if (((sqp->sq_state & (SQS_PROC|SQS_TMO_PROG)) == + SQS_TMO_PROG) && (sqp->sq_tid == 0) && + (sqp->sq_first != NULL)) { + sqp->sq_state &= ~SQS_TMO_PROG; + sqp->sq_tid = tid; + mutex_exit(&sqp->sq_lock); + return; + } else { + if (sqp->sq_state & SQS_TMO_PROG) { + sqp->sq_state &= ~SQS_TMO_PROG; + mutex_exit(&sqp->sq_lock); + (void) untimeout(tid); + } else { + /* + * The timer fired before we could + * reacquire the sq_lock. squeue_fire + * removes the SQS_TMO_PROG flag + * and we don't need to do anything + * else. + */ + mutex_exit(&sqp->sq_lock); + } + } + } + + ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); } /* @@ -500,18 +393,20 @@ squeue_unbind(squeue_t *sqp) * * The proc and arg for each mblk is already stored in the mblk in * appropriate places. + * + * The process_flag specifies if we are allowed to process the mblk + * and drain in the entering thread context. If process_flag is + * SQ_FILL, then we just queue the mblk and return (after signaling + * the worker thread if no one else is processing the squeue). */ +/* ARGSUSED */ void -squeue_enter_chain(squeue_t *sqp, mblk_t *mp, mblk_t *tail, - uint32_t cnt, uint8_t tag) +squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, + int process_flag, uint8_t tag) { - int interrupt = servicing_interrupt(); - void *arg; + conn_t *connp; sqproc_t proc; hrtime_t now; -#if SQUEUE_PROFILE - hrtime_t start, delta; -#endif ASSERT(sqp != NULL); ASSERT(mp != NULL); @@ -520,355 +415,111 @@ squeue_enter_chain(squeue_t *sqp, mblk_t *mp, mblk_t *tail, ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); mutex_enter(&sqp->sq_lock); - if (!(sqp->sq_state & SQS_PROC)) { + + /* + * Try to process the packet if SQ_FILL flag is not set and + * we are allowed to process the squeue. The SQ_NODRAIN is + * ignored if the packet chain consists of more than 1 packet. + */ + if (!(sqp->sq_state & SQS_PROC) && ((process_flag == SQ_PROCESS) || + (process_flag == SQ_NODRAIN && sqp->sq_first == NULL))) { /* * See if anything is already queued. If we are the * first packet, do inline processing else queue the * packet and do the drain. */ - sqp->sq_run = curthread; if (sqp->sq_first == NULL && cnt == 1) { /* * Fast-path, ok to process and nothing queued. */ sqp->sq_state |= (SQS_PROC|SQS_FAST); + sqp->sq_run = curthread; mutex_exit(&sqp->sq_lock); /* * We are the chain of 1 packet so * go through this fast path. 
*/ - arg = mp->b_prev; + ASSERT(mp->b_prev != NULL); + ASSERT(mp->b_queue != NULL); + connp = (conn_t *)mp->b_prev; mp->b_prev = NULL; proc = (sqproc_t)mp->b_queue; mp->b_queue = NULL; - - ASSERT(proc != NULL); - ASSERT(arg != NULL); + ASSERT(proc != NULL && connp != NULL); ASSERT(mp->b_next == NULL); -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; - sqp->sq_curmp = mp; - sqp->sq_curproc = proc; - sqp->sq_connp = arg; - mp->b_tag = sqp->sq_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_npackets_intr); - else - SQSTAT(sqp, sq_npackets_other); - start = gethrtime(); - } -#endif - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_curmp = NULL; - sqp->sq_curproc = NULL; - sqp->sq_connp = NULL; - sqp->sq_isintr = 0; -#endif - - CONN_DEC_REF((conn_t *)arg); - ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~(SQS_PROC|SQS_FAST); - if (sqp->sq_first == NULL) { - /* - * We processed inline our packet and - * nothing new has arrived. We are done. - */ - sqp->sq_run = NULL; - mutex_exit(&sqp->sq_lock); - return; - } else if (sqp->sq_bind != CPU->cpu_id) { - /* - * If the current thread is not running - * on the CPU to which this squeue is bound, - * then don't allow it to drain. - */ - sqp->sq_run = NULL; - SQUEUE_WORKER_WAKEUP(sqp); - return; - } - } else { - ENQUEUE_CHAIN(sqp, mp, tail, cnt); -#if SQUEUE_DEBUG - mp->b_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = - sqp->sq_count; - } -#endif - } - - /* - * We are here because either we couldn't do inline - * processing (because something was already queued), - * or we had a chanin of more than one packet, - * or something else arrived after we were done with - * inline processing. - */ - ASSERT(MUTEX_HELD(&sqp->sq_lock)); - ASSERT(sqp->sq_first != NULL); - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - start = gethrtime(); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; -#endif - - now = gethrtime(); - if (interrupt) { - squeue_drain(sqp, SQS_ENTER, now + - squeue_intrdrain_ns); - } else { - squeue_drain(sqp, SQS_USER, now + - squeue_writerdrain_ns); - } - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_isintr = 0; -#endif - - /* - * If we didn't do a complete drain, the worker - * thread was already signalled by squeue_drain. - */ - sqp->sq_run = NULL; - mutex_exit(&sqp->sq_lock); - return; - } else { - ASSERT(sqp->sq_run != NULL); - /* - * Queue is already being processed. Just enqueue - * the packet and go away. 
- */ -#if SQUEUE_DEBUG - mp->b_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = sqp->sq_count; - } -#endif - - ENQUEUE_CHAIN(sqp, mp, tail, cnt); - mutex_exit(&sqp->sq_lock); - return; - } -} - -/* - * squeue_enter() - enter squeue *sqp with mblk *mp with argument of *arg. - */ -void -squeue_enter(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, - uint8_t tag) -{ - int interrupt = servicing_interrupt(); - hrtime_t now; -#if SQUEUE_PROFILE - hrtime_t start, delta; -#endif -#if SQUEUE_DEBUG - conn_t *connp = (conn_t *)arg; - ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp); - ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp); -#endif - - ASSERT(proc != NULL); - ASSERT(sqp != NULL); - ASSERT(mp != NULL); - ASSERT(mp->b_next == NULL); - ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); - - mutex_enter(&sqp->sq_lock); - if (!(sqp->sq_state & SQS_PROC)) { - /* - * See if anything is already queued. If we are the - * first packet, do inline processing else queue the - * packet and do the drain. - */ - sqp->sq_run = curthread; - if (sqp->sq_first == NULL) { /* - * Fast-path, ok to process and nothing queued. + * Handle squeue switching. More details in the + * block comment at the top of the file */ - sqp->sq_state |= (SQS_PROC|SQS_FAST); - mutex_exit(&sqp->sq_lock); - -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; - sqp->sq_curmp = mp; - sqp->sq_curproc = proc; - sqp->sq_connp = connp; - mp->b_tag = sqp->sq_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_npackets_intr); - else - SQSTAT(sqp, sq_npackets_other); - start = gethrtime(); + if (connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + connp->conn_on_sqp = B_TRUE; + DTRACE_PROBE3(squeue__proc__start, squeue_t *, + sqp, mblk_t *, mp, conn_t *, connp); + (*proc)(connp, mp, sqp); + DTRACE_PROBE2(squeue__proc__end, squeue_t *, + sqp, conn_t *, connp); + connp->conn_on_sqp = B_FALSE; + SQUEUE_DBG_CLEAR(sqp); + CONN_DEC_REF(connp); + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, + connp, SQ_FILL, SQTAG_SQUEUE_CHANGE); } -#endif - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_curmp = NULL; - sqp->sq_curproc = NULL; - sqp->sq_connp = NULL; - sqp->sq_isintr = 0; -#endif - - CONN_DEC_REF((conn_t *)arg); ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); mutex_enter(&sqp->sq_lock); sqp->sq_state &= ~(SQS_PROC|SQS_FAST); - if (sqp->sq_first == NULL) { + sqp->sq_run = NULL; + if (sqp->sq_first == NULL || + process_flag == SQ_NODRAIN) { + if (sqp->sq_first != NULL) { + squeue_worker_wakeup(sqp); + return; + } /* - * We processed inline our packet and - * nothing new has arrived. We are done. + * We processed inline our packet and nothing + * new has arrived. We are done. In case any + * control actions are pending, wake up the + * worker. 
*/ - sqp->sq_run = NULL; + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) + cv_signal(&sqp->sq_worker_cv); mutex_exit(&sqp->sq_lock); return; - } else if (sqp->sq_bind != CPU->cpu_id) { - /* - * If the current thread is not running - * on the CPU to which this squeue is bound, - * then don't allow it to drain. - */ - sqp->sq_run = NULL; - SQUEUE_WORKER_WAKEUP(sqp); - return; } } else { - ENQUEUE_MP(sqp, mp, proc, arg); -#if SQUEUE_DEBUG + ENQUEUE_CHAIN(sqp, mp, tail, cnt); +#ifdef DEBUG mp->b_tag = tag; #endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = - sqp->sq_count; - } -#endif } - /* * We are here because either we couldn't do inline - * processing (because something was already queued) + * processing (because something was already queued), + * or we had a chain of more than one packet, * or something else arrived after we were done with * inline processing. */ ASSERT(MUTEX_HELD(&sqp->sq_lock)); ASSERT(sqp->sq_first != NULL); - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - start = gethrtime(); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; -#endif - now = gethrtime(); - if (interrupt) { - squeue_drain(sqp, SQS_ENTER, now + - squeue_intrdrain_ns); - } else { - squeue_drain(sqp, SQS_USER, now + - squeue_writerdrain_ns); - } - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif -#if SQUEUE_DEBUG - sqp->sq_isintr = 0; -#endif + sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns); /* * If we didn't do a complete drain, the worker * thread was already signalled by squeue_drain. + * In case any control actions are pending, wake + * up the worker. */ sqp->sq_run = NULL; + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) + cv_signal(&sqp->sq_worker_cv); mutex_exit(&sqp->sq_lock); return; } else { - ASSERT(sqp->sq_run != NULL); /* * We let a thread processing a squeue reenter only * once. This helps the case of incoming connection @@ -878,168 +529,42 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, * loopback connection where the two ends are bound * to the same squeue (which is typical on single * CPU machines). + * * We let the thread reenter only once for the fear * of stack getting blown with multiple traversal. */ + connp = (conn_t *)mp->b_prev; if (!(sqp->sq_state & SQS_REENTER) && - (sqp->sq_run == curthread) && sqp->sq_first == NULL && - (((conn_t *)arg)->conn_on_sqp == B_FALSE)) { + (process_flag != SQ_FILL) && (sqp->sq_first == NULL) && + (sqp->sq_run == curthread) && (cnt == 1) && + (connp->conn_on_sqp == B_FALSE)) { sqp->sq_state |= SQS_REENTER; mutex_exit(&sqp->sq_lock); - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - CONN_DEC_REF((conn_t *)arg); - - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~SQS_REENTER; - mutex_exit(&sqp->sq_lock); - return; - } - /* - * Queue is already being processed. Just enqueue - * the packet and go away. 
- */ -#if SQUEUE_DEBUG - mp->b_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = sqp->sq_count; - } -#endif - - ENQUEUE_MP(sqp, mp, proc, arg); - mutex_exit(&sqp->sq_lock); - return; - } -} - -void -squeue_enter_nodrain(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, - uint8_t tag) -{ - int interrupt = servicing_interrupt(); - boolean_t being_processed; -#if SQUEUE_DEBUG - conn_t *connp = (conn_t *)arg; -#endif -#if SQUEUE_PROFILE - hrtime_t start, delta; -#endif + ASSERT(mp->b_prev != NULL); + ASSERT(mp->b_queue != NULL); - ASSERT(proc != NULL); - ASSERT(sqp != NULL); - ASSERT(mp != NULL); - ASSERT(mp->b_next == NULL); - ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp); - ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp); - ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); - - mutex_enter(&sqp->sq_lock); - - being_processed = (sqp->sq_state & SQS_PROC); - if (!being_processed && (sqp->sq_first == NULL)) { - /* - * Fast-path, ok to process and nothing queued. - */ - sqp->sq_state |= (SQS_PROC|SQS_FAST); - sqp->sq_run = curthread; - mutex_exit(&sqp->sq_lock); - -#if SQUEUE_DEBUG - sqp->sq_isintr = interrupt; - sqp->sq_curmp = mp; - sqp->sq_curproc = proc; - sqp->sq_connp = connp; - mp->b_tag = sqp->sq_tag = tag; -#endif - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_npackets_intr); - else - SQSTAT(sqp, sq_npackets_other); - start = gethrtime(); - } -#endif - - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - -#if SQUEUE_DEBUG - sqp->sq_curmp = NULL; - sqp->sq_curproc = NULL; - sqp->sq_connp = NULL; - sqp->sq_isintr = 0; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - delta = gethrtime() - start; - if (interrupt) - SQDELTA(sqp, sq_time_intr, delta); - else - SQDELTA(sqp, sq_time_other, delta); - } -#endif + mp->b_prev = NULL; + proc = (sqproc_t)mp->b_queue; + mp->b_queue = NULL; - CONN_DEC_REF((conn_t *)arg); - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~(SQS_PROC|SQS_FAST); - sqp->sq_run = NULL; - if (sqp->sq_first == NULL) { /* - * We processed inline our packet and - * nothing new has arrived. We are done. + * Handle squeue switching. More details in the + * block comment at the top of the file */ - mutex_exit(&sqp->sq_lock); - } else { - SQUEUE_WORKER_WAKEUP(sqp); - } - return; - } else { - /* - * We let a thread processing a squeue reenter only - * once. This helps the case of incoming connection - * where a SYN-ACK-ACK that triggers the conn_ind - * doesn't have to queue the packet if listener and - * eager are on the same squeue. Also helps the - * loopback connection where the two ends are bound - * to the same squeue (which is typical on single - * CPU machines). - * We let the thread reenter only once for the fear - * of stack getting blown with multiple traversal. 
- */ - if (being_processed && !(sqp->sq_state & SQS_REENTER) && - (sqp->sq_run == curthread) && sqp->sq_first == NULL && - (((conn_t *)arg)->conn_on_sqp == B_FALSE)) { - sqp->sq_state |= SQS_REENTER; - mutex_exit(&sqp->sq_lock); - - ((conn_t *)arg)->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, arg); - (*proc)(arg, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, arg); - ((conn_t *)arg)->conn_on_sqp = B_FALSE; - CONN_DEC_REF((conn_t *)arg); + if (connp->conn_sqp == sqp) { + connp->conn_on_sqp = B_TRUE; + DTRACE_PROBE3(squeue__proc__start, squeue_t *, + sqp, mblk_t *, mp, conn_t *, connp); + (*proc)(connp, mp, sqp); + DTRACE_PROBE2(squeue__proc__end, squeue_t *, + sqp, conn_t *, connp); + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, + connp, SQ_FILL, SQTAG_SQUEUE_CHANGE); + } mutex_enter(&sqp->sq_lock); sqp->sq_state &= ~SQS_REENTER; @@ -1047,80 +572,32 @@ squeue_enter_nodrain(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, return; } -#if SQUEUE_DEBUG + /* + * Queue is already being processed or there is already + * one or more paquets on the queue. Enqueue the + * packet and wakeup the squeue worker thread if the + * squeue is not being processed. + */ +#ifdef DEBUG mp->b_tag = tag; #endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = sqp->sq_count; - } -#endif - ENQUEUE_MP(sqp, mp, proc, arg); - if (being_processed) { - /* - * Queue is already being processed. - * No need to do anything. - */ - mutex_exit(&sqp->sq_lock); + + ENQUEUE_CHAIN(sqp, mp, tail, cnt); + if (!(sqp->sq_state & SQS_PROC)) { + squeue_worker_wakeup(sqp); return; } - SQUEUE_WORKER_WAKEUP(sqp); - } -} - -/* - * squeue_fill() - fill squeue *sqp with mblk *mp with argument of *arg - * without processing the squeue. - */ -/* ARGSUSED */ -void -squeue_fill(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void * arg, - uint8_t tag) -{ -#if SQUEUE_DEBUG - conn_t *connp = (conn_t *)arg; -#endif - ASSERT(proc != NULL); - ASSERT(sqp != NULL); - ASSERT(mp != NULL); - ASSERT(mp->b_next == NULL); - ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp); - ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp); - - ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); - mutex_enter(&sqp->sq_lock); - ENQUEUE_MP(sqp, mp, proc, arg); -#if SQUEUE_DEBUG - mp->b_tag = tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (servicing_interrupt()) - SQSTAT(sqp, sq_nqueued_intr); - else - SQSTAT(sqp, sq_nqueued_other); - if (sqp->sq_stats.sq_max_qlen < sqp->sq_count) - sqp->sq_stats.sq_max_qlen = sqp->sq_count; - } -#endif - - /* - * If queue is already being processed. No need to do anything. - */ - if (sqp->sq_state & SQS_PROC) { + /* + * In case any control actions are pending, wake + * up the worker. 
+ */ + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) + cv_signal(&sqp->sq_worker_cv); mutex_exit(&sqp->sq_lock); return; } - - SQUEUE_WORKER_WAKEUP(sqp); } - /* * PRIVATE FUNCTIONS */ @@ -1151,7 +628,7 @@ squeue_fire(void *arg) if (!(state & SQS_PROC)) { sqp->sq_awaken = lbolt; - cv_signal(&sqp->sq_async); + cv_signal(&sqp->sq_worker_cv); } mutex_exit(&sqp->sq_lock); } @@ -1159,64 +636,52 @@ squeue_fire(void *arg) static void squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire) { - mblk_t *mp; - mblk_t *head; - sqproc_t proc; - conn_t *connp; - clock_t start = lbolt; - clock_t drain_time; - timeout_id_t tid; - uint_t cnt; - uint_t total_cnt = 0; + mblk_t *mp; + mblk_t *head; + sqproc_t proc; + conn_t *connp; + timeout_id_t tid; ill_rx_ring_t *sq_rx_ring = sqp->sq_rx_ring; - int interrupt = servicing_interrupt(); - boolean_t poll_on = B_FALSE; - hrtime_t now; + hrtime_t now; + boolean_t did_wakeup = B_FALSE; + boolean_t sq_poll_capable; + sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0; +again: ASSERT(mutex_owned(&sqp->sq_lock)); - ASSERT(!(sqp->sq_state & SQS_PROC)); - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_ndrains_intr); - else if (!(proc_type & SQS_WORKER)) - SQSTAT(sqp, sq_ndrains_other); - else - SQSTAT(sqp, sq_ndrains_worker); - } -#endif + ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | + SQS_POLL_QUIESCE_DONE))); + + head = sqp->sq_first; + sqp->sq_first = NULL; + sqp->sq_last = NULL; + sqp->sq_count = 0; if ((tid = sqp->sq_tid) != 0) sqp->sq_tid = 0; sqp->sq_state |= SQS_PROC | proc_type; - head = sqp->sq_first; - sqp->sq_first = NULL; - sqp->sq_last = NULL; - cnt = sqp->sq_count; + /* * We have backlog built up. Switch to polling mode if the - * device underneath allows it. Need to do it only for - * drain by non-interrupt thread so interrupts don't - * come and disrupt us in between. If its a interrupt thread, - * no need because most devices will not issue another - * interrupt till this one returns. + * device underneath allows it. Need to do it so that + * more packets don't come in and disturb us (by contending + * for sq_lock or higher priority thread preempting us). + * + * The worker thread is allowed to do active polling while we + * just disable the interrupts for drain by non worker (kernel + * or userland) threads so they can peacefully process the + * packets during time allocated to them. */ - if ((sqp->sq_state & SQS_POLL_CAPAB) && !(proc_type & SQS_ENTER) && - (sqp->sq_count > squeue_worker_poll_min)) { - ASSERT(sq_rx_ring != NULL); - SQS_POLLING_ON(sqp, sq_rx_ring); - poll_on = B_TRUE; - } - + SQS_POLLING_ON(sqp, sq_poll_capable, sq_rx_ring); mutex_exit(&sqp->sq_lock); if (tid != 0) (void) untimeout(tid); -again: + while ((mp = head) != NULL) { + head = mp->b_next; mp->b_next = NULL; @@ -1224,255 +689,548 @@ again: mp->b_queue = NULL; connp = (conn_t *)mp->b_prev; mp->b_prev = NULL; -#if SQUEUE_DEBUG - sqp->sq_curmp = mp; - sqp->sq_curproc = proc; - sqp->sq_connp = connp; - sqp->sq_tag = mp->b_tag; -#endif -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - if (interrupt) - SQSTAT(sqp, sq_npackets_intr); - else if (!(proc_type & SQS_WORKER)) - SQSTAT(sqp, sq_npackets_other); - else - SQSTAT(sqp, sq_npackets_worker); + /* + * Handle squeue switching. 
More details in the + * block comment at the top of the file + */ + if (connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + mp->b_tag); + connp->conn_on_sqp = B_TRUE; + DTRACE_PROBE3(squeue__proc__start, squeue_t *, + sqp, mblk_t *, mp, conn_t *, connp); + (*proc)(connp, mp, sqp); + DTRACE_PROBE2(squeue__proc__end, squeue_t *, + sqp, conn_t *, connp); + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, + SQ_FILL, SQTAG_SQUEUE_CHANGE); } -#endif - - connp->conn_on_sqp = B_TRUE; - DTRACE_PROBE3(squeue__proc__start, squeue_t *, - sqp, mblk_t *, mp, conn_t *, connp); - (*proc)(connp, mp, sqp); - DTRACE_PROBE2(squeue__proc__end, squeue_t *, - sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); } - -#if SQUEUE_DEBUG - sqp->sq_curmp = NULL; - sqp->sq_curproc = NULL; - sqp->sq_connp = NULL; -#endif + SQUEUE_DBG_CLEAR(sqp); mutex_enter(&sqp->sq_lock); - sqp->sq_count -= cnt; - total_cnt += cnt; + /* + * Check if there is still work to do (either more arrived or timer + * expired). If we are the worker thread and we are polling capable, + * continue doing the work since no one else is around to do the + * work anyway (but signal the poll thread to retrieve some packets + * in the meanwhile). If we are not the worker thread, just + * signal the worker thread to take up the work if processing time + * has expired. + */ if (sqp->sq_first != NULL) { - - now = gethrtime(); - if (!expire || (now < expire)) { - /* More arrived and time not expired */ - head = sqp->sq_first; - sqp->sq_first = NULL; - sqp->sq_last = NULL; - cnt = sqp->sq_count; - mutex_exit(&sqp->sq_lock); - goto again; - } - /* - * If we are not worker thread and we - * reached our time limit to do drain, - * signal the worker thread to pick - * up the work. - * If we were the worker thread, then - * we take a break to allow an interrupt - * or writer to pick up the load. + * Still more to process. If time quanta not expired, we + * should let the drain go on. The worker thread is allowed + * to drain as long as there is anything left. */ - if (proc_type != SQS_WORKER) { + now = gethrtime(); + if ((now < expire) || (proc_type == SQS_WORKER)) { + /* + * If time not expired or we are worker thread and + * this squeue is polling capable, continue to do + * the drain. + * + * We turn off interrupts for all userland threads + * doing drain but we do active polling only for + * worker thread. + */ + if (proc_type == SQS_WORKER) + SQS_POLL_RING(sqp, sq_poll_capable); + goto again; + } else { + did_wakeup = B_TRUE; sqp->sq_awaken = lbolt; - cv_signal(&sqp->sq_async); + cv_signal(&sqp->sq_worker_cv); } } /* - * Try to see if we can get a time estimate to process a packet. - * Do it only in interrupt context since less chance of context - * switch or pinning etc. to get a better estimate. + * If the poll thread is already running, just return. The + * poll thread continues to hold the proc and will finish + * processing. 
	 */
-	if (interrupt && ((drain_time = (lbolt - start)) > 0))
-		sqp->sq_avg_drain_time = ((80 * sqp->sq_avg_drain_time) +
-		    (20 * (drv_hztousec(drain_time)/total_cnt)))/100;
-
-	sqp->sq_state &= ~(SQS_PROC | proc_type);
+	if (sqp->sq_state & SQS_GET_PKTS) {
+		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+		    SQS_POLL_QUIESCE_DONE)));
+		sqp->sq_state &= ~proc_type;
+		return;
+	}

	/*
-	 * If polling was turned on, turn it off and reduce the default
-	 * interrupt blank interval as well to bring new packets in faster
-	 * (reduces the latency when there is no backlog).
+	 *
+	 * If we are the worker thread and no work is left, send the poll
+	 * thread down once more to see if something arrived. Otherwise,
+	 * turn the interrupts back on and we are done.
	 */
-	if (poll_on && (sqp->sq_state & SQS_POLL_CAPAB)) {
-		ASSERT(sq_rx_ring != NULL);
-		SQS_POLLING_OFF(sqp, sq_rx_ring);
+	if ((proc_type == SQS_WORKER) &&
+	    (sqp->sq_state & SQS_POLL_CAPAB)) {
+		/*
+		 * Do one last check to see if anything arrived
+		 * in the NIC. We leave the SQS_PROC set to ensure
+		 * that the poll thread keeps the PROC and can decide
+		 * if it needs to turn polling off or continue
+		 * processing.
+		 *
+		 * If we drop the SQS_PROC here and the poll thread comes
+		 * up empty-handed, it cannot safely turn polling off
+		 * since someone else could have acquired the PROC
+		 * and started draining. The previously running poll
+		 * thread and the current thread doing drain would end
+		 * up in a race for turning polling on/off and more
+		 * complex code would be required to deal with it.
+		 *
+		 * It is a lot simpler for drain to hand the SQS_PROC to
+		 * the poll thread (if running) and let the poll thread
+		 * finish without worrying about racing with any other
+		 * thread.
+		 */
+		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+		    SQS_POLL_QUIESCE_DONE)));
+		SQS_POLL_RING(sqp, sq_poll_capable);
+		sqp->sq_state &= ~proc_type;
+	} else {
+		/*
+		 * The squeue is either not capable of polling or the
+		 * poll thread already finished processing and didn't
+		 * find anything. Since there is nothing queued and
+		 * we already turned polling on (for all threads doing
+		 * the drain), we should turn polling off and relinquish
+		 * the PROC.
+		 */
+		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+		    SQS_POLL_QUIESCE_DONE)));
+		SQS_POLLING_OFF(sqp, sq_poll_capable, sq_rx_ring);
+		sqp->sq_state &= ~(SQS_PROC | proc_type);
+		if (!did_wakeup && sqp->sq_first != NULL) {
+			squeue_worker_wakeup(sqp);
+			mutex_enter(&sqp->sq_lock);
+		}
+		/*
+		 * If we are not the worker and there is a pending quiesce
+		 * event, wake up the worker.
+		 */
+		if ((proc_type != SQS_WORKER) &&
+		    (sqp->sq_state & SQS_WORKER_THR_CONTROL))
+			cv_signal(&sqp->sq_worker_cv);
+	}
 }
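[Editor's illustration, not part of this patch: a toy model of the interrupt-vs-poll switch that squeue_drain and the poll thread drive through SQS_POLLING_ON/SQS_POLLING_OFF. ring_t and its callbacks are hypothetical stand-ins for the ill_rx_ring_t fields rr_intr_disable, rr_intr_enable and rr_rx; the 150000 byte budget mirrors MAX_BYTES_TO_PICKUP above.]

	#include <stddef.h>

	typedef struct ring {
		void	(*intr_disable)(void *);	/* models rr_intr_disable */
		void	(*intr_enable)(void *);		/* models rr_intr_enable */
		void	*(*rx)(void *, size_t);		/* models rr_rx: poll a chain */
		void	*handle;
		int	polling;			/* models SQS_POLLING */
	} ring_t;

	/* Backlog built up: turn interrupts off so the drain runs undisturbed. */
	static void
	poll_on(ring_t *r)
	{
		if (!r->polling) {
			r->polling = 1;
			r->intr_disable(r->handle);
		}
	}

	/* Queue and ring both empty: fall back to interrupt-driven delivery. */
	static void
	poll_off(ring_t *r)
	{
		if (r->polling) {
			r->polling = 0;
			r->intr_enable(r->handle);
		}
	}

	/*
	 * One worker pass: while polling, keep pulling chains from the ring
	 * and draining them; when the ring comes up empty, re-enable
	 * interrupts and return to interrupt mode.
	 */
	static void
	worker_pass(ring_t *r, void (*drain)(void *chain))
	{
		void *chain;

		poll_on(r);
		while ((chain = r->rx(r->handle, 150000)) != NULL)
			drain(chain);
		poll_off(r);
	}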

+/*
+ * Quiesce, Restart, or Cleanup of the squeue poll thread.
+ *
+ * Quiesce and Restart: After an squeue poll thread has been quiesced, it does
+ * not attempt to poll the underlying soft ring any more. The quiesce is
+ * triggered by the mac layer when it wants to quiesce a soft ring. Typically
+ * control operations such as changing the fanout of a NIC or VNIC (dladm
+ * setlinkprop) need to quiesce data flow before changing the wiring.
+ * The operation is done by the mac layer, but it calls back into IP to
+ * quiesce the soft ring. After completing the operation (say an increase or
+ * decrease of the fanout) the mac layer then calls back into IP to restart
+ * the quiesced soft ring.
+ *
+ * Cleanup: This is triggered when the squeue binding to a soft ring is
+ * removed permanently. Typically interface plumb and unplumb would trigger
+ * this. It can also be triggered from the mac layer when a soft ring is
+ * being deleted, say as the result of a fanout reduction. Since squeues are
+ * never deleted, the cleanup marks the squeue as fit for recycling and
+ * moves it to the zeroth squeue set.
+ */
 static void
-squeue_worker(squeue_t *sqp)
+squeue_poll_thr_control(squeue_t *sqp)
+{
+	if (sqp->sq_state & SQS_POLL_THR_RESTART) {
+		/* Restart implies a previous quiesce */
+		ASSERT(sqp->sq_state & SQS_POLL_THR_QUIESCED);
+		sqp->sq_state &= ~(SQS_POLL_THR_QUIESCED |
+		    SQS_POLL_THR_RESTART);
+		sqp->sq_state |= SQS_POLL_CAPAB;
+		cv_signal(&sqp->sq_worker_cv);
+		return;
+	}
+
+	if (sqp->sq_state & SQS_POLL_THR_QUIESCE) {
+		sqp->sq_state |= SQS_POLL_THR_QUIESCED;
+		sqp->sq_state &= ~SQS_POLL_THR_QUIESCE;
+		cv_signal(&sqp->sq_worker_cv);
+		return;
+	}
+}
+
+/*
+ * POLLING Notes
+ *
+ * With polling mode, we want to do as much processing as we possibly can
+ * in worker thread context. The sweet spot is the worker thread keeps doing
+ * work all the time in polling mode and writers etc. keep dumping packets
+ * to the worker thread. Occasionally, we send the poll thread (running at
+ * lower priority) down to the NIC to get a chain of packets to feed to the
+ * worker. Sending the poll thread down to the NIC is dependent on 3 criteria:
+ *
+ * 1) It is always driven from squeue_drain and only if the worker thread is
+ *	doing the drain.
+ * 2) We cleared the backlog once and more packets arrived in between.
+ *	Before starting the drain again, send the poll thread down if
+ *	the drain is being done by the worker thread.
+ * 3) Before exiting squeue_drain, if the poll thread is not already
+ *	working and we are the worker thread, try to poll one more time.
+ *
+ * For latency's sake, we do allow any thread calling squeue_enter
+ * to process its packet provided:
+ *
+ * 1) Nothing is queued
+ * 2) If more packets arrived in between, the non-worker threads are allowed
+ *	to do the drain till their time quanta expire, provided SQS_GET_PKTS
+ *	wasn't set in between.
+ *
+ * Avoiding deadlocks with interrupts
+ * ==================================
+ *
+ * One of the big problems is that we can't send poll_thr down while holding
+ * the sq_lock since the thread can block. So we drop the sq_lock before
+ * calling sq_get_pkts(). We keep holding the SQS_PROC as long as the
+ * poll thread is running so that no other thread can acquire the
+ * perimeter in between. If the squeue_drain gets done (no more work
+ * left), it leaves the SQS_PROC set if the poll thread is running.
+ */
+
+/*
+ * This is the squeue poll thread. In poll mode, it polls the underlying
+ * TCP softring and feeds packets into the squeue. The worker thread then
+ * drains the squeue. The poll thread also responds to control signals for
+ * quiescing, restarting, or cleanup of an squeue. These are driven by
+ * control operations like plumb/unplumb or as a result of dynamic Rx ring
+ * related operations that are driven from the mac layer.
+ */ +static void +squeue_polling_thread(squeue_t *sqp) { kmutex_t *lock = &sqp->sq_lock; - kcondvar_t *async = &sqp->sq_async; + kcondvar_t *async = &sqp->sq_poll_cv; + ip_mac_rx_t sq_get_pkts; + ip_accept_t ip_accept; + ill_rx_ring_t *sq_rx_ring; + ill_t *sq_ill; + mblk_t *head, *tail, *mp; + uint_t cnt; + void *sq_mac_handle; callb_cpr_t cprinfo; - hrtime_t now; -#if SQUEUE_PROFILE - hrtime_t start; -#endif + size_t bytes_to_pickup; + uint32_t ctl_state; - CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "nca"); + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_poll"); mutex_enter(lock); for (;;) { - while (sqp->sq_first == NULL || (sqp->sq_state & SQS_PROC)) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); -still_wait: - cv_wait(async, lock); - if (sqp->sq_state & SQS_PROC) { - goto still_wait; - } - CALLB_CPR_SAFE_END(&cprinfo, lock); + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(async, lock); + CALLB_CPR_SAFE_END(&cprinfo, lock); + + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | + SQS_POLL_THR_QUIESCED); + if (ctl_state != 0) { + /* + * If the squeue is quiesced, then wait for a control + * request. A quiesced squeue must not poll the + * underlying soft ring. + */ + if (ctl_state == SQS_POLL_THR_QUIESCED) + continue; + /* + * Act on control requests to quiesce, cleanup or + * restart an squeue + */ + squeue_poll_thr_control(sqp); + continue; } -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - start = gethrtime(); + if (!(sqp->sq_state & SQS_POLL_CAPAB)) + continue; + + ASSERT((sqp->sq_state & + (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == + (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + +poll_again: + sq_rx_ring = sqp->sq_rx_ring; + sq_get_pkts = sq_rx_ring->rr_rx; + sq_mac_handle = sq_rx_ring->rr_rx_handle; + ip_accept = sq_rx_ring->rr_ip_accept; + sq_ill = sq_rx_ring->rr_ill; + bytes_to_pickup = MAX_BYTES_TO_PICKUP; + mutex_exit(lock); + head = sq_get_pkts(sq_mac_handle, bytes_to_pickup); + mp = NULL; + if (head != NULL) { + /* + * We got the packet chain from the mac layer. It + * would be nice to be able to process it inline + * for better performance but we need to give + * IP a chance to look at this chain to ensure + * that packets are really meant for this squeue + * and do the IP processing. + */ + mp = ip_accept(sq_ill, sq_rx_ring, sqp, head, + &tail, &cnt); } -#endif + mutex_enter(lock); + if (mp != NULL) + ENQUEUE_CHAIN(sqp, mp, tail, cnt); - ASSERT(squeue_workerdrain_ns != 0); - now = gethrtime(); - sqp->sq_run = curthread; - squeue_drain(sqp, SQS_WORKER, now + squeue_workerdrain_ns); - sqp->sq_run = NULL; + ASSERT((sqp->sq_state & + (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == + (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); - if (sqp->sq_first != NULL) { + if (sqp->sq_first != NULL && !(sqp->sq_state & SQS_WORKER)) { /* - * Doing too much processing by worker thread - * in presense of interrupts can be sub optimal. - * Instead, once a drain is done by worker thread - * for squeue_writerdrain_ns (the reason we are - * here), we force wait for squeue_workerwait_tick - * before doing more processing even if sq_wait is - * set to 0. - * - * This can be counterproductive for performance - * if worker thread is the only means to process - * the packets (interrupts or writers are not - * allowed inside the squeue). + * We have packets to process and worker thread + * is not running. Check to see if poll thread is + * allowed to process. Let it do processing only if it + * picked up some packets from the NIC otherwise + * wakeup the worker thread. 
*/ - if (sqp->sq_tid == 0 && - !(sqp->sq_state & SQS_TMO_PROG)) { - timeout_id_t tid; + if (mp != NULL) { + hrtime_t now; + + now = gethrtime(); + sqp->sq_run = curthread; + sqp->sq_drain(sqp, SQS_POLL_PROC, now + + squeue_drain_ns); + sqp->sq_run = NULL; + + if (sqp->sq_first == NULL) + goto poll_again; - sqp->sq_state |= SQS_TMO_PROG; - mutex_exit(&sqp->sq_lock); - tid = timeout(squeue_fire, sqp, - squeue_workerwait_tick); - mutex_enter(&sqp->sq_lock); /* - * Check again if we still need - * the timeout + * Couldn't do the entire drain because the + * time limit expired, let the + * worker thread take over. */ - if (((sqp->sq_state & (SQS_TMO_PROG|SQS_PROC)) - == SQS_TMO_PROG) && (sqp->sq_tid == 0) && - (sqp->sq_first != NULL)) { - sqp->sq_state &= ~SQS_TMO_PROG; - sqp->sq_awaken = lbolt; - sqp->sq_tid = tid; - } else if (sqp->sq_state & SQS_TMO_PROG) { - /* timeout not needed */ - sqp->sq_state &= ~SQS_TMO_PROG; - mutex_exit(&(sqp)->sq_lock); - (void) untimeout(tid); - mutex_enter(&sqp->sq_lock); - } } - CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(async, lock); - CALLB_CPR_SAFE_END(&cprinfo, lock); - } - -#if SQUEUE_PROFILE - if (SQ_PROFILING(sqp)) { - SQDELTA(sqp, sq_time_worker, gethrtime() - start); + sqp->sq_awaken = lbolt; + /* + * Put the SQS_PROC_HELD on so the worker + * thread can distinguish where its called from. We + * can remove the SQS_PROC flag here and turn off the + * polling so that it wouldn't matter who gets the + * processing but we get better performance this way + * and save the cost of turn polling off and possibly + * on again as soon as we start draining again. + * + * We can't remove the SQS_PROC flag without turning + * polling off until we can guarantee that control + * will return to squeue_drain immediately. + */ + sqp->sq_state |= SQS_PROC_HELD; + sqp->sq_state &= ~SQS_GET_PKTS; + cv_signal(&sqp->sq_worker_cv); + } else if (sqp->sq_first == NULL && + !(sqp->sq_state & SQS_WORKER)) { + /* + * Nothing queued and worker thread not running. + * Since we hold the proc, no other thread is + * processing the squeue. This means that there + * is no work to be done and nothing is queued + * in squeue or in NIC. Turn polling off and go + * back to interrupt mode. + */ + sqp->sq_state &= ~(SQS_PROC|SQS_GET_PKTS); + /* LINTED: constant in conditional context */ + SQS_POLLING_OFF(sqp, B_TRUE, sq_rx_ring); + } else { + /* + * Worker thread is already running. We don't need + * to do anything. Indicate that poll thread is done. + */ + sqp->sq_state &= ~SQS_GET_PKTS; + } + if (sqp->sq_state & SQS_POLL_THR_CONTROL) { + /* + * Act on control requests to quiesce, cleanup or + * restart an squeue + */ + squeue_poll_thr_control(sqp); } -#endif } } -#if SQUEUE_PROFILE -static int -squeue_kstat_update(kstat_t *ksp, int rw) +/* + * The squeue worker thread acts on any control requests to quiesce, cleanup + * or restart an ill_rx_ring_t by calling this function. The worker thread + * synchronizes with the squeue poll thread to complete the request and finally + * wakes up the requestor when the request is completed. 
+ */ +static void +squeue_worker_thr_control(squeue_t *sqp) { - struct squeue_kstat *sqsp = &squeue_kstat; - squeue_t *sqp = ksp->ks_private; + ill_t *ill; + ill_rx_ring_t *rx_ring; - if (rw == KSTAT_WRITE) - return (EACCES); + ASSERT(MUTEX_HELD(&sqp->sq_lock)); -#if SQUEUE_DEBUG - sqsp->sq_count.value.ui64 = sqp->sq_count; - sqsp->sq_max_qlen.value.ui64 = sqp->sq_stats.sq_max_qlen; -#endif - sqsp->sq_npackets_worker.value.ui64 = sqp->sq_stats.sq_npackets_worker; - sqsp->sq_npackets_intr.value.ui64 = sqp->sq_stats.sq_npackets_intr; - sqsp->sq_npackets_other.value.ui64 = sqp->sq_stats.sq_npackets_other; - sqsp->sq_nqueued_intr.value.ui64 = sqp->sq_stats.sq_nqueued_intr; - sqsp->sq_nqueued_other.value.ui64 = sqp->sq_stats.sq_nqueued_other; - sqsp->sq_ndrains_worker.value.ui64 = sqp->sq_stats.sq_ndrains_worker; - sqsp->sq_ndrains_intr.value.ui64 = sqp->sq_stats.sq_ndrains_intr; - sqsp->sq_ndrains_other.value.ui64 = sqp->sq_stats.sq_ndrains_other; - sqsp->sq_time_worker.value.ui64 = sqp->sq_stats.sq_time_worker; - sqsp->sq_time_intr.value.ui64 = sqp->sq_stats.sq_time_intr; - sqsp->sq_time_other.value.ui64 = sqp->sq_stats.sq_time_other; - return (0); -} -#endif + if (sqp->sq_state & SQS_POLL_RESTART) { + /* Restart implies a previous quiesce. */ + ASSERT((sqp->sq_state & (SQS_PROC_HELD | + SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER)) == + (SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER)); + /* + * Request the squeue poll thread to restart and wait till + * it actually restarts. + */ + sqp->sq_state &= ~SQS_POLL_QUIESCE_DONE; + sqp->sq_state |= SQS_POLL_THR_RESTART; + cv_signal(&sqp->sq_poll_cv); + while (sqp->sq_state & SQS_POLL_THR_QUIESCED) + cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock); + sqp->sq_state &= ~(SQS_POLL_RESTART | SQS_PROC | + SQS_WORKER); + /* + * Signal any waiter that is waiting for the restart + * to complete + */ + sqp->sq_state |= SQS_POLL_RESTART_DONE; + cv_signal(&sqp->sq_ctrlop_done_cv); + return; + } -void -squeue_profile_enable(squeue_t *sqp) -{ - mutex_enter(&sqp->sq_lock); - sqp->sq_state |= SQS_PROFILE; - mutex_exit(&sqp->sq_lock); -} + if (sqp->sq_state & SQS_PROC_HELD) { + /* The squeue poll thread handed control to us */ + ASSERT(sqp->sq_state & SQS_PROC); + } -void -squeue_profile_disable(squeue_t *sqp) -{ - mutex_enter(&sqp->sq_lock); - sqp->sq_state &= ~SQS_PROFILE; + /* + * Prevent any other thread from processing the squeue + * until we finish the control actions by setting SQS_PROC. + * But allow ourself to reenter by setting SQS_WORKER + */ + sqp->sq_state |= (SQS_PROC | SQS_WORKER); + + /* Signal the squeue poll thread and wait for it to quiesce itself */ + if (!(sqp->sq_state & SQS_POLL_THR_QUIESCED)) { + sqp->sq_state |= SQS_POLL_THR_QUIESCE; + cv_signal(&sqp->sq_poll_cv); + while (!(sqp->sq_state & SQS_POLL_THR_QUIESCED)) + cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock); + } + + rx_ring = sqp->sq_rx_ring; + ill = rx_ring->rr_ill; + /* + * The lock hierarchy is as follows. + * cpu_lock -> ill_lock -> sqset_lock -> sq_lock + */ mutex_exit(&sqp->sq_lock); -} + mutex_enter(&ill->ill_lock); + mutex_enter(&sqp->sq_lock); -void -squeue_profile_reset(squeue_t *sqp) -{ -#if SQUEUE_PROFILE - bzero(&sqp->sq_stats, sizeof (sqstat_t)); -#endif -} + SQS_POLLING_OFF(sqp, (sqp->sq_state & SQS_POLL_CAPAB) != 0, + sqp->sq_rx_ring); + sqp->sq_state &= ~(SQS_POLL_CAPAB | SQS_GET_PKTS | SQS_PROC_HELD); + if (sqp->sq_state & SQS_POLL_CLEANUP) { + /* + * Disassociate this squeue from its ill_rx_ring_t. 
+ * The rr_sqp, sq_rx_ring fields are protected by the + * corresponding squeue, ill_lock* and sq_lock. Holding any + * of them will ensure that the ring to squeue mapping does + * not change. + */ + ASSERT(!(sqp->sq_state & SQS_DEFAULT)); -void -squeue_profile_start(void) -{ -#if SQUEUE_PROFILE - squeue_profile = B_TRUE; -#endif + sqp->sq_rx_ring = NULL; + rx_ring->rr_sqp = NULL; + + sqp->sq_state &= ~(SQS_POLL_CLEANUP | SQS_POLL_THR_QUIESCED | + SQS_POLL_QUIESCE_DONE); + sqp->sq_ill = NULL; + + rx_ring->rr_rx_handle = NULL; + rx_ring->rr_intr_handle = NULL; + rx_ring->rr_intr_enable = NULL; + rx_ring->rr_intr_disable = NULL; + sqp->sq_state |= SQS_POLL_CLEANUP_DONE; + } else { + sqp->sq_state &= ~SQS_POLL_QUIESCE; + sqp->sq_state |= SQS_POLL_QUIESCE_DONE; + } + /* + * Signal any waiter that is waiting for the quiesce or cleanup + * to complete and also wait for it to actually see and reset the + * SQS_POLL_CLEANUP_DONE. + */ + cv_signal(&sqp->sq_ctrlop_done_cv); + mutex_exit(&ill->ill_lock); + if (sqp->sq_state & SQS_POLL_CLEANUP_DONE) { + cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock); + sqp->sq_state &= ~(SQS_PROC | SQS_WORKER); + } } -void -squeue_profile_stop(void) +static void +squeue_worker(squeue_t *sqp) { -#if SQUEUE_PROFILE - squeue_profile = B_FALSE; -#endif + kmutex_t *lock = &sqp->sq_lock; + kcondvar_t *async = &sqp->sq_worker_cv; + callb_cpr_t cprinfo; + hrtime_t now; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_worker"); + mutex_enter(lock); + + for (;;) { + for (;;) { + /* + * If the poll thread has handed control to us + * we need to break out of the wait. + */ + if (sqp->sq_state & SQS_PROC_HELD) + break; + + /* + * If the squeue is not being processed and we either + * have messages to drain or some thread has signaled + * some control activity we need to break + */ + if (!(sqp->sq_state & SQS_PROC) && + ((sqp->sq_state & SQS_WORKER_THR_CONTROL) || + (sqp->sq_first != NULL))) + break; + + /* + * If we have started some control action, then check + * for the SQS_WORKER flag (since we don't + * release the squeue) to make sure we own the squeue + * and break out + */ + if ((sqp->sq_state & SQS_WORKER_THR_CONTROL) && + (sqp->sq_state & SQS_WORKER)) + break; + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(async, lock); + CALLB_CPR_SAFE_END(&cprinfo, lock); + } + if (sqp->sq_state & SQS_WORKER_THR_CONTROL) { + squeue_worker_thr_control(sqp); + continue; + } + ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | + SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE | + SQS_WORKER_THR_CONTROL | SQS_POLL_THR_CONTROL))); + + if (sqp->sq_state & SQS_PROC_HELD) + sqp->sq_state &= ~SQS_PROC_HELD; + + now = gethrtime(); + sqp->sq_run = curthread; + sqp->sq_drain(sqp, SQS_WORKER, now + squeue_drain_ns); + sqp->sq_run = NULL; + } } uintptr_t * @@ -1482,9 +1240,3 @@ squeue_getprivate(squeue_t *sqp, sqprivate_t p) return (&sqp->sq_private[p]); } - -processorid_t -squeue_binding(squeue_t *sqp) -{ - return (sqp->sq_bind); -} diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 3b8440b230..4bb50d2344 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -96,6 +96,7 @@ #include <inet/ip_if.h> #include <inet/ipp_common.h> #include <inet/ip_netinfo.h> +#include <sys/squeue_impl.h> #include <sys/squeue.h> #include <inet/kssl/ksslapi.h> #include <sys/tsol/label.h> @@ -124,8 +125,8 @@ * The tcp data structure does not use any kind of lock for protecting * its state but instead uses 'squeues' for mutual exclusion from various * 
read and write side threads. To access a tcp member, the thread should
- * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or
- * squeue_fill). Since the squeues allow a direct function call, caller
+ * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS,
+ * or SQ_NODRAIN). Since the squeues allow a direct function call, caller
 * can pass any tcp function having prototype of edesc_t as argument
 * (different from traditional STREAMs model where packets come in only
 * designated entry points). The list of functions that can be directly
@@ -251,15 +252,12 @@
 /*
 * Values for squeue switch:
- * 1: squeue_enter_nodrain
- * 2: squeue_enter
- * 3: squeue_fill
+ * 1: SQ_NODRAIN
+ * 2: SQ_PROCESS
+ * 3: SQ_FILL
 */
-int tcp_squeue_close = 2; /* Setable in /etc/system */
-int tcp_squeue_wput = 2;
-
-squeue_func_t tcp_squeue_close_proc;
-squeue_func_t tcp_squeue_wput_proc;
+int tcp_squeue_wput = 2; /* Settable in /etc/system */
+int tcp_squeue_flag;
 
 /*
 * Macros for sodirect:
@@ -940,7 +938,7 @@ static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
 tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
 static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
 tcph_t *tcph, mblk_t *idmp);
-static squeue_func_t tcp_squeue_switch(int);
+static int tcp_squeue_switch(int);
 
 static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
 static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
@@ -1865,9 +1863,9 @@ tcp_time_wait_collector(void *arg)
 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
 
 mp = &tcp->tcp_closemp;
- squeue_fill(connp->conn_sqp, mp,
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
 tcp_timewait_output, connp,
- SQTAG_TCP_TIMEWAIT);
+ SQ_FILL, SQTAG_TCP_TIMEWAIT);
 }
 } else {
 mutex_enter(&connp->conn_lock);
@@ -1893,8 +1891,9 @@ tcp_time_wait_collector(void *arg)
 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
 
 mp = &tcp->tcp_closemp;
- squeue_fill(connp->conn_sqp, mp,
- tcp_timewait_output, connp, 0);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ tcp_timewait_output, connp,
+ SQ_FILL, SQTAG_TCP_TIMEWAIT);
 }
 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
 }
@@ -2374,10 +2373,10 @@ finish:
 * queue.
 */
 /*
- * We already have a ref on tcp so no need to do one before squeue_fill
+ * We already have a ref on tcp so no need to do one before squeue_enter
 */
- squeue_fill(eager->tcp_connp->conn_sqp, opt_mp,
- tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH);
+ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish,
+ eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH);
 }
 
 /*
@@ -4048,8 +4047,8 @@ tcp_close(queue_t *q, int flags)
 
 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
 
- (*tcp_squeue_close_proc)(connp->conn_sqp, mp,
- tcp_close_output, connp, SQTAG_IP_TCP_CLOSE);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
+ tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
 
 mutex_enter(&tcp->tcp_closelock);
 while (!tcp->tcp_closed) {
@@ -4074,9 +4073,9 @@ tcp_close(queue_t *q, int flags)
 /* Entering squeue, bump ref count.
*/ CONN_INC_REF(connp); bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); - squeue_enter(connp->conn_sqp, bp, + SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_linger_interrupted, connp, - SQTAG_IP_TCP_CLOSE); + tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); mutex_enter(&tcp->tcp_closelock); } break; @@ -4625,6 +4624,11 @@ tcp_free(tcp_t *tcp) tcp->tcp_ordrel_mp = NULL; } + if (tcp->tcp_ordrel_mp != NULL) { + freeb(tcp->tcp_ordrel_mp); + tcp->tcp_ordrel_mp = NULL; + } + if (tcp->tcp_sack_info != NULL) { if (tcp->tcp_notsack_list != NULL) { TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); @@ -4825,8 +4829,9 @@ tcp_drop_q0(tcp_t *tcp) /* Mark the IRE created for this SYN request temporary */ tcp_ip_ire_mark_advice(eager); - squeue_fill(eager->tcp_connp->conn_sqp, mp, - tcp_clean_death_wrapper, eager->tcp_connp, SQTAG_TCP_DROP_Q0); + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, + tcp_clean_death_wrapper, eager->tcp_connp, + SQ_FILL, SQTAG_TCP_DROP_Q0); return (B_TRUE); } @@ -5302,6 +5307,7 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) * The caller already ensured that there is a sqp present. */ econnp->conn_sqp = new_sqp; + econnp->conn_initial_sqp = new_sqp; if (connp->conn_policy != NULL) { ipsec_in_t *ii; @@ -5681,6 +5687,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) goto error2; ASSERT(econnp->conn_netstack == connp->conn_netstack); econnp->conn_sqp = new_sqp; + econnp->conn_initial_sqp = new_sqp; } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { /* * mp is updated in tcp_get_ipsec_conn(). @@ -6032,8 +6039,9 @@ error: freemsg(mp1); eager->tcp_closemp_used = B_TRUE; TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); - squeue_fill(econnp->conn_sqp, &eager->tcp_closemp, tcp_eager_kill, - econnp, SQTAG_TCP_CONN_REQ_2); + mp1 = &eager->tcp_closemp; + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill, + econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_2); /* * If a connection already exists, send the mp to that connections so @@ -6056,8 +6064,8 @@ error: CONN_DEC_REF(econnp); freemsg(mp); } else { - squeue_fill(econnp->conn_sqp, mp, tcp_input, - econnp, SQTAG_TCP_CONN_REQ_1); + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, + tcp_input, econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_1); } } else { /* Nobody wants this packet */ @@ -6149,8 +6157,8 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) done: if (connp->conn_sqp != sqp) { CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mp, - connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, + SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); } else { tcp_conn_request(connp, mp, sqp); } @@ -7217,8 +7225,8 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) CONN_INC_REF(eager->tcp_connp); mutex_exit(&listener->tcp_eager_lock); mp = &eager->tcp_closemp; - squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, - eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF); + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, + eager->tcp_connp, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); return (B_TRUE); } @@ -7245,9 +7253,9 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; - squeue_fill(eager->tcp_connp->conn_sqp, mp, + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, eager->tcp_connp, - SQTAG_TCP_EAGER_CLEANUP); + SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); } eager = eager->tcp_eager_next_q; } @@ -7261,8 +7269,8 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) 
TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; - squeue_fill(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, + tcp_eager_kill, eager->tcp_connp, SQ_FILL, SQTAG_TCP_EAGER_CLEANUP_Q0); } eager = eager->tcp_eager_next_q0; @@ -9785,6 +9793,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (ENOSR); } connp->conn_sqp = IP_SQUEUE_GET(lbolt); + connp->conn_initial_sqp = connp->conn_sqp; tcp = connp->conn_tcp; q->q_ptr = WR(q)->q_ptr = connp; @@ -12059,13 +12068,13 @@ enq: * on the conn structure associated so the tcp is guaranteed to exist * when we come here. We still need to check the state because it might * as well has been closed. The squeue processing function i.e. squeue_enter, - * squeue_enter_nodrain, or squeue_drain is responsible for doing the - * CONN_DEC_REF. + * is responsible for doing the CONN_DEC_REF. * * Apart from the default entry point, IP also sends packets directly to * tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming * connections. */ +boolean_t tcp_outbound_squeue_switch = B_FALSE; void tcp_input(void *arg, mblk_t *mp, void *arg2) { @@ -12102,10 +12111,33 @@ tcp_input(void *arg, mblk_t *mp, void *arg2) return; } - if (DB_TYPE(mp) == M_DATA) - tcp_rput_data(connp, mp, arg2); - else + if (DB_TYPE(mp) != M_DATA) { tcp_rput_common(tcp, mp); + return; + } + + if (mp->b_datap->db_struioflag & STRUIO_CONNECT) { + squeue_t *final_sqp; + + mp->b_datap->db_struioflag &= ~STRUIO_CONNECT; + final_sqp = (squeue_t *)DB_CKSUMSTART(mp); + DB_CKSUMSTART(mp) = 0; + if (tcp->tcp_state == TCPS_SYN_SENT && + connp->conn_final_sqp == NULL && + tcp_outbound_squeue_switch) { + ASSERT(connp->conn_initial_sqp == connp->conn_sqp); + connp->conn_final_sqp = final_sqp; + if (connp->conn_final_sqp != connp->conn_sqp) { + CONN_INC_REF(connp); + SQUEUE_SWITCH(connp, connp->conn_final_sqp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + tcp_rput_data, connp, ip_squeue_flag, + SQTAG_CONNECT_FINISH); + return; + } + } + } + tcp_rput_data(connp, mp, arg2); } /* @@ -14316,16 +14348,27 @@ process_ack: CONN_INC_REF(listener->tcp_connp); if (listener->tcp_connp->conn_sqp == connp->conn_sqp) { + /* + * We optimize by not calling an SQUEUE_ENTER + * on the listener since we know that the + * listener and eager squeues are the same. + * We are able to make this check safely only + * because neither the eager nor the listener + * can change its squeue. Only an active connect + * can change its squeue + */ tcp_send_conn_ind(listener->tcp_connp, mp, listener->tcp_connp->conn_sqp); CONN_DEC_REF(listener->tcp_connp); } else if (!tcp->tcp_loopback) { - squeue_fill(listener->tcp_connp->conn_sqp, mp, - tcp_send_conn_ind, - listener->tcp_connp, SQTAG_TCP_CONN_IND); + SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, + mp, tcp_send_conn_ind, + listener->tcp_connp, SQ_FILL, + SQTAG_TCP_CONN_IND); } else { - squeue_enter(listener->tcp_connp->conn_sqp, mp, - tcp_send_conn_ind, listener->tcp_connp, + SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, + mp, tcp_send_conn_ind, + listener->tcp_connp, SQ_PROCESS, SQTAG_TCP_CONN_IND); } } @@ -15884,7 +15927,6 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) return (mp); } - /* * Handle a *T_BIND_REQ that has failed either due to a T_ERROR_ACK * or a "bad" IRE detected by tcp_adapt_ire. 
@@ -16402,8 +16444,8 @@ tcp_rsrv(queue_t *q)
 mutex_exit(&tcp->tcp_rsrv_mp_lock);
 
 CONN_INC_REF(connp);
- squeue_enter(connp->conn_sqp, mp, tcp_rsrv_input, connp,
- SQTAG_TCP_RSRV);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp,
+ SQ_PROCESS, SQTAG_TCP_RSRV);
 }
 
 /*
@@ -18768,9 +18810,9 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
 /* Need to get inside the listener perimeter */
 CONN_INC_REF(listener->tcp_connp);
- squeue_fill(listener->tcp_connp->conn_sqp, mp1,
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
 tcp_send_pending, listener->tcp_connp,
- SQTAG_TCP_SEND_PENDING);
+ SQ_FILL, SQTAG_TCP_SEND_PENDING);
 }
 no_more_eagers:
 tcp_eager_unlink(eager);
@@ -18781,10 +18823,13 @@ no_more_eagers:
 * but we still have an extra refs on eager (apart from the
 * usual tcp references). The ref was placed in tcp_rput_data
 * before sending the conn_ind in tcp_send_conn_ind.
- * The ref will be dropped in tcp_accept_finish().
+ * The ref will be dropped in tcp_accept_finish(). As sockfs
+ * has already established this tcp with its own stream,
+ * it's OK to set tcp_detached to B_FALSE.
 */
- squeue_enter_nodrain(econnp->conn_sqp, opt_mp,
- tcp_accept_finish, econnp, SQTAG_TCP_ACCEPT_FINISH_Q0);
+ econnp->conn_tcp->tcp_detached = B_FALSE;
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish,
+ econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
 return;
 default:
 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
@@ -18916,7 +18961,6 @@ tcp_wput(queue_t *q, mblk_t *mp)
 t_scalar_t type;
 uchar_t *rptr;
 struct iocblk *iocp;
- uint32_t msize;
 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
 
 ASSERT(connp->conn_ref >= 2);
@@ -18926,18 +18970,16 @@ tcp_wput(queue_t *q, mblk_t *mp)
 tcp = connp->conn_tcp;
 ASSERT(tcp != NULL);
 
- msize = msgdsize(mp);
-
 mutex_enter(&tcp->tcp_non_sq_lock);
- tcp->tcp_squeue_bytes += msize;
+ tcp->tcp_squeue_bytes += msgdsize(mp);
 if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
 tcp_setqfull(tcp);
 }
 mutex_exit(&tcp->tcp_non_sq_lock);
 
 CONN_INC_REF(connp);
- (*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
- tcp_output, connp, SQTAG_TCP_OUTPUT);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
+ tcp_squeue_flag, SQTAG_TCP_OUTPUT);
 return;
 
 case M_CMD:
@@ -19030,8 +19072,8 @@ tcp_wput(queue_t *q, mblk_t *mp)
 }
 
 CONN_INC_REF(connp);
- (*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
- output_proc, connp, SQTAG_TCP_WPUT_OTHER);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
+ tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
 }
 
 /*
@@ -19503,34 +19545,27 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
 ntohs(ipha->ipha_length));
 
- if (ILL_DLS_CAPABLE(ill)) {
- /*
- * Send the packet directly to DLD, where it may be queued
- * depending on the availability of transmit resources at
- * the media layer.
- */ - IP_DLS_ILL_TX(ill, ipha, mp, ipst, ire_fp_mp_len); - } else { - ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr; - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_PROBE4(ip4__physical__out__start, + ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, + NULL, ill, ipha, mp, mp, 0, ipst); + DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - if (mp != NULL) { - if (ipst->ips_ipobs_enabled) { - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, - IP_REAL_ZONEID(connp->conn_zoneid, ipst), - ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, - ipst); - } - DTRACE_IP_FASTPATH(mp, ipha, out_ill, ipha, NULL); - putnext(ire->ire_stq, mp); + if (mp != NULL) { + if (ipst->ips_ipobs_enabled) { + zoneid_t szone; + + szone = ip_get_zoneid_v4(ipha->ipha_src, mp, + ipst, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, + ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } + + ILL_SEND_TX(ill, ire, connp, mp, 0); } + IRE_REFRELE(ire); } @@ -21327,12 +21362,7 @@ tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, } /* send it down */ - if (ILL_DLS_CAPABLE(ill)) { - ill_dls_capab_t *ill_dls = ill->ill_dls_capab; - ill_dls->ill_tx(ill_dls->ill_tx_handle, md_mp_head); - } else { - putnext(ire->ire_stq, md_mp_head); - } + putnext(ire->ire_stq, md_mp_head); /* we're done for TCP/IPv4 */ if (tcp->tcp_ipversion == IPV4_VERSION) @@ -21478,10 +21508,12 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); /* - * Append LSO flag to DB_LSOFLAGS(mp) and set the mss to DB_LSOMSS(mp). + * Append LSO flags and mss to the mp. */ - DB_LSOFLAGS(mp) |= HW_LSO; - DB_LSOMSS(mp) = mss; + lso_info_set(mp, mss, HW_LSO); + + ipha->ipha_fragment_offset_and_flags |= + (uint32_t)htons(ire->ire_frag_flag); ire_fp_mp = ire->ire_nce->nce_fp_mp; ire_fp_mp_len = MBLKL(ire_fp_mp); @@ -21496,34 +21528,25 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, ntohs(ipha->ipha_length)); - if (ILL_DLS_CAPABLE(ill)) { - /* - * Send the packet directly to DLD, where it may be queued - * depending on the availability of transmit resources at - * the media layer. 
- */ - IP_DLS_ILL_TX(ill, ipha, mp, ipst, ire_fp_mp_len); - } else { - ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr; - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_PROBE4(ip4__physical__out__start, + ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, NULL, + ill, ipha, mp, mp, 0, ipst); + DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - if (mp != NULL) { - if (ipst->ips_ipobs_enabled) { - zoneid_t szone = tcp->tcp_connp->conn_zoneid; + if (mp != NULL) { + if (ipst->ips_ipobs_enabled) { + zoneid_t szone; - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, tcp->tcp_ipversion, - ire_fp_mp_len, ipst); - } - DTRACE_IP_FASTPATH(mp, ipha, out_ill, ipha, NULL); - putnext(ire->ire_stq, mp); + szone = ip_get_zoneid_v4(ipha->ipha_src, mp, + ipst, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, + ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } + + ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0); } } @@ -24921,9 +24944,6 @@ tcp_ddi_g_init(void) /* Initialize the random number generator */ tcp_random_init(); - tcp_squeue_wput_proc = tcp_squeue_switch(tcp_squeue_wput); - tcp_squeue_close_proc = tcp_squeue_switch(tcp_squeue_close); - /* A single callback independently of how many netstacks we have */ ip_squeue_init(tcp_squeue_add); @@ -24932,6 +24952,8 @@ tcp_ddi_g_init(void) tcp_taskq = taskq_create("tcp_taskq", 1, minclsyspri, 1, 1, TASKQ_PREPOPULATE); + tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput); + /* * We want to be informed each time a stack is created or * destroyed in the kernel, so we can maintain the @@ -25420,7 +25442,7 @@ tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) * If we get here, we are already on the correct * squeue. 
This ioctl follows the following path * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn - * ->tcp_ioctl_abort->squeue_fill (if on a + * ->tcp_ioctl_abort->squeue_enter (if on a * different squeue) */ int errcode; @@ -25487,8 +25509,8 @@ startover: listhead = listhead->b_next; tcp = (tcp_t *)mp->b_prev; mp->b_next = mp->b_prev = NULL; - squeue_fill(tcp->tcp_connp->conn_sqp, mp, - tcp_input, tcp->tcp_connp, SQTAG_TCP_ABORT_BUCKET); + SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, tcp_input, + tcp->tcp_connp, SQ_FILL, SQTAG_TCP_ABORT_BUCKET); } *count += nmatch; @@ -25989,8 +26011,8 @@ tcp_timer_callback(void *arg) tcpt = (tcp_timer_t *)mp->b_rptr; connp = tcpt->connp; - squeue_fill(connp->conn_sqp, mp, - tcp_timer_handler, connp, SQTAG_TCP_TIMER); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, + SQ_FILL, SQTAG_TCP_TIMER); } static void @@ -26486,6 +26508,7 @@ tcp_kstat_update(kstat_t *kp, int rw) netstack_rele(ns); return (-1); } + tcpkp = (tcp_named_kstat_t *)kp->ks_data; tcpkp->currEstab.value.ui32 = 0; @@ -26583,8 +26606,8 @@ tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) /* Already has an eager */ if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { TCP_STAT(tcps, tcp_reinput_syn); - squeue_enter(connp->conn_sqp, mp, connp->conn_recv, - connp, SQTAG_TCP_REINPUT_EAGER); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, + SQ_PROCESS, SQTAG_TCP_REINPUT_EAGER); return; } @@ -26609,21 +26632,21 @@ tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) DB_CKSUMSTART(mp) = (intptr_t)sqp; } - squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp, - SQTAG_TCP_REINPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, + SQ_FILL, SQTAG_TCP_REINPUT); } -static squeue_func_t +static int tcp_squeue_switch(int val) { - squeue_func_t rval = squeue_fill; + int rval = SQ_FILL; switch (val) { case 1: - rval = squeue_enter_nodrain; + rval = SQ_NODRAIN; break; case 2: - rval = squeue_enter; + rval = SQ_PROCESS; break; default: break; diff --git a/usr/src/uts/common/inet/tcp/tcp_kssl.c b/usr/src/uts/common/inet/tcp/tcp_kssl.c index 0913da33f8..8eb8cddff3 100644 --- a/usr/src/uts/common/inet/tcp/tcp_kssl.c +++ b/usr/src/uts/common/inet/tcp/tcp_kssl.c @@ -53,6 +53,7 @@ #include <inet/ipdrop.h> #include <inet/tcp_impl.h> +#include <sys/squeue_impl.h> #include <sys/squeue.h> #include <inet/kssl/ksslapi.h> @@ -70,7 +71,7 @@ static void tcp_kssl_input_asynch(void *, mblk_t *, void *); extern void tcp_output(void *, mblk_t *, void *); extern void tcp_send_conn_ind(void *, mblk_t *, void *); -extern squeue_func_t tcp_squeue_wput_proc; +extern int tcp_squeue_flag; /* * tcp_rput_data() calls this routine for all packet destined to a @@ -205,10 +206,10 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) listener->tcp_connp->conn_sqp); CONN_DEC_REF(listener->tcp_connp); } else { - squeue_fill( + SQUEUE_ENTER_ONE( listener->tcp_connp->conn_sqp, ind_mp, tcp_send_conn_ind, - listener->tcp_connp, + listener->tcp_connp, SQ_FILL, SQTAG_TCP_CONN_IND); } } @@ -294,11 +295,11 @@ no_can_do: listener->tcp_connp->conn_sqp); CONN_DEC_REF(listener->tcp_connp); } else { - squeue_fill( + SQUEUE_ENTER_ONE( listener->tcp_connp->conn_sqp, ind_mp, tcp_send_conn_ind, listener->tcp_connp, - SQTAG_TCP_CONN_IND); + SQ_FILL, SQTAG_TCP_CONN_IND); } } if (mp != NULL) @@ -343,8 +344,8 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) mutex_exit(&tcp->tcp_non_sq_lock); } CONN_INC_REF(connp); - (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, - tcp_output, connp, 
SQTAG_TCP_OUTPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, + tcp_squeue_flag, SQTAG_TCP_OUTPUT); /* FALLTHROUGH */ case KSSL_CMD_NONE: @@ -375,8 +376,8 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) */ if ((sqmp = allocb(1, BPRI_MED)) != NULL) { CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, sqmp, tcp_kssl_input_asynch, - connp, SQTAG_TCP_KSSL_INPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, sqmp, tcp_kssl_input_asynch, + connp, SQ_FILL, SQTAG_TCP_KSSL_INPUT); } else { DTRACE_PROBE(kssl_err__allocb_failed); } diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 3369ca915e..70677c86d8 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -78,6 +78,7 @@ #include <inet/ipclassifier.h> #include <inet/ipsec_impl.h> #include <inet/ipp_common.h> +#include <sys/squeue_impl.h> #include <inet/ipnet.h> /* @@ -196,14 +197,15 @@ static int udp_rinfop(queue_t *q, infod_t *dp); static int udp_rrw(queue_t *q, struiod_t *dp); static int udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); -static void udp_send_data(udp_t *, queue_t *, mblk_t *, ipha_t *); +static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, + ipha_t *ipha); static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen, t_scalar_t err); static void udp_unbind(queue_t *q, mblk_t *mp); static in_port_t udp_update_next_port(udp_t *udp, in_port_t port, boolean_t random); static mblk_t *udp_output_v4(conn_t *, mblk_t *, ipaddr_t, uint16_t, uint_t, - int *, boolean_t); + int *, boolean_t); static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error); static void udp_wput_other(queue_t *q, mblk_t *mp); @@ -4401,6 +4403,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) UDP_STAT(us, udp_in_recvucred); } + /* XXX FIXME: apply to AF_INET6 as well */ /* * If SO_TIMESTAMP is set allocate the appropriate sized * buffer. Since gethrestime() expects a pointer aligned @@ -6237,8 +6240,12 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) dev_q = ire->ire_stq->q_next; ASSERT(dev_q != NULL); + ill = ire_to_ill(ire); + ASSERT(ill != NULL); - if (DEV_Q_IS_FLOW_CTLED(dev_q)) { + /* is queue flow controlled? */ + if (q->q_first != NULL || connp->conn_draining || + DEV_Q_FLOW_BLOCKED(dev_q)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); if (ipst->ips_ip_output_queue) @@ -6256,8 +6263,6 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) dst = ipha->ipha_dst; src = ipha->ipha_src; - ill = ire_to_ill(ire); - ASSERT(ill != NULL); BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); @@ -6334,31 +6339,32 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, ntohs(ipha->ipha_length)); - if (ILL_DLS_CAPABLE(ill)) { - /* - * Send the packet directly to DLD, where it may be queued - * depending on the availability of transmit resources at - * the media layer. 
- */ - IP_DLS_ILL_TX(ill, ipha, mp, ipst, ire_fp_mp_len); - } else { - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ill, ipha, mp, mp, ll_multicast, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - if (mp != NULL) { - if (ipst->ips_ipobs_enabled) { - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, - IP_REAL_ZONEID(connp->conn_zoneid, ipst), - ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, - ipst); - } - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, - ipha_t *, ipha, ip6_t *, NULL, int, 0); + DTRACE_PROBE4(ip4__physical__out__start, + ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, NULL, ill, ipha, mp, mp, + ll_multicast, ipst); + DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); + if (ipst->ips_ipobs_enabled && mp != NULL) { + zoneid_t szone; + + szone = ip_get_zoneid_v4(ipha->ipha_src, mp, + ipst, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, + ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); + } + + if (mp != NULL) { + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, + void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, + ipha_t *, ipha, ip6_t *, NULL, int, 0); + + if (ILL_DIRECT_CAPABLE(ill)) { + ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct; + + (void) idd->idd_tx_df(idd->idd_tx_dh, mp, + (uintptr_t)connp, 0); + } else { putnext(ire->ire_stq, mp); } } diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 04b8dbc22c..468fa553f4 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -26,8 +26,6 @@ #ifndef _UDP_IMPL_H #define _UDP_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * UDP implementation private declarations. These interfaces are * used to build the IP module and are not meant to be accessed @@ -159,7 +157,7 @@ typedef struct udp_fanout_s { * below IP and if the q_first is NULL, we optimize by not doing * the canput check */ -#define DEV_Q_IS_FLOW_CTLED(dev_q) \ +#define DEV_Q_FLOW_BLOCKED(dev_q) \ (((dev_q)->q_next != NULL || (dev_q)->q_first != NULL) && \ !canput(dev_q)) @@ -371,9 +369,7 @@ extern void udp_quiesce_conn(conn_t *); extern void udp_ddi_init(void); extern void udp_ddi_destroy(void); extern void udp_resume_bind(conn_t *, mblk_t *); -extern void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, - socklen_t addrlen); -extern void udp_wput(queue_t *, mblk_t *); +extern void udp_wput(queue_t *, mblk_t *); extern int udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr); diff --git a/usr/src/uts/common/io/afe/afe.c b/usr/src/uts/common/io/afe/afe.c index a89926f58f..9f32d0d3f8 100644 --- a/usr/src/uts/common/io/afe/afe.c +++ b/usr/src/uts/common/io/afe/afe.c @@ -184,7 +184,6 @@ static mac_callbacks_t afe_m_callbacks = { afe_m_multicst, afe_m_unicst, afe_m_tx, - NULL, /* mc_resources */ NULL, /* mc_ioctl */ NULL, /* mc_getcapab */ NULL, /* mc_open */ diff --git a/usr/src/uts/common/io/afe/afeimpl.h b/usr/src/uts/common/io/afe/afeimpl.h index 0dccbe1acd..2b2e0c237d 100644 --- a/usr/src/uts/common/io/afe/afeimpl.h +++ b/usr/src/uts/common/io/afe/afeimpl.h @@ -36,10 +36,10 @@ #ifndef _AFEIMPL_H #define _AFEIMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef _KERNEL +#include <sys/mac_provider.h> + /* * Compile time tunables. 
*/ diff --git a/usr/src/uts/common/io/aggr/aggr_ctl.c b/usr/src/uts/common/io/aggr/aggr_ctl.c index 0cfb177ed6..ea167fda28 100644 --- a/usr/src/uts/common/io/aggr/aggr_ctl.c +++ b/usr/src/uts/common/io/aggr/aggr_ctl.c @@ -29,13 +29,14 @@ #include <sys/aggr.h> #include <sys/aggr_impl.h> +#include <sys/priv_names.h> /* * Process a LAIOC_MODIFY request. */ /* ARGSUSED */ static int -aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { laioc_modify_t *modify_arg = karg; uint32_t policy; @@ -68,8 +69,8 @@ aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred) lacp_timer = modify_arg->lu_lacp_timer; } - return (aggr_grp_modify(modify_arg->lu_linkid, NULL, modify_mask, - policy, mac_fixed, mac_addr, lacp_mode, lacp_timer)); + return (aggr_grp_modify(modify_arg->lu_linkid, modify_mask, policy, + mac_fixed, mac_addr, lacp_mode, lacp_timer)); } /* @@ -77,7 +78,7 @@ aggr_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred) */ /* ARGSUSED */ static int -aggr_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { laioc_create_t *create_arg = karg; uint16_t nports; @@ -122,7 +123,7 @@ done: /* ARGSUSED */ static int -aggr_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { laioc_delete_t *delete_arg = karg; @@ -191,7 +192,7 @@ aggr_ioc_info_new_port(void *arg, datalink_id_t linkid, uchar_t *mac, /*ARGSUSED*/ static int -aggr_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { laioc_info_t *info_argp = karg; datalink_id_t linkid; @@ -249,30 +250,31 @@ done: /* ARGSUSED */ static int -aggr_ioc_add(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_add(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { return (aggr_ioc_add_remove(karg, arg, LAIOC_ADD, mode)); } /* ARGSUSED */ static int -aggr_ioc_remove(void *karg, intptr_t arg, int mode, cred_t *cred) +aggr_ioc_remove(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { return (aggr_ioc_add_remove(karg, arg, LAIOC_REMOVE, mode)); } static dld_ioc_info_t aggr_ioc_list[] = { - {LAIOC_CREATE, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_create_t), - aggr_ioc_create}, - {LAIOC_DELETE, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_delete_t), - aggr_ioc_delete}, - {LAIOC_INFO, DLDCOPYINOUT, sizeof (laioc_info_t), aggr_ioc_info}, - {LAIOC_ADD, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_add_rem_t), - aggr_ioc_add}, - {LAIOC_REMOVE, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_add_rem_t), - aggr_ioc_remove}, - {LAIOC_MODIFY, DLDCOPYIN | DLDDLCONFIG, sizeof (laioc_modify_t), - aggr_ioc_modify} + {LAIOC_CREATE, DLDCOPYIN, sizeof (laioc_create_t), aggr_ioc_create, + {PRIV_SYS_DL_CONFIG}}, + {LAIOC_DELETE, DLDCOPYIN, sizeof (laioc_delete_t), aggr_ioc_delete, + {PRIV_SYS_DL_CONFIG}}, + {LAIOC_INFO, DLDCOPYINOUT, sizeof (laioc_info_t), aggr_ioc_info, + {NULL}}, + {LAIOC_ADD, DLDCOPYIN, sizeof (laioc_add_rem_t), aggr_ioc_add, + {PRIV_SYS_DL_CONFIG}}, + {LAIOC_REMOVE, DLDCOPYIN, sizeof (laioc_add_rem_t), aggr_ioc_remove, + {PRIV_SYS_DL_CONFIG}}, + {LAIOC_MODIFY, DLDCOPYIN, sizeof (laioc_modify_t), aggr_ioc_modify, + {PRIV_SYS_DL_CONFIG}} }; int diff --git a/usr/src/uts/common/io/aggr/aggr_dev.c b/usr/src/uts/common/io/aggr/aggr_dev.c index fc2c396c2b..6640015af5 100644 --- 
a/usr/src/uts/common/io/aggr/aggr_dev.c +++ b/usr/src/uts/common/io/aggr/aggr_dev.c @@ -42,38 +42,8 @@ static int aggr_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); static int aggr_attach(dev_info_t *, ddi_attach_cmd_t); static int aggr_detach(dev_info_t *, ddi_detach_cmd_t); -static struct cb_ops aggr_cb_ops = { - nulldev, /* open */ - nulldev, /* close */ - nulldev, /* strategy */ - nulldev, /* print */ - nodev, /* dump */ - nodev, /* read */ - nodev, /* write */ - nodev, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, /* cb_prop_op */ - 0, /* streamtab */ - D_MP /* Driver compatibility flag */ -}; - -static struct dev_ops aggr_dev_ops = { - DEVO_REV, /* devo_rev */ - 0, /* refcnt */ - aggr_getinfo, /* get_dev_info */ - nulldev, /* identify */ - nulldev, /* probe */ - aggr_attach, /* attach */ - aggr_detach, /* detach */ - nodev, /* reset */ - &aggr_cb_ops, /* driver operations */ - NULL, /* bus operations */ - nodev, /* dev power */ - ddi_quiesce_not_supported, /* dev quiesce */ -}; +DDI_DEFINE_STREAM_OPS(aggr_dev_ops, nulldev, nulldev, aggr_attach, aggr_detach, + nodev, aggr_getinfo, D_MP, NULL, ddi_quiesce_not_supported); static struct modldrv aggr_modldrv = { &mod_driverops, /* Type of module. This one is a driver */ @@ -82,9 +52,7 @@ static struct modldrv aggr_modldrv = { }; static struct modlinkage modlinkage = { - MODREV_1, - &aggr_modldrv, - NULL + MODREV_1, &aggr_modldrv, NULL }; int diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c index cee6d5e45f..fa90087320 100644 --- a/usr/src/uts/common/io/aggr/aggr_grp.c +++ b/usr/src/uts/common/io/aggr/aggr_grp.c @@ -39,6 +39,7 @@ #include <sys/sysmacros.h> #include <sys/conf.h> #include <sys/cmn_err.h> +#include <sys/disp.h> #include <sys/list.h> #include <sys/ksynch.h> #include <sys/kmem.h> @@ -52,6 +53,7 @@ #include <sys/id_space.h> #include <sys/strsun.h> #include <sys/dlpi.h> +#include <sys/mac_provider.h> #include <sys/dls.h> #include <sys/vlan.h> #include <sys/aggr.h> @@ -63,7 +65,6 @@ static int aggr_m_promisc(void *, boolean_t); static int aggr_m_multicst(void *, boolean_t, const uint8_t *); static int aggr_m_unicst(void *, const uint8_t *); static int aggr_m_stat(void *, uint_t, uint64_t *); -static void aggr_m_resources(void *); static void aggr_m_ioctl(void *, queue_t *, mblk_t *); static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); @@ -76,8 +77,20 @@ static uint_t aggr_grp_max_sdu(aggr_grp_t *); static uint32_t aggr_grp_max_margin(aggr_grp_t *); static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); -static int aggr_grp_multicst(aggr_grp_t *grp, boolean_t add, - const uint8_t *addrp); + +static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); +static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); +static int aggr_pseudo_disable_intr(mac_intr_handle_t); +static int aggr_pseudo_enable_intr(mac_intr_handle_t); +static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t); +static void aggr_pseudo_stop_ring(mac_ring_driver_t); +static int aggr_addmac(void *, const uint8_t *); +static int aggr_remmac(void *, const uint8_t *); +static mblk_t *aggr_rx_poll(void *, int); +static void aggr_fill_ring(void *, mac_ring_type_t, const int, + const int, mac_ring_info_t *, mac_ring_handle_t); +static void 
aggr_fill_group(void *, mac_ring_type_t, const int,
+ mac_group_info_t *, mac_group_handle_t);
 
 static kmem_cache_t *aggr_grp_cache;
 static mod_hash_t *aggr_grp_hash;
@@ -87,10 +100,11 @@ static id_space_t *key_ids;
 
 #define GRP_HASHSZ 64
 #define GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid)
+#define AGGR_PORT_NAME_DELIMIT '-'
 
 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
 
-#define AGGR_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB)
+#define AGGR_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB)
 
 static mac_callbacks_t aggr_m_callbacks = {
 AGGR_M_CALLBACK_FLAGS,
@@ -99,9 +113,8 @@ static mac_callbacks_t aggr_m_callbacks = {
 aggr_m_stop,
 aggr_m_promisc,
 aggr_m_multicst,
- aggr_m_unicst,
+ NULL,
 aggr_m_tx,
- aggr_m_resources,
 aggr_m_ioctl,
 aggr_m_capab_get
 };
@@ -113,11 +126,12 @@ aggr_grp_constructor(void *buf, void *arg, int kmflag)
 aggr_grp_t *grp = buf;
 
 bzero(grp, sizeof (*grp));
- rw_init(&grp->lg_lock, NULL, RW_DRIVER, NULL);
- rw_init(&grp->aggr.gl_lock, NULL, RW_DRIVER, NULL);
-
+ mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
+ rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
+ mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
 grp->lg_link_state = LINK_STATE_UNKNOWN;
-
 return (0);
 }
 
@@ -132,8 +146,11 @@ aggr_grp_destructor(void *buf, void *arg)
 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
 }
 
- rw_destroy(&grp->aggr.gl_lock);
- rw_destroy(&grp->lg_lock);
+ mutex_destroy(&grp->lg_lacp_lock);
+ cv_destroy(&grp->lg_lacp_cv);
+ mutex_destroy(&grp->lg_port_lock);
+ cv_destroy(&grp->lg_port_cv);
+ rw_destroy(&grp->lg_tx_lock);
 }
 
 void
@@ -179,6 +196,51 @@ aggr_grp_count(void)
 }
 
 /*
+ * Since both aggr_port_notify_cb() and aggr_port_timer_thread() require
+ * the mac perimeter, this function holds a reference on the aggr, and aggr
+ * won't call mac_unregister() until this reference drops to 0.
+ */
+void
+aggr_grp_port_hold(aggr_port_t *port)
+{
+ aggr_grp_t *grp = port->lp_grp;
+
+ AGGR_PORT_REFHOLD(port);
+ mutex_enter(&grp->lg_port_lock);
+ grp->lg_port_ref++;
+ mutex_exit(&grp->lg_port_lock);
+}
+
+/*
+ * Release the reference on the grp and inform aggr_grp_delete() that
+ * calling mac_unregister() is now safe.
+ */
+void
+aggr_grp_port_rele(aggr_port_t *port)
+{
+ aggr_grp_t *grp = port->lp_grp;
+
+ mutex_enter(&grp->lg_port_lock);
+ if (--grp->lg_port_ref == 0)
+ cv_signal(&grp->lg_port_cv);
+ mutex_exit(&grp->lg_port_lock);
+ AGGR_PORT_REFRELE(port);
+}
+
+/*
+ * Wait for the port's lacp timer thread and the port's notification callback
+ * to exit.
+ */
+void
+aggr_grp_port_wait(aggr_grp_t *grp)
+{
+ mutex_enter(&grp->lg_port_lock);
+ if (grp->lg_port_ref != 0)
+ cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
+ mutex_exit(&grp->lg_port_lock);
+}
+
+/*
 * Attach a port to a link aggregation group.
* * A port is attached to a link aggregation group once its speed @@ -193,9 +255,8 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) { boolean_t link_state_changed = B_FALSE; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (port->lp_state == AGGR_PORT_STATE_ATTACHED) return (B_FALSE); @@ -251,7 +312,7 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) /* * Set port's receive callback */ - port->lp_mrh = mac_rx_add(port->lp_mh, aggr_recv_cb, (void *)port); + mac_rx_set(port->lp_mch, aggr_recv_cb, port); /* * If LACP is OFF, the port can be used to send data as soon @@ -270,28 +331,28 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) } boolean_t -aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port, boolean_t port_detach) +aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) { boolean_t link_state_changed = B_FALSE; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); + /* update state */ if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return (B_FALSE); - mac_rx_remove(port->lp_mh, port->lp_mrh, B_FALSE); + mac_rx_clear(port->lp_mch); aggr_grp_multicst_port(port, B_FALSE); if (grp->lg_lacp_mode == AGGR_LACP_OFF) aggr_send_port_disable(port); - else if (port_detach) + else aggr_lacp_port_detached(port); - /* update state */ port->lp_state = AGGR_PORT_STATE_STANDBY; + grp->lg_nattached_ports--; if (grp->lg_nattached_ports == 0) { /* the last attached MAC port of the group is being detached */ @@ -323,17 +384,15 @@ aggr_grp_update_ports_mac(aggr_grp_t *grp) { aggr_port_t *cport; boolean_t link_state_changed = B_FALSE; + mac_perim_handle_t mph; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - - if (grp->lg_closing) - return (link_state_changed); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); for (cport = grp->lg_ports; cport != NULL; cport = cport->lp_next) { - rw_enter(&cport->lp_lock, RW_WRITER); - if (aggr_port_unicst(cport, grp->lg_addr) != 0) { - if (aggr_grp_detach_port(grp, cport, B_TRUE)) + mac_perim_enter_by_mh(cport->lp_mh, &mph); + if (aggr_port_unicst(cport) != 0) { + if (aggr_grp_detach_port(grp, cport)) link_state_changed = B_TRUE; } else { /* @@ -346,7 +405,7 @@ aggr_grp_update_ports_mac(aggr_grp_t *grp) if (aggr_grp_attach_port(grp, cport)) link_state_changed = B_TRUE; } - rw_exit(&cport->lp_lock); + mac_perim_exit(mph); } return (link_state_changed); } @@ -365,9 +424,8 @@ void aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); ASSERT(mac_addr_changedp != NULL); ASSERT(link_state_changedp != NULL); @@ -394,9 +452,8 @@ aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, * Update the actual port MAC address to the MAC address * of the group. 
*/ - if (aggr_port_unicst(port, grp->lg_addr) != 0) { - *link_state_changedp = aggr_grp_detach_port(grp, port, - B_TRUE); + if (aggr_port_unicst(port) != 0) { + *link_state_changedp = aggr_grp_detach_port(grp, port); } else { /* * If a port was detached because of a previous @@ -414,21 +471,25 @@ aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, * Add a port to a link aggregation group. */ static int -aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t linkid, boolean_t force, +aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, aggr_port_t **pp) { aggr_port_t *port, **cport; + mac_perim_handle_t mph; int err; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + /* + * lg_mh could be NULL when the function is called during the creation + * of the aggregation. + */ + ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); /* create new port */ - err = aggr_port_create(linkid, force, &port); + err = aggr_port_create(grp, port_linkid, force, &port); if (err != 0) return (err); - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &mph); /* add port to list of group constituent ports */ cport = &grp->lg_ports; @@ -446,19 +507,238 @@ aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t linkid, boolean_t force, grp->lg_nports++; aggr_lacp_init_port(port); + mac_perim_exit(mph); + + if (pp != NULL) + *pp = port; + + return (0); +} + +/* + * Add a pseudo Rx ring for the given HW ring handle. + */ +static int +aggr_add_pseudo_rx_ring(aggr_port_t *port, + aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) +{ + aggr_pseudo_rx_ring_t *ring; + int err; + int j; + + for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { + ring = rx_grp->arg_rings + j; + if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) + break; + } /* - * Initialize the callback functions for this port. Note that this - * can only be done after the lp_grp field is set. + * No slot for this new Rx ring. */ - aggr_port_init_callbacks(port); + if (j == MAX_RINGS_PER_GROUP) + return (EIO); - rw_exit(&port->lp_lock); + ring->arr_flags |= MAC_PSEUDO_RING_INUSE; + ring->arr_hw_rh = hw_rh; + ring->arr_port = port; + rx_grp->arg_ring_cnt++; - if (pp != NULL) - *pp = port; + /* + * The group is already registered, dynamically add a new ring to the + * mac group. + */ + mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring); + if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { + ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; + ring->arr_hw_rh = NULL; + ring->arr_port = NULL; + rx_grp->arg_ring_cnt--; + mac_hwring_teardown(hw_rh); + } + return (err); +} - return (0); +/* + * Remove the pseudo Rx ring of the given HW ring handle. + */ +static void +aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) +{ + aggr_pseudo_rx_ring_t *ring; + int j; + + for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { + ring = rx_grp->arg_rings + j; + if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || + ring->arr_hw_rh != hw_rh) { + continue; + } + + mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); + + ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; + ring->arr_hw_rh = NULL; + ring->arr_port = NULL; + rx_grp->arg_ring_cnt--; + mac_hwring_teardown(hw_rh); + break; + } +} + +/* + * This function is called to create pseudo rings over the hardware rings of + * the underlying device. Note that there is a 1:1 mapping between the pseudo + * RX rings of the aggr and the hardware rings of the underlying port. 
+ */
+static int
+aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
+{
+ aggr_grp_t *grp = port->lp_grp;
+ mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
+ aggr_unicst_addr_t *addr, *a;
+ mac_perim_handle_t pmph;
+ int hw_rh_cnt, i = 0, j;
+ int err = 0;
+
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /*
+ * This function must be called after the aggr registers its mac
+ * and its RX group has been initialized.
+ */
+ ASSERT(rx_grp->arg_gh != NULL);
+
+ /*
+ * Get the list of the underlying HW rings.
+ */
+ hw_rh_cnt = mac_hwrings_get(port->lp_mch, &port->lp_hwgh, hw_rh);
+
+ if (port->lp_hwgh != NULL) {
+ /*
+ * Quiesce the HW ring and the mac srs on the ring. Note
+ * that the HW ring will be restarted when the pseudo ring
+ * is started. At that time all the packets will be
+ * directly passed up to the pseudo RX ring and handled
+ * by mac srs created over the pseudo RX ring.
+ */
+ mac_rx_client_quiesce(port->lp_mch);
+ mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
+ }
+
+ /*
+ * Add all the unicast addresses to the newly added port.
+ */
+ for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
+ if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
+ break;
+ }
+
+ for (i = 0; err == 0 && i < hw_rh_cnt; i++)
+ err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
+
+ if (err != 0) {
+ for (j = 0; j < i; j++)
+ aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
+
+ for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
+ aggr_port_remmac(port, a->aua_addr);
+
+ if (port->lp_hwgh != NULL) {
+ mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
+ mac_rx_client_restart(port->lp_mch);
+ port->lp_hwgh = NULL;
+ }
+ } else {
+ port->lp_grp_added = B_TRUE;
+ }
+done:
+ mac_perim_exit(pmph);
+ return (err);
+}
+
+/*
+ * This function is called by aggr to remove pseudo RX rings over the
+ * HW rings of the underlying port.
+ */
+static void
+aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
+{
+ aggr_grp_t *grp = port->lp_grp;
+ mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
+ aggr_unicst_addr_t *addr;
+ mac_group_handle_t hwgh;
+ mac_perim_handle_t pmph;
+ int hw_rh_cnt, i;
+
+ ASSERT(MAC_PERIM_HELD(grp->lg_mh));
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ if (!port->lp_grp_added)
+ goto done;
+
+ ASSERT(rx_grp->arg_gh != NULL);
+ hw_rh_cnt = mac_hwrings_get(port->lp_mch, &hwgh, hw_rh);
+
+ /*
+ * If hw_rh_cnt is 0, it means that the underlying port does not
+ * support RX rings; in that case the loops below simply do nothing.
+ */
+ for (i = 0; i < hw_rh_cnt; i++)
+ aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
+
+ for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
+ aggr_port_remmac(port, addr->aua_addr);
+
+ if (port->lp_hwgh != NULL) {
+ port->lp_hwgh = NULL;
+
+ /*
+ * First clear the permanent-quiesced flag of the RX srs then
+ * restart the HW ring and the mac srs on the ring. Note that
+ * the HW ring and associated SRS will soon be removed when
+ * the port is removed from the aggr.
+ */
+ mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
+ mac_rx_client_restart(port->lp_mch);
+ }
+
+ port->lp_grp_added = B_FALSE;
+done:
+ mac_perim_exit(pmph);
+}
+
+static int
+aggr_pseudo_disable_intr(mac_intr_handle_t ih)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
+ return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
+}
+
+static int
+aggr_pseudo_enable_intr(mac_intr_handle_t ih)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
+ return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
+}
+
+static int
+aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
+ int err;
+
+ err = mac_hwring_start(rr_ring->arr_hw_rh);
+ if (err == 0)
+ rr_ring->arr_gen = mr_gen;
+ return (err);
+}
+
+static void
+aggr_pseudo_stop_ring(mac_ring_driver_t arg)
+{
+ aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
+ mac_hwring_stop(rr_ring->arr_hw_rh);
 }
 
 /*
@@ -472,6 +752,7 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
 aggr_grp_t *grp = NULL;
 aggr_port_t *port;
 boolean_t link_state_changed = B_FALSE;
+ mac_perim_handle_t mph, pmph;
 
 /* get group corresponding to linkid */
 rw_enter(&aggr_grp_lock, RW_READER);
@@ -481,10 +762,12 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
 return (ENOENT);
 }
 AGGR_GRP_REFHOLD(grp);
- rw_exit(&aggr_grp_lock);
 
- AGGR_LACP_LOCK_WRITER(grp);
- rw_enter(&grp->lg_lock, RW_WRITER);
+ /*
+ * Hold the perimeter so that the aggregation won't be destroyed.
+ */
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+ rw_exit(&aggr_grp_lock);
 
 /* add the specified ports to group */
 for (i = 0; i < nports; i++) {
@@ -504,29 +787,53 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
 goto bail;
 }
 
+ /*
+ * Create the pseudo ring for each HW ring of the underlying
+ * port.
+ */
+ rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
+ if (rc != 0)
+ goto bail;
+
+ mac_perim_enter_by_mh(port->lp_mh, &pmph);
+
+ /* set LACP mode */
+ aggr_port_lacp_set_mode(grp, port);
+
 /* start port if group has already been started */
 if (grp->lg_started) {
- rw_enter(&port->lp_lock, RW_WRITER);
 rc = aggr_port_start(port);
 if (rc != 0) {
- rw_exit(&port->lp_lock);
+ mac_perim_exit(pmph);
 goto bail;
 }
 
- /* set port promiscuous mode */
- rc = aggr_port_promisc(port, grp->lg_promisc);
- if (rc != 0) {
- rw_exit(&port->lp_lock);
- goto bail;
+ /*
+ * Turn on the promiscuous mode over the port when it
+ * is requested to be turned on to receive the
+ * non-primary address over a port, or the promiscuous
+ * mode is enabled over the aggr.
+ */
+ if (grp->lg_promisc || port->lp_prom_addr != NULL) {
+ rc = aggr_port_promisc(port, B_TRUE);
+ if (rc != 0) {
+ mac_perim_exit(pmph);
+ goto bail;
+ }
 }
- rw_exit(&port->lp_lock);
 }
+ mac_perim_exit(pmph);
 
 /*
 * Attach each port if necessary.
 */
- if (aggr_port_notify_link(grp, port, B_FALSE))
+ if (aggr_port_notify_link(grp, port))
 link_state_changed = B_TRUE;
+
+ /*
+ * Initialize the callback functions for this port.
+ */ + aggr_port_init_callbacks(port); } /* update the MAC address of the constituent ports */ @@ -539,64 +846,43 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, bail: if (rc != 0) { /* stop and remove ports that have been added */ - for (i = 0; i < nadded && !grp->lg_closing; i++) { + for (i = 0; i < nadded; i++) { port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); ASSERT(port != NULL); if (grp->lg_started) { - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + (void) aggr_port_promisc(port, B_FALSE); aggr_port_stop(port); - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); } + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); (void) aggr_grp_rem_port(grp, port, NULL, NULL); } } - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - if (rc == 0 && !grp->lg_closing) + if (rc == 0) mac_resource_update(grp->lg_mh); + mac_perim_exit(mph); AGGR_GRP_REFRELE(grp); return (rc); } -/* - * Update properties of an existing link aggregation group. - */ -int -aggr_grp_modify(datalink_id_t linkid, aggr_grp_t *grp_arg, uint8_t update_mask, - uint32_t policy, boolean_t mac_fixed, const uchar_t *mac_addr, - aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer) +static int +aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, + boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, + aggr_lacp_timer_t lacp_timer) { - int rc = 0; - aggr_grp_t *grp = NULL; boolean_t mac_addr_changed = B_FALSE; boolean_t link_state_changed = B_FALSE; + mac_perim_handle_t pmph; - if (grp_arg == NULL) { - /* get group corresponding to linkid */ - rw_enter(&aggr_grp_lock, RW_READER); - if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), - (mod_hash_val_t *)&grp) != 0) { - rc = ENOENT; - goto bail; - } - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - } else { - grp = grp_arg; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - } - - ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock)); - AGGR_GRP_REFHOLD(grp); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); /* validate fixed address if specified */ if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || (mac_addr[0] & 0x01))) { - rc = EINVAL; - goto bail; + return (EINVAL); } /* update policy if requested */ @@ -616,11 +902,11 @@ aggr_grp_modify(datalink_id_t linkid, aggr_grp_t *grp_arg, uint8_t update_mask, /* switch from user-supplied to automatic */ aggr_port_t *port = grp->lg_ports; - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &pmph); bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); grp->lg_mac_addr_port = port; mac_addr_changed = B_TRUE; - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); } grp->lg_addr_fixed = mac_fixed; } @@ -631,36 +917,51 @@ aggr_grp_modify(datalink_id_t linkid, aggr_grp_t *grp_arg, uint8_t update_mask, if (update_mask & AGGR_MODIFY_LACP_MODE) aggr_lacp_update_mode(grp, lacp_mode); - if ((update_mask & AGGR_MODIFY_LACP_TIMER) && !grp->lg_closing) + if (update_mask & AGGR_MODIFY_LACP_TIMER) aggr_lacp_update_timer(grp, lacp_timer); -bail: - if (grp != NULL && !grp->lg_closing) { - /* - * If grp_arg is non-NULL, this function is called from - * mac_unicst_set(), and the MAC_NOTE_UNICST notification - * will be sent there. 
- */ - if ((grp_arg == NULL) && mac_addr_changed) - mac_unicst_update(grp->lg_mh, grp->lg_addr); + if (link_state_changed) + mac_link_update(grp->lg_mh, grp->lg_link_state); - if (link_state_changed) - mac_link_update(grp->lg_mh, grp->lg_link_state); + if (mac_addr_changed) + mac_unicst_update(grp->lg_mh, grp->lg_addr); - } + return (0); +} - if (grp_arg == NULL) { - if (grp != NULL) { - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - } +/* + * Update properties of an existing link aggregation group. + */ +int +aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, + boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, + aggr_lacp_timer_t lacp_timer) +{ + aggr_grp_t *grp = NULL; + mac_perim_handle_t mph; + int err; + + /* get group corresponding to linkid */ + rw_enter(&aggr_grp_lock, RW_READER); + if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), + (mod_hash_val_t *)&grp) != 0) { rw_exit(&aggr_grp_lock); + return (ENOENT); } + AGGR_GRP_REFHOLD(grp); - if (grp != NULL) - AGGR_GRP_REFRELE(grp); + /* + * Hold the perimeter so that the aggregation won't be destroyed. + */ + mac_perim_enter_by_mh(grp->lg_mh, &mph); + rw_exit(&aggr_grp_lock); - return (rc); + err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, + mac_addr, lacp_mode, lacp_timer); + + mac_perim_exit(mph); + AGGR_GRP_REFRELE(grp); + return (err); } /* @@ -676,6 +977,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, aggr_port_t *port; mac_register_t *mac; boolean_t link_state_changed; + mac_perim_handle_t mph; int err; int i; @@ -695,9 +997,6 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - grp->lg_refs = 1; grp->lg_closing = B_FALSE; grp->lg_force = force; @@ -707,6 +1006,11 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; grp->lg_started = B_FALSE; grp->lg_promisc = B_FALSE; + grp->lg_lacp_done = B_FALSE; + grp->lg_lacp_head = grp->lg_lacp_tail = NULL; + grp->lg_lacp_rx_thread = thread_create(NULL, 0, + aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); + bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t)); aggr_lacp_init_grp(grp); /* add MAC ports to group */ @@ -723,7 +1027,6 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, goto bail; } grp->lg_key = key; - grp->lg_mcst_list = NULL; for (i = 0; i < nports; i++) { err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL); @@ -748,17 +1051,6 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_mac_addr_port = grp->lg_ports; } - /* - * Update the MAC address of the constituent ports. - * None of the port is attached at this time, the link state of the - * aggregation will not change. 
- */
- link_state_changed = aggr_grp_update_ports_mac(grp);
- ASSERT(!link_state_changed);
-
- /* update outbound load balancing policy */
- aggr_send_update_policy(grp, policy);
-
 /* set the initial group capabilities */
 aggr_grp_capab_set(grp);
 
@@ -775,6 +1067,7 @@
 mac->m_min_sdu = 0;
 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
 mac->m_margin = aggr_grp_max_margin(grp);
+ mac->m_v12n = MAC_VIRT_LEVEL1;
 err = mac_register(mac, &grp->lg_mh);
 mac_free(mac);
 if (err != 0)
@@ -782,9 +1075,23 @@
 
 if ((err = dls_devnet_create(grp->lg_mh, grp->lg_linkid)) != 0) {
 (void) mac_unregister(grp->lg_mh);
+ grp->lg_mh = NULL;
 goto bail;
 }
 
+ mac_perim_enter_by_mh(grp->lg_mh, &mph);
+
+ /*
+ * Update the MAC address of the constituent ports.
+ * None of the ports is attached at this time, so the link state of
+ * the aggregation will not change.
+ */
+ link_state_changed = aggr_grp_update_ports_mac(grp);
+ ASSERT(!link_state_changed);
+
+ /* update outbound load balancing policy */
+ aggr_send_update_policy(grp, policy);
+
 /* set LACP mode */
 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
 
@@ -792,8 +1099,19 @@
 * Attach each port if necessary.
 */
 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
- if (aggr_port_notify_link(grp, port, B_FALSE))
+ /*
+ * Create the pseudo ring for each HW ring of the underlying
+ * port. Note that this is done after the aggr registers the
+ * mac.
+ */
+ VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
+ if (aggr_port_notify_link(grp, port))
 link_state_changed = B_TRUE;
+
+ /*
+ * Initialize the callback functions for this port.
+ */
+ aggr_port_init_callbacks(port);
 }
 
 if (link_state_changed)
@@ -805,31 +1123,35 @@
 ASSERT(err == 0);
 aggr_grp_cnt++;
 
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
+ mac_perim_exit(mph);
 rw_exit(&aggr_grp_lock);
 return (0);
 
bail:
- if (grp != NULL) {
- aggr_port_t *cport;
- grp->lg_closing = B_TRUE;
-
- port = grp->lg_ports;
- while (port != NULL) {
- cport = port->lp_next;
- aggr_port_delete(port);
- port = cport;
- }
+ grp->lg_closing = B_TRUE;
 
- rw_exit(&grp->lg_lock);
- AGGR_LACP_UNLOCK(grp);
+ port = grp->lg_ports;
+ while (port != NULL) {
+ aggr_port_t *cport;
 
- AGGR_GRP_REFRELE(grp);
+ cport = port->lp_next;
+ aggr_port_delete(port);
+ port = cport;
 }
 
+ /*
+ * Inform the lacp_rx thread to exit.
+ */ + mutex_enter(&grp->lg_lacp_lock); + grp->lg_lacp_done = B_TRUE; + cv_signal(&grp->lg_lacp_cv); + while (grp->lg_lacp_rx_thread != NULL) + cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); + mutex_exit(&grp->lg_lacp_lock); + rw_exit(&aggr_grp_lock); + AGGR_GRP_REFRELE(grp); return (err); } @@ -841,7 +1163,7 @@ aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) { aggr_port_t *port; - ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { if (port->lp_linkid == linkid) @@ -862,12 +1184,12 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, aggr_port_t **pport; boolean_t mac_addr_changed = B_FALSE; boolean_t link_state_changed = B_FALSE; + mac_perim_handle_t mph; uint64_t val; uint_t i; uint_t stat; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); ASSERT(grp->lg_nports > 1); ASSERT(!grp->lg_closing); @@ -881,9 +1203,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, } *pport = port->lp_next; - atomic_add_32(&port->lp_closing, 1); - - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &mph); /* * If the MAC address of the port being removed was assigned @@ -900,7 +1220,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, mac_addr_changed = B_TRUE; } - link_state_changed = aggr_grp_detach_port(grp, port, B_FALSE); + link_state_changed = aggr_grp_detach_port(grp, port); /* * Add the counter statistics of the ports while it was aggregated @@ -909,7 +1229,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, * value of the counter at the moment it was added to the * aggregation. */ - for (i = 0; i < MAC_NSTAT && !grp->lg_closing; i++) { + for (i = 0; i < MAC_NSTAT; i++) { stat = i + MAC_STAT_MIN; if (!MAC_STAT_ISACOUNTER(stat)) continue; @@ -917,7 +1237,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, val -= port->lp_stat[i]; grp->lg_stat[i] += val; } - for (i = 0; i < ETHER_NSTAT && !grp->lg_closing; i++) { + for (i = 0; i < ETHER_NSTAT; i++) { stat = i + MACTYPE_STAT_MIN; if (!ETHER_STAT_ISACOUNTER(stat)) continue; @@ -927,8 +1247,7 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, } grp->lg_nports--; - - rw_exit(&port->lp_lock); + mac_perim_exit(mph); aggr_port_delete(port); @@ -960,6 +1279,7 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) aggr_port_t *port; boolean_t mac_addr_update = B_FALSE, mac_addr_changed; boolean_t link_state_update = B_FALSE, link_state_changed; + mac_perim_handle_t mph, pmph; /* get group corresponding to linkid */ rw_enter(&aggr_grp_lock, RW_READER); @@ -969,10 +1289,12 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) return (ENOENT); } AGGR_GRP_REFHOLD(grp); - rw_exit(&aggr_grp_lock); - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); + /* + * Hold the perimeter so that the aggregation won't be destroyed. 
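
This lookup protocol recurs throughout the file; its shape, as a sketch assembled from the surrounding code (illustrative, not part of the patch):

	aggr_grp_t *grp = NULL;
	mac_perim_handle_t mph;

	rw_enter(&aggr_grp_lock, RW_READER);
	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp) != 0) {
		rw_exit(&aggr_grp_lock);
		return (ENOENT);
	}
	AGGR_GRP_REFHOLD(grp);				/* pin the memory */
	mac_perim_enter_by_mh(grp->lg_mh, &mph);	/* serialize vs. delete */
	rw_exit(&aggr_grp_lock);

	/* ... operate on the aggregation ... */

	mac_perim_exit(mph);
	AGGR_GRP_REFRELE(grp);

The global rwlock is held only long enough to find the group and pin it; it is the per-MAC perimeter, not the rwlock, that serializes against aggr_grp_delete().
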
+ */ + mac_perim_enter_by_mh(grp->lg_mh, &mph); + rw_exit(&aggr_grp_lock); /* we need to keep at least one port per group */ if (nports >= grp->lg_nports) { @@ -989,20 +1311,51 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) } } + /* clear the promiscous mode for the specified ports */ + for (i = 0; i < nports && rc == 0; i++) { + /* lookup port */ + port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); + ASSERT(port != NULL); + + mac_perim_enter_by_mh(port->lp_mh, &pmph); + rc = aggr_port_promisc(port, B_FALSE); + mac_perim_exit(pmph); + } + if (rc != 0) { + for (i = 0; i < nports; i++) { + port = aggr_grp_port_lookup(grp, + ports[i].lp_linkid); + ASSERT(port != NULL); + + /* + * Turn the promiscuous mode back on if it is required + * to receive the non-primary address over a port, or + * the promiscous mode is enabled over the aggr. + */ + mac_perim_enter_by_mh(port->lp_mh, &pmph); + if (port->lp_started && (grp->lg_promisc || + port->lp_prom_addr != NULL)) { + (void) aggr_port_promisc(port, B_TRUE); + } + mac_perim_exit(pmph); + } + goto bail; + } + /* remove the specified ports from group */ - for (i = 0; i < nports && !grp->lg_closing; i++) { + for (i = 0; i < nports; i++) { /* lookup port */ port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); ASSERT(port != NULL); /* stop port if group has already been started */ if (grp->lg_started) { - rw_enter(&port->lp_lock, RW_WRITER); - aggr_lacp_port_detached(port); + mac_perim_enter_by_mh(port->lp_mh, &pmph); aggr_port_stop(port); - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); } + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); /* remove port from group */ rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, &link_state_changed); @@ -1012,16 +1365,14 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) } bail: - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - if (!grp->lg_closing) { - if (mac_addr_update) - mac_unicst_update(grp->lg_mh, grp->lg_addr); - if (link_state_update) - mac_link_update(grp->lg_mh, grp->lg_link_state); - if (rc == 0) - mac_resource_update(grp->lg_mh); - } + if (mac_addr_update) + mac_unicst_update(grp->lg_mh, grp->lg_addr); + if (link_state_update) + mac_link_update(grp->lg_mh, grp->lg_link_state); + if (rc == 0) + mac_resource_update(grp->lg_mh); + + mac_perim_exit(mph); AGGR_GRP_REFRELE(grp); return (rc); @@ -1032,9 +1383,9 @@ aggr_grp_delete(datalink_id_t linkid) { aggr_grp_t *grp = NULL; aggr_port_t *port, *cport; - lg_mcst_addr_t *mcst, *mcst_nextp; datalink_id_t tmpid; mod_hash_val_t val; + mac_perim_handle_t mph, pmph; int err; rw_enter(&aggr_grp_lock, RW_WRITER); @@ -1051,68 +1402,69 @@ aggr_grp_delete(datalink_id_t linkid) * aggr_m_stat() and thus has a kstat_hold() on the kstats that * dls_devnet_destroy() needs to delete. */ - if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid)) != 0) { + if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { rw_exit(&aggr_grp_lock); return (err); } ASSERT(linkid == tmpid); - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - /* * Unregister from the MAC service module. Since this can * fail if a client hasn't closed the MAC port, we gracefully * fail the operation. 
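
Back in aggr_grp_rem_port() above, the counter hand-back is easier to see with numbers (invented for illustration; the real current value comes from the port's statistics):

	/* MAC_STAT_IPACKETS read 1000 when the port joined the aggr... */
	port->lp_stat[i] = 1000;	/* join-time snapshot */
	/* ...and reads 5000 as the port is removed */
	val = 5000;			/* current counter */
	val -= port->lp_stat[i];	/* delta accumulated while aggregated */
	grp->lg_stat[i] += val;		/* group absorbs 4000 */

Only traffic seen while the port was a member is credited to the group.
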
*/ - grp->lg_closing = B_TRUE; if ((err = mac_disable(grp->lg_mh)) != 0) { - grp->lg_closing = B_FALSE; - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - (void) dls_devnet_create(grp->lg_mh, linkid); rw_exit(&aggr_grp_lock); return (err); } + (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); + ASSERT(grp == (aggr_grp_t *)val); + + ASSERT(aggr_grp_cnt > 0); + aggr_grp_cnt--; + rw_exit(&aggr_grp_lock); /* - * Free the list of multicast addresses. + * Inform the lacp_rx thread to exit. */ - for (mcst = grp->lg_mcst_list; mcst != NULL; mcst = mcst_nextp) { - mcst_nextp = mcst->lg_mcst_nextp; - kmem_free(mcst, sizeof (lg_mcst_addr_t)); - } - grp->lg_mcst_list = NULL; + mutex_enter(&grp->lg_lacp_lock); + grp->lg_lacp_done = B_TRUE; + cv_signal(&grp->lg_lacp_cv); + while (grp->lg_lacp_rx_thread != NULL) + cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); + mutex_exit(&grp->lg_lacp_lock); + mac_perim_enter_by_mh(grp->lg_mh, &mph); + + grp->lg_closing = B_TRUE; /* detach and free MAC ports associated with group */ port = grp->lg_ports; while (port != NULL) { cport = port->lp_next; - rw_enter(&port->lp_lock, RW_WRITER); - aggr_lacp_port_detached(port); + mac_perim_enter_by_mh(port->lp_mh, &pmph); if (grp->lg_started) aggr_port_stop(port); - (void) aggr_grp_detach_port(grp, port, B_FALSE); - rw_exit(&port->lp_lock); + (void) aggr_grp_detach_port(grp, port); + mac_perim_exit(pmph); + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); aggr_port_delete(port); port = cport; } - VERIFY(mac_unregister(grp->lg_mh) == 0); + mac_perim_exit(mph); - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - - (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); - ASSERT(grp == (aggr_grp_t *)val); + /* + * Wait for the port's lacp timer thread and its notification callback + * to exit before calling mac_unregister() since both needs to access + * the mac perimeter of the grp. + */ + aggr_grp_port_wait(grp); - ASSERT(aggr_grp_cnt > 0); - aggr_grp_cnt--; + VERIFY(mac_unregister(grp->lg_mh) == 0); + grp->lg_mh = NULL; - rw_exit(&aggr_grp_lock); AGGR_GRP_REFRELE(grp); - return (0); } @@ -1120,6 +1472,7 @@ void aggr_grp_free(aggr_grp_t *grp) { ASSERT(grp->lg_refs == 0); + ASSERT(grp->lg_port_ref == 0); if (grp->lg_key > AGGR_MAX_KEY) { id_free(key_ids, grp->lg_key); grp->lg_key = 0; @@ -1134,6 +1487,7 @@ aggr_grp_info(datalink_id_t linkid, void *fn_arg, { aggr_grp_t *grp; aggr_port_t *port; + mac_perim_handle_t mph, pmph; int rc = 0; rw_enter(&aggr_grp_lock, RW_READER); @@ -1143,8 +1497,10 @@ aggr_grp_info(datalink_id_t linkid, void *fn_arg, rw_exit(&aggr_grp_lock); return (ENOENT); } + AGGR_GRP_REFHOLD(grp); - rw_enter(&grp->lg_lock, RW_READER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); + rw_exit(&aggr_grp_lock); rc = new_grp_fn(fn_arg, grp->lg_linkid, (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, @@ -1155,32 +1511,21 @@ aggr_grp_info(datalink_id_t linkid, void *fn_arg, goto bail; for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - rw_enter(&port->lp_lock, RW_READER); + mac_perim_enter_by_mh(port->lp_mh, &pmph); rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, port->lp_state, &port->lp_lacp.ActorOperPortState); - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); if (rc != 0) goto bail; } bail: - rw_exit(&grp->lg_lock); - rw_exit(&aggr_grp_lock); + mac_perim_exit(mph); + AGGR_GRP_REFRELE(grp); return (rc); } -static void -aggr_m_resources(void *arg) -{ - aggr_grp_t *grp = arg; - aggr_port_t *port; - - /* Call each port's m_resources function */ - for (port = grp->lg_ports; port != NULL; port = port->lp_next) - mac_resources(port->lp_mh); -} - /*ARGSUSED*/ static void aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) @@ -1230,10 +1575,11 @@ aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) static int aggr_m_stat(void *arg, uint_t stat, uint64_t *val) { - aggr_grp_t *grp = arg; - int rval = 0; + aggr_grp_t *grp = arg; + mac_perim_handle_t mph; + int rval = 0; - rw_enter(&grp->lg_lock, RW_READER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); switch (stat) { case MAC_STAT_IFSPEED: @@ -1253,7 +1599,7 @@ aggr_m_stat(void *arg, uint_t stat, uint64_t *val) rval = aggr_grp_stat(grp, stat, val); } - rw_exit(&grp->lg_lock); + mac_perim_exit(mph); return (rval); } @@ -1262,9 +1608,9 @@ aggr_m_start(void *arg) { aggr_grp_t *grp = arg; aggr_port_t *port; + mac_perim_handle_t mph, pmph; - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); /* * Attempts to start all configured members of the group. @@ -1272,23 +1618,27 @@ aggr_m_start(void *arg) * is received. */ for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &pmph); if (aggr_port_start(port) != 0) { - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); continue; } - /* set port promiscuous mode */ - if (aggr_port_promisc(port, grp->lg_promisc) != 0) - aggr_port_stop(port); - rw_exit(&port->lp_lock); + /* + * Turn on the promiscuous mode if it is required to receive + * the non-primary address over a port, or the promiscous + * mode is enabled over the aggr. 
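
The promiscuity decision above (and its mirror image in the aggr_grp_rem_ports() rollback earlier) reduces to one predicate; a hypothetical helper capturing it, using only fields visible in this patch:

	static boolean_t
	aggr_port_needs_promisc(aggr_grp_t *grp, aggr_port_t *port)
	{
		/* aggr itself promiscuous, or non-primary unicast addrs in use */
		return (grp->lg_promisc || port->lp_prom_addr != NULL);
	}
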
+ */ + if (grp->lg_promisc || port->lp_prom_addr != NULL) { + if (aggr_port_promisc(port, B_TRUE) != 0) + aggr_port_stop(port); + } + mac_perim_exit(pmph); } grp->lg_started = B_TRUE; - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - + mac_perim_exit(mph); return (0); } @@ -1297,21 +1647,22 @@ aggr_m_stop(void *arg) { aggr_grp_t *grp = arg; aggr_port_t *port; + mac_perim_handle_t mph, pmph; - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - rw_enter(&port->lp_lock, RW_WRITER); - aggr_lacp_port_detached(port); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* reset port promiscuous mode */ + (void) aggr_port_promisc(port, B_FALSE); + aggr_port_stop(port); - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); } grp->lg_started = B_FALSE; - - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); + mac_perim_exit(mph); } static int @@ -1320,10 +1671,10 @@ aggr_m_promisc(void *arg, boolean_t on) aggr_grp_t *grp = arg; aggr_port_t *port; boolean_t link_state_changed = B_FALSE; + mac_perim_handle_t mph, pmph; - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); AGGR_GRP_REFHOLD(grp); + mac_perim_enter_by_mh(grp->lg_mh, &mph); ASSERT(!grp->lg_closing); @@ -1331,25 +1682,30 @@ aggr_m_promisc(void *arg, boolean_t on) goto bail; for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - rw_enter(&port->lp_lock, RW_WRITER); + int err = 0; + + mac_perim_enter_by_mh(port->lp_mh, &pmph); AGGR_PORT_REFHOLD(port); - if (port->lp_started) { - if (aggr_port_promisc(port, on) != 0) { - if (aggr_grp_detach_port(grp, port, B_TRUE)) - link_state_changed = B_TRUE; - } else { - /* - * If a port was detached because of a previous - * failure changing the promiscuity, the port - * is reattached when it successfully changes - * the promiscuity now, and this might cause - * the link state of the aggregation to change. - */ - if (aggr_grp_attach_port(grp, port)) - link_state_changed = B_TRUE; - } + if (!on && (port->lp_prom_addr == NULL)) + err = aggr_port_promisc(port, B_FALSE); + else if (on && port->lp_started) + err = aggr_port_promisc(port, B_TRUE); + + if (err != 0) { + if (aggr_grp_detach_port(grp, port)) + link_state_changed = B_TRUE; + } else { + /* + * If a port was detached because of a previous + * failure changing the promiscuity, the port + * is reattached when it successfully changes + * the promiscuity now, and this might cause + * the link state of the aggregation to change. + */ + if (aggr_grp_attach_port(grp, port)) + link_state_changed = B_TRUE; } - rw_exit(&port->lp_lock); + mac_perim_exit(pmph); AGGR_PORT_REFRELE(port); } @@ -1359,13 +1715,49 @@ aggr_m_promisc(void *arg, boolean_t on) mac_link_update(grp->lg_mh, grp->lg_link_state); bail: - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); + mac_perim_exit(mph); AGGR_GRP_REFRELE(grp); return (0); } +static void +aggr_grp_port_rename(const char *new_name, void *arg) +{ + /* + * aggr port's mac client name is the format of "aggr link name" plus + * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
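
A concrete example (hypothetical link names; the delimiter is whatever AGGR_PORT_NAME_DELIMIT expands to, assumed '-' here): renaming aggr foo0 to bar0 rewrites only the prefix of each port's MAC client name:

	"foo0-bge0"  ->  "bar0-bge0"
	"foo0-bge1"  ->  "bar0-bge1"
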
+ */
+	int aggr_len, link_len, clnt_name_len, i;
+	char *str_end, *str_st, *str_del;
+	char aggr_name[MAXNAMELEN];
+	char link_name[MAXNAMELEN];
+	char *clnt_name;
+	aggr_grp_t *aggr_grp = arg;
+	aggr_port_t *aggr_port = aggr_grp->lg_ports;
+
+	for (i = 0; i < aggr_grp->lg_nports; i++) {
+		clnt_name = mac_client_name(aggr_port->lp_mch);
+		clnt_name_len = strlen(clnt_name);
+		str_st = clnt_name;
+		str_end = &(clnt_name[clnt_name_len]);
+		str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
+		ASSERT(str_del != NULL);
+		aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
+		link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
+		bzero(aggr_name, MAXNAMELEN);
+		bzero(link_name, MAXNAMELEN);
+		bcopy(clnt_name, aggr_name, aggr_len);
+		bcopy(str_del, link_name, link_len + 1);
+		bzero(clnt_name, MAXNAMELEN);
+		(void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
+		    link_name);
+
+		(void) mac_rename_primary(aggr_port->lp_mh, NULL);
+		aggr_port = aggr_port->lp_next;
+	}
+}
+
 /*
  * Initialize the capabilities that are advertised for the group
  * according to the capabilities of the constituent ports.
@@ -1381,51 +1773,245 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
 		*hcksum_txflags = grp->lg_hcksum_txflags;
 		break;
 	}
-	case MAC_CAPAB_POLL:
-		/*
-		 * There's nothing for us to fill in, we simply return
-		 * B_TRUE or B_FALSE to represent the group's support
-		 * status for this capability.
-		 */
-		return (grp->lg_gldv3_polling);
 	case MAC_CAPAB_NO_NATIVEVLAN:
 		return (!grp->lg_vlan);
 	case MAC_CAPAB_NO_ZCOPY:
 		return (!grp->lg_zcopy);
+	case MAC_CAPAB_RINGS: {
+		mac_capab_rings_t *cap_rings = cap_data;
+
+		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
+			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
+			cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
+			cap_rings->mr_rget = aggr_fill_ring;
+
+			/*
+			 * An aggregation advertises only one (pseudo) RX
+			 * group, which virtualizes the main/primary group of
+			 * the underlying devices.
+			 */
+			cap_rings->mr_gnum = 1;
+			cap_rings->mr_gget = aggr_fill_group;
+			cap_rings->mr_gaddring = NULL;
+			cap_rings->mr_gremring = NULL;
+		} else {
+			return (B_FALSE);
+		}
+		break;
+	}
+	case MAC_CAPAB_AGGR:
+	{
+		mac_capab_aggr_t *aggr_cap;
+
+		if (cap_data != NULL) {
+			aggr_cap = cap_data;
+			aggr_cap->mca_rename_fn = aggr_grp_port_rename;
+			aggr_cap->mca_unicst = aggr_m_unicst;
+		}
+		return (B_TRUE);
+	}
 	default:
 		return (B_FALSE);
 	}
 	return (B_TRUE);
 }
 
+/*
+ * Callback function for MAC layer to register groups.
+ */
+static void
+aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
+    mac_group_info_t *infop, mac_group_handle_t gh)
+{
+	aggr_grp_t *grp = arg;
+	aggr_pseudo_rx_group_t *rx_group;
+
+	ASSERT(rtype == MAC_RING_TYPE_RX && index == 0);
+	rx_group = &grp->lg_rx_group;
+	rx_group->arg_gh = gh;
+	rx_group->arg_grp = grp;
+
+	infop->mgi_driver = (mac_group_driver_t)rx_group;
+	infop->mgi_start = NULL;
+	infop->mgi_stop = NULL;
+	infop->mgi_addmac = aggr_addmac;
+	infop->mgi_remmac = aggr_remmac;
+	infop->mgi_count = rx_group->arg_ring_cnt;
+}
+
+/*
+ * Callback function for MAC layer to register all rings.
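
Between the two fill callbacks it is worth sketching who calls them. Roughly (an editorial sketch of the flow implied by the MAC_CAPAB_RINGS answer above, not actual mac-layer source):

	/*
	 * After mac_register(), the MAC layer queries MAC_CAPAB_RINGS and
	 * then has the driver describe the advertised topology:
	 */
	mr_gget(driver, MAC_RING_TYPE_RX, 0, &group_info, group_handle);
		/* -> aggr_fill_group(): the one static pseudo RX group */
	for (i = 0; i < mr_rnum; i++)
		mr_rget(driver, MAC_RING_TYPE_RX, 0, i, &ring_info, ring_handle);
		/* -> aggr_fill_ring(): one pseudo ring per underlying HW ring */
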
+ */ +static void +aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + aggr_grp_t *grp = arg; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_ring_t *rx_ring; + mac_intr_t aggr_mac_intr; + + ASSERT(rg_index == 0); + + ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt)); + rx_ring = rx_group->arg_rings + index; + rx_ring->arr_rh = rh; + + /* + * Entrypoint to enable interrupt (disable poll) and + * disable interrupt (enable poll). + */ + aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; + aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; + aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; + + infop->mri_driver = (mac_ring_driver_t)rx_ring; + infop->mri_start = aggr_pseudo_start_ring; + infop->mri_stop = aggr_pseudo_stop_ring; + + infop->mri_intr = aggr_mac_intr; + infop->mri_poll = aggr_rx_poll; + break; + } + default: + break; + } +} + +static mblk_t * +aggr_rx_poll(void *arg, int bytes_to_pickup) +{ + aggr_pseudo_rx_ring_t *rr_ring = arg; + aggr_port_t *port = rr_ring->arr_port; + aggr_grp_t *grp = port->lp_grp; + mblk_t *mp_chain, *mp, **mpp; + + mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); + + if (grp->lg_lacp_mode == AGGR_LACP_OFF) + return (mp_chain); + + mpp = &mp_chain; + while ((mp = *mpp) != NULL) { + if (MBLKL(mp) >= sizeof (struct ether_header)) { + struct ether_header *ehp; + + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { + *mpp = mp->b_next; + mp->b_next = NULL; + aggr_recv_lacp(port, + (mac_resource_handle_t)rr_ring, mp); + continue; + } + } + + if (!port->lp_collector_enabled) { + *mpp = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + continue; + } + mpp = &mp->b_next; + } + return (mp_chain); +} + static int -aggr_grp_multicst(aggr_grp_t *grp, boolean_t add, const uint8_t *addrp) +aggr_addmac(void *arg, const uint8_t *mac_addr) { - lg_mcst_addr_t *mcst, **ppmcst; + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; + aggr_unicst_addr_t *addr, **pprev; + aggr_grp_t *grp = rx_group->arg_grp; + aggr_port_t *port, *p; + mac_perim_handle_t mph; + int err = 0; + + mac_perim_enter_by_mh(grp->lg_mh, &mph); + + if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { + mac_perim_exit(mph); + return (0); + } - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + /* + * Insert this mac address into the list of mac addresses owned by + * the aggregation pseudo group. 
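
Backing up to aggr_rx_poll() above: the chain walk there is the standard pointer-to-pointer unlink, the same idiom the unicast-address list walks in aggr_addmac()/aggr_remmac() use. Reduced to a self-contained helper (hypothetical; keep() supplied by the caller, mblk_t from <sys/stream.h>):

	static mblk_t *
	filter_chain(mblk_t *head, boolean_t (*keep)(mblk_t *))
	{
		mblk_t **mpp = &head, *mp;

		while ((mp = *mpp) != NULL) {
			if (keep(mp)) {
				mpp = &mp->b_next;	/* advance in place */
			} else {
				*mpp = mp->b_next;	/* unlink */
				mp->b_next = NULL;
				freemsg(mp);
			}
		}
		return (head);
	}

Because mpp points at the previous link's b_next (or at head), removal never needs a separate "previous element" variable.
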
+ */ + pprev = &rx_group->arg_macaddr; + while ((addr = *pprev) != NULL) { + if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { + mac_perim_exit(mph); + return (EEXIST); + } + pprev = &addr->aua_next; + } + addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); + bcopy(mac_addr, addr->aua_addr, ETHERADDRL); + addr->aua_next = NULL; + *pprev = addr; - for (ppmcst = &(grp->lg_mcst_list); (mcst = *ppmcst) != NULL; - ppmcst = &(mcst->lg_mcst_nextp)) { - if (bcmp(mcst->lg_mcst_addr, addrp, MAXMACADDRLEN) == 0) + for (port = grp->lg_ports; port != NULL; port = port->lp_next) + if ((err = aggr_port_addmac(port, mac_addr)) != 0) break; + + if (err != 0) { + for (p = grp->lg_ports; p != port; p = p->lp_next) + aggr_port_remmac(p, mac_addr); + + *pprev = NULL; + kmem_free(addr, sizeof (aggr_unicst_addr_t)); } - if (add) { - if (mcst != NULL) - return (0); - mcst = kmem_zalloc(sizeof (lg_mcst_addr_t), KM_NOSLEEP); - if (mcst == NULL) - return (ENOMEM); - bcopy(addrp, mcst->lg_mcst_addr, MAXMACADDRLEN); - *ppmcst = mcst; - } else { - if (mcst == NULL) - return (ENOENT); - *ppmcst = mcst->lg_mcst_nextp; - kmem_free(mcst, sizeof (lg_mcst_addr_t)); + mac_perim_exit(mph); + return (err); +} + +static int +aggr_remmac(void *arg, const uint8_t *mac_addr) +{ + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; + aggr_unicst_addr_t *addr, **pprev; + aggr_grp_t *grp = rx_group->arg_grp; + aggr_port_t *port; + mac_perim_handle_t mph; + int err = 0; + + mac_perim_enter_by_mh(grp->lg_mh, &mph); + + if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { + mac_perim_exit(mph); + return (0); } - return (0); + + /* + * Insert this mac address into the list of mac addresses owned by + * the aggregation pseudo group. + */ + pprev = &rx_group->arg_macaddr; + while ((addr = *pprev) != NULL) { + if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { + pprev = &addr->aua_next; + continue; + } + break; + } + if (addr == NULL) { + mac_perim_exit(mph); + return (EINVAL); + } + + for (port = grp->lg_ports; port != NULL; port = port->lp_next) + aggr_port_remmac(port, mac_addr); + + *pprev = addr->aua_next; + kmem_free(addr, sizeof (aggr_unicst_addr_t)); + + mac_perim_exit(mph); + return (err); } /* @@ -1438,17 +2024,14 @@ void aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) { aggr_grp_t *grp = port->lp_grp; - lg_mcst_addr_t *mcst; - ASSERT(RW_WRITE_HELD(&port->lp_lock)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); if (!port->lp_started) return; - for (mcst = grp->lg_mcst_list; mcst != NULL; - mcst = mcst->lg_mcst_nextp) - (void) aggr_port_multicst(port, add, mcst->lg_mcst_addr); + mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); } static int @@ -1456,19 +2039,18 @@ aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) { aggr_grp_t *grp = arg; aggr_port_t *port = NULL; + mac_perim_handle_t mph; int err = 0, cerr; - rw_enter(&grp->lg_lock, RW_WRITER); + mac_perim_enter_by_mh(grp->lg_mh, &mph); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { if (port->lp_state != AGGR_PORT_STATE_ATTACHED) continue; cerr = aggr_port_multicst(port, add, addrp); - if (cerr == 0) - (void) aggr_grp_multicst(grp, add, addrp); if (cerr != 0 && err == 0) err = cerr; } - rw_exit(&grp->lg_lock); + mac_perim_exit(mph); return (err); } @@ -1476,16 +2058,14 @@ static int aggr_m_unicst(void *arg, const uint8_t *macaddr) { aggr_grp_t *grp = arg; - int rc; + mac_perim_handle_t mph; + int 
err; - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - rc = aggr_grp_modify(0, grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, + mac_perim_enter_by_mh(grp->lg_mh, &mph); + err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 0, 0); - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - - return (rc); + mac_perim_exit(mph); + return (err); } /* @@ -1498,11 +2078,10 @@ aggr_grp_capab_set(aggr_grp_t *grp) uint32_t cksum; aggr_port_t *port; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(grp->lg_mh == NULL); ASSERT(grp->lg_ports != NULL); grp->lg_hcksum_txflags = (uint32_t)-1; - grp->lg_gldv3_polling = B_TRUE; grp->lg_zcopy = B_TRUE; grp->lg_vlan = B_TRUE; @@ -1516,9 +2095,6 @@ aggr_grp_capab_set(aggr_grp_t *grp) grp->lg_zcopy &= !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); - - grp->lg_gldv3_polling &= - mac_capab_get(port->lp_mh, MAC_CAPAB_POLL, NULL); } } @@ -1551,11 +2127,6 @@ aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) return (B_FALSE); } - if (mac_capab_get(port->lp_mh, MAC_CAPAB_POLL, NULL) != - grp->lg_gldv3_polling) { - return (B_FALSE); - } - return (B_TRUE); } @@ -1568,7 +2139,7 @@ aggr_grp_max_sdu(aggr_grp_t *grp) uint_t max_sdu = (uint_t)-1; aggr_port_t *port; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(grp->lg_mh == NULL); ASSERT(grp->lg_ports != NULL); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { @@ -1605,7 +2176,7 @@ aggr_grp_max_margin(aggr_grp_t *grp) uint32_t margin = UINT32_MAX; aggr_port_t *port; - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(grp->lg_mh == NULL); ASSERT(grp->lg_ports != NULL); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { diff --git a/usr/src/uts/common/io/aggr/aggr_lacp.c b/usr/src/uts/common/io/aggr/aggr_lacp.c index 09330f8df1..0916533c48 100644 --- a/usr/src/uts/common/io/aggr/aggr_lacp.c +++ b/usr/src/uts/common/io/aggr/aggr_lacp.c @@ -29,8 +29,10 @@ #include <sys/types.h> #include <sys/sysmacros.h> +#include <sys/callb.h> #include <sys/conf.h> #include <sys/cmn_err.h> +#include <sys/disp.h> #include <sys/list.h> #include <sys/ksynch.h> #include <sys/kmem.h> @@ -97,8 +99,8 @@ typedef struct lacp_sel_ports { static lacp_sel_ports_t *sel_ports = NULL; static kmutex_t lacp_sel_lock; -static void periodic_timer_pop_locked(aggr_port_t *); static void periodic_timer_pop(void *); +static void periodic_timer_pop_handler(aggr_port_t *); static void lacp_xmit_sm(aggr_port_t *); static void lacp_periodic_sm(aggr_port_t *); static void fill_lacp_pdu(aggr_port_t *, lacp_t *); @@ -108,16 +110,18 @@ static void lacp_off(aggr_port_t *); static boolean_t valid_lacp_pdu(aggr_port_t *, lacp_t *); static void lacp_receive_sm(aggr_port_t *, lacp_t *); static void aggr_set_coll_dist(aggr_port_t *, boolean_t); -static void aggr_set_coll_dist_locked(aggr_port_t *, boolean_t); static void start_wait_while_timer(aggr_port_t *); static void stop_wait_while_timer(aggr_port_t *); static void lacp_reset_port(aggr_port_t *); static void stop_current_while_timer(aggr_port_t *); static void current_while_timer_pop(void *); +static void current_while_timer_pop_handler(aggr_port_t *); static void update_default_selected(aggr_port_t *); static boolean_t update_selected(aggr_port_t *, lacp_t *); static boolean_t lacp_sel_ports_add(aggr_port_t *); static void lacp_sel_ports_del(aggr_port_t *); +static void wait_while_timer_pop(void *); +static void wait_while_timer_pop_handler(aggr_port_t *); void aggr_lacp_init(void) @@ -132,13 +136,96 @@ aggr_lacp_fini(void) } /* + * The following 
functions are used for handling LACP timers. + * + * Note that we cannot fully rely on the aggr's mac perimeter in the timeout + * handler routine, otherwise it may cause deadlock with the untimeout() call + * which is usually called with the mac perimeter held. Instead, a + * lacp_timer_lock mutex is introduced, which protects a bitwise flag + * (lacp_timer_bits). This flag is set/cleared by timeout()/stop_timer() + * routines and is checked by a dedicated thread, that executes the real + * timeout operation. + */ +static void +aggr_port_timer_thread(void *arg) +{ + aggr_port_t *port = arg; + aggr_lacp_port_t *pl = &port->lp_lacp; + aggr_grp_t *grp = port->lp_grp; + uint32_t lacp_timer_bits; + mac_perim_handle_t mph; + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, &pl->lacp_timer_lock, callb_generic_cpr, + "aggr_port_timer_thread"); + + mutex_enter(&pl->lacp_timer_lock); + + for (;;) { + + if ((lacp_timer_bits = pl->lacp_timer_bits) == 0) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&pl->lacp_timer_cv, &pl->lacp_timer_lock); + CALLB_CPR_SAFE_END(&cprinfo, &pl->lacp_timer_lock); + continue; + } + pl->lacp_timer_bits = 0; + + if (lacp_timer_bits & LACP_THREAD_EXIT) + break; + + if (lacp_timer_bits & LACP_PERIODIC_TIMEOUT) + pl->periodic_timer.id = 0; + if (lacp_timer_bits & LACP_WAIT_WHILE_TIMEOUT) + pl->wait_while_timer.id = 0; + if (lacp_timer_bits & LACP_CURRENT_WHILE_TIMEOUT) + pl->current_while_timer.id = 0; + + mutex_exit(&pl->lacp_timer_lock); + + mac_perim_enter_by_mh(grp->lg_mh, &mph); + if (port->lp_closing) { + mac_perim_exit(mph); + mutex_enter(&pl->lacp_timer_lock); + break; + } + + if (lacp_timer_bits & LACP_PERIODIC_TIMEOUT) + periodic_timer_pop_handler(port); + if (lacp_timer_bits & LACP_WAIT_WHILE_TIMEOUT) + wait_while_timer_pop_handler(port); + if (lacp_timer_bits & LACP_CURRENT_WHILE_TIMEOUT) + current_while_timer_pop_handler(port); + mac_perim_exit(mph); + + mutex_enter(&pl->lacp_timer_lock); + if (pl->lacp_timer_bits & LACP_THREAD_EXIT) + break; + } + + pl->lacp_timer_bits = 0; + pl->lacp_timer_thread = NULL; + cv_broadcast(&pl->lacp_timer_cv); + + /* CALLB_CPR_EXIT drops the lock */ + CALLB_CPR_EXIT(&cprinfo); + + /* + * Release the reference of the grp so aggr_grp_delete() can call + * mac_unregister() safely. + */ + aggr_grp_port_rele(port); + thread_exit(); +} + +/* * Set the port LACP state to SELECTED. Returns B_FALSE if the operation * could not be performed due to a memory allocation error, B_TRUE otherwise. 
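
Stepping back to the timer machinery introduced above: the stop side pairs with that worker thread as follows (a sketch mirroring stop_periodic_timer() below):

	mutex_enter(&pl->lacp_timer_lock);
	if ((id = pl->periodic_timer.id) != 0) {
		pl->lacp_timer_bits &= ~LACP_PERIODIC_TIMEOUT;
		pl->periodic_timer.id = 0;
	}
	mutex_exit(&pl->lacp_timer_lock);

	if (id != 0)
		(void) untimeout(id);

untimeout() is only ever called after lacp_timer_lock has been dropped, and a pop that already fired but has not yet been serviced is neutralized by clearing its bit, so the worker never runs a stale handler.
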
*/ static boolean_t lacp_port_select(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); if (!lacp_sel_ports_add(portp)) return (B_FALSE); @@ -152,7 +239,9 @@ lacp_port_select(aggr_port_t *portp) static void lacp_port_unselect(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_grp_t *grp = portp->lp_grp; + + ASSERT((grp->lg_mh == NULL) || MAC_PERIM_HELD(grp->lg_mh)); lacp_sel_ports_del(portp); portp->lp_lacp.sm.selected = AGGR_UNSELECTED; @@ -180,9 +269,8 @@ aggr_lacp_init_port(aggr_port_t *portp) aggr_grp_t *aggrp = portp->lp_grp; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(aggrp)); - ASSERT(RW_LOCK_HELD(&aggrp->lg_lock)); - ASSERT(RW_LOCK_HELD(&portp->lp_lock)); + ASSERT(aggrp->lg_mh == NULL || MAC_PERIM_HELD(aggrp->lg_mh)); + ASSERT(MAC_PERIM_HELD(portp->lp_mh)); /* actor port # */ pl->ActorPortNumber = portp->lp_portid; @@ -251,6 +339,25 @@ aggr_lacp_init_port(aggr_port_t *portp) pl->wait_while_timer.id = 0; pl->wait_while_timer.val = AGGREGATE_WAIT_TIME; + + pl->lacp_timer_bits = 0; + + mutex_init(&pl->lacp_timer_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&pl->lacp_timer_cv, NULL, CV_DRIVER, NULL); + + pl->lacp_timer_thread = thread_create(NULL, 0, aggr_port_timer_thread, + portp, 0, &p0, TS_RUN, minclsyspri); + + /* + * Hold a reference of the grp and the port and this reference will + * be release when the thread exits. + * + * The reference on the port is used for aggr_port_delete() to + * continue without waiting for the thread to exit; the reference + * on the grp is used for aggr_grp_delete() to wait for the thread + * to exit before calling mac_unregister(). + */ + aggr_grp_port_hold(portp); } /* @@ -264,7 +371,7 @@ lacp_reset_port(aggr_port_t *portp) { aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); pl->NTT = B_FALSE; /* need to transmit */ @@ -306,8 +413,8 @@ lacp_reset_port(aggr_port_t *portp) static void aggr_lacp_mcast_on(aggr_port_t *port) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(port->lp_grp)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return; @@ -319,8 +426,8 @@ aggr_lacp_mcast_on(aggr_port_t *port) static void aggr_lacp_mcast_off(aggr_port_t *port) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(port->lp_grp)); - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return; @@ -332,26 +439,35 @@ aggr_lacp_mcast_off(aggr_port_t *port) static void start_periodic_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if (portp->lp_lacp.periodic_timer.id == 0) { - portp->lp_lacp.periodic_timer.id = - timeout(periodic_timer_pop, portp, + mutex_enter(&pl->lacp_timer_lock); + if (pl->periodic_timer.id == 0) { + pl->periodic_timer.id = timeout(periodic_timer_pop, portp, drv_usectohz(1000000 * portp->lp_lacp.periodic_timer.val)); } + mutex_exit(&pl->lacp_timer_lock); } static void stop_periodic_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + timeout_id_t id; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if 
(portp->lp_lacp.periodic_timer.id != 0) { - AGGR_LACP_UNLOCK(portp->lp_grp); - (void) untimeout(portp->lp_lacp.periodic_timer.id); - AGGR_LACP_LOCK_WRITER(portp->lp_grp); - portp->lp_lacp.periodic_timer.id = 0; + mutex_enter(&pl->lacp_timer_lock); + if ((id = pl->periodic_timer.id) != 0) { + pl->lacp_timer_bits &= ~LACP_PERIODIC_TIMEOUT; + pl->periodic_timer.id = 0; } + mutex_exit(&pl->lacp_timer_lock); + + if (id != 0) + (void) untimeout(id); } /* @@ -360,13 +476,29 @@ stop_periodic_timer(aggr_port_t *portp) * LACPDU. We then set the periodic state and let * the periodic state machine restart the timer. */ +static void +periodic_timer_pop(void *data) +{ + aggr_port_t *portp = data; + aggr_lacp_port_t *pl = &portp->lp_lacp; + + mutex_enter(&pl->lacp_timer_lock); + pl->lacp_timer_bits |= LACP_PERIODIC_TIMEOUT; + cv_broadcast(&pl->lacp_timer_cv); + mutex_exit(&pl->lacp_timer_lock); +} +/* + * When the timer pops, we arrive here to + * clear out LACPDU count as well as transmit an + * LACPDU. We then set the periodic state and let + * the periodic state machine restart the timer. + */ static void -periodic_timer_pop_locked(aggr_port_t *portp) +periodic_timer_pop_handler(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - portp->lp_lacp.periodic_timer.id = NULL; portp->lp_lacp_stats.LACPDUsTx = 0; /* current timestamp */ @@ -390,19 +522,6 @@ periodic_timer_pop_locked(aggr_port_t *portp) lacp_periodic_sm(portp); } -static void -periodic_timer_pop(void *data) -{ - aggr_port_t *portp = data; - - if (portp->lp_closing) - return; - - AGGR_LACP_LOCK_WRITER(portp->lp_grp); - periodic_timer_pop_locked(portp); - AGGR_LACP_UNLOCK(portp->lp_grp); -} - /* * Invoked from: * - startup upon aggregation @@ -417,7 +536,7 @@ lacp_periodic_sm(aggr_port_t *portp) lacp_periodic_state_t oldstate = portp->lp_lacp.sm.periodic_state; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on) { @@ -465,7 +584,7 @@ lacp_periodic_sm(aggr_port_t *portp) * a LACPDU. */ stop_periodic_timer(portp); - periodic_timer_pop_locked(portp); + periodic_timer_pop_handler(portp); } /* Rearm timer with value provided by partner */ @@ -483,9 +602,8 @@ lacp_xmit_sm(aggr_port_t *portp) size_t len; mblk_t *mp; hrtime_t now, elapsed; - const mac_txinfo_t *mtp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on || !pl->NTT || !portp->lp_started) @@ -534,12 +652,7 @@ lacp_xmit_sm(aggr_port_t *portp) fill_lacp_pdu(portp, (lacp_t *)(mp->b_rptr + sizeof (struct ether_header))); - /* - * Store the transmit info pointer locally in case it changes between - * loading mt_fn and mt_arg. 
- */ - mtp = portp->lp_txinfo; - mtp->mt_fn(mtp->mt_arg, mp); + (void) mac_tx(portp->lp_mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL); pl->NTT = B_FALSE; portp->lp_lacp_stats.LACPDUsTx++; @@ -563,15 +676,14 @@ fill_lacp_pdu(aggr_port_t *portp, lacp_t *lacp) { aggr_lacp_port_t *pl = &portp->lp_lacp; aggr_grp_t *aggrp = portp->lp_grp; + mac_perim_handle_t pmph; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); + mac_perim_enter_by_mh(portp->lp_mh, &pmph); lacp->subtype = LACP_SUBTYPE; lacp->version = LACP_VERSION; - rw_enter(&aggrp->lg_lock, RW_READER); - rw_enter(&portp->lp_lock, RW_READER); - /* * Actor Information */ @@ -609,8 +721,7 @@ fill_lacp_pdu(aggr_port_t *portp, lacp_t *lacp) lacp->tlv_terminator = TERMINATOR_TLV; lacp->terminator_len = 0x0; - rw_exit(&portp->lp_lock); - rw_exit(&aggrp->lg_lock); + mac_perim_exit(pmph); } /* @@ -633,7 +744,7 @@ lacp_mux_sm(aggr_port_t *portp) aggr_lacp_port_t *pl = &portp->lp_lacp; lacp_mux_state_t oldstate = pl->sm.mux_state; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(aggrp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on) { @@ -788,29 +899,28 @@ again: } /* lacp_mux_sm */ -static void +static int receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) { marker_pdu_t *markerp = (marker_pdu_t *)mp->b_rptr; - const mac_txinfo_t *mtp; - AGGR_LACP_LOCK_WRITER(portp->lp_grp); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); AGGR_LACP_DBG(("trunk link: (%d): MARKER PDU received:\n", portp->lp_linkid)); /* LACP_OFF state not in specification so check here. */ if (!portp->lp_lacp.sm.lacp_on) - goto bail; + return (-1); if (MBLKL(mp) < sizeof (marker_pdu_t)) - goto bail; + return (-1); if (markerp->version != MARKER_VERSION) { AGGR_LACP_DBG(("trunk link (%d): Malformed MARKER PDU: " "version = %d does not match s/w version %d\n", portp->lp_linkid, markerp->version, MARKER_VERSION)); - goto bail; + return (-1); } if (markerp->tlv_marker == MARKER_RESPONSE_TLV) { @@ -818,21 +928,21 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) AGGR_LACP_DBG(("trunk link (%d): MARKER RESPONSE PDU: " " MARKER TLV = %d - We don't send out info type!\n", portp->lp_linkid, markerp->tlv_marker)); - goto bail; + return (-1); } if (markerp->tlv_marker != MARKER_INFO_TLV) { AGGR_LACP_DBG(("trunk link (%d): Malformed MARKER PDU: " " MARKER TLV = %d \n", portp->lp_linkid, markerp->tlv_marker)); - goto bail; + return (-1); } if (markerp->marker_len != MARKER_INFO_RESPONSE_LENGTH) { AGGR_LACP_DBG(("trunk link (%d): Malformed MARKER PDU: " " MARKER length = %d \n", portp->lp_linkid, markerp->marker_len)); - goto bail; + return (-1); } if (markerp->requestor_port != portp->lp_lacp.PartnerOperPortNum) { @@ -840,7 +950,7 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) " MARKER Port %d not equal to Partner port %d\n", portp->lp_linkid, markerp->requestor_port, portp->lp_lacp.PartnerOperPortNum)); - goto bail; + return (-1); } if (ether_cmp(&markerp->system_id, @@ -848,7 +958,7 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) AGGR_LACP_DBG(("trunk link (%d): MARKER PDU: " " MARKER MAC not equal to Partner MAC\n", portp->lp_linkid)); - goto bail; + return (-1); } /* @@ -861,23 +971,9 @@ receive_marker_pdu(aggr_port_t *portp, mblk_t *mp) ASSERT(MBLKHEAD(mp) >= sizeof (struct ether_header)); mp->b_rptr -= sizeof (struct ether_header); fill_lacp_ether(portp, (struct ether_header *)mp->b_rptr); - - /* - * Store the transmit info pointer locally in case it changes between - * loading 
mt_fn and mt_arg. - */ - mtp = portp->lp_txinfo; - AGGR_LACP_UNLOCK(portp->lp_grp); - - mtp->mt_fn(mtp->mt_arg, mp); - return; - -bail: - AGGR_LACP_UNLOCK(portp->lp_grp); - freemsg(mp); + return (0); } - /* * Update the LACP mode (off, active, or passive) of the specified group. */ @@ -887,8 +983,8 @@ aggr_lacp_update_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode) aggr_lacp_mode_t old_mode = grp->lg_lacp_mode; aggr_port_t *port; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(!grp->lg_closing); if (mode == old_mode) return; @@ -904,20 +1000,12 @@ aggr_lacp_update_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode) /* OFF -> {PASSIVE,ACTIVE} */ /* turn OFF Collector_Distributor */ aggr_set_coll_dist(port, B_FALSE); - rw_enter(&port->lp_lock, RW_WRITER); lacp_on(port); - if (port->lp_state == AGGR_PORT_STATE_ATTACHED) - aggr_lacp_port_attached(port); - rw_exit(&port->lp_lock); } else if (mode == AGGR_LACP_OFF) { /* {PASSIVE,ACTIVE} -> OFF */ - rw_enter(&port->lp_lock, RW_WRITER); lacp_off(port); - rw_exit(&port->lp_lock); - if (!grp->lg_closing) { - /* Turn ON Collector_Distributor */ - aggr_set_coll_dist(port, B_TRUE); - } + /* Turn ON Collector_Distributor */ + aggr_set_coll_dist(port, B_TRUE); } else { /* PASSIVE->ACTIVE or ACTIVE->PASSIVE */ port->lp_lacp.sm.begin = B_TRUE; @@ -928,9 +1016,6 @@ aggr_lacp_update_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode) lacp_receive_sm(port, NULL); lacp_mux_sm(port); } - - if (grp->lg_closing) - break; } } @@ -943,8 +1028,7 @@ aggr_lacp_update_timer(aggr_grp_t *grp, aggr_lacp_timer_t timer) { aggr_port_t *port; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); if (timer == grp->aggr.PeriodicTimer) return; @@ -958,6 +1042,32 @@ aggr_lacp_update_timer(aggr_grp_t *grp, aggr_lacp_timer_t timer) } } +void +aggr_port_lacp_set_mode(aggr_grp_t *grp, aggr_port_t *port) +{ + aggr_lacp_mode_t mode; + aggr_lacp_timer_t timer; + + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + + mode = grp->lg_lacp_mode; + timer = grp->aggr.PeriodicTimer; + + port->lp_lacp.ActorAdminPortState.bit.activity = + port->lp_lacp.ActorOperPortState.bit.activity = + (mode == AGGR_LACP_ACTIVE); + + port->lp_lacp.ActorAdminPortState.bit.timeout = + port->lp_lacp.ActorOperPortState.bit.timeout = + (timer == AGGR_LACP_TIMER_SHORT); + + if (mode == AGGR_LACP_OFF) { + /* Turn ON Collector_Distributor */ + aggr_set_coll_dist(port, B_TRUE); + } else { /* LACP_ACTIVE/PASSIVE */ + lacp_on(port); + } +} /* * Sets the initial LACP mode (off, active, passive) and LACP timer @@ -969,30 +1079,13 @@ aggr_lacp_set_mode(aggr_grp_t *grp, aggr_lacp_mode_t mode, { aggr_port_t *port; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); grp->lg_lacp_mode = mode; grp->aggr.PeriodicTimer = timer; - for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - port->lp_lacp.ActorAdminPortState.bit.activity = - port->lp_lacp.ActorOperPortState.bit.activity = - (mode == AGGR_LACP_ACTIVE); - - port->lp_lacp.ActorAdminPortState.bit.timeout = - port->lp_lacp.ActorOperPortState.bit.timeout = - (timer == AGGR_LACP_TIMER_SHORT); - - if (grp->lg_lacp_mode == AGGR_LACP_OFF) { - /* Turn ON Collector_Distributor */ - aggr_set_coll_dist(port, B_TRUE); - } else { /* LACP_ACTIVE/PASSIVE */ - rw_enter(&port->lp_lock, RW_WRITER); - lacp_on(port); - rw_exit(&port->lp_lock); - } - } + for (port = grp->lg_ports; port != NULL; port 
= port->lp_next) + aggr_port_lacp_set_mode(grp, port); } /* @@ -1148,7 +1241,7 @@ lacp_selection_logic(aggr_port_t *portp) boolean_t reset_mac = B_FALSE; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(aggrp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on) { @@ -1377,47 +1470,65 @@ static void wait_while_timer_pop(void *data) { aggr_port_t *portp = data; + aggr_lacp_port_t *pl = &portp->lp_lacp; - if (portp->lp_closing) - return; + mutex_enter(&pl->lacp_timer_lock); + pl->lacp_timer_bits |= LACP_WAIT_WHILE_TIMEOUT; + cv_broadcast(&pl->lacp_timer_cv); + mutex_exit(&pl->lacp_timer_lock); +} - AGGR_LACP_LOCK_WRITER(portp->lp_grp); +/* + * wait_while_timer_pop_handler - When the timer pops, we arrive here to + * set ready_n and trigger the selection logic. + */ +static void +wait_while_timer_pop_handler(aggr_port_t *portp) +{ + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); AGGR_LACP_DBG(("trunk link:(%d): wait_while_timer pop \n", portp->lp_linkid)); - portp->lp_lacp.wait_while_timer.id = 0; portp->lp_lacp.sm.ready_n = B_TRUE; lacp_selection_logic(portp); - AGGR_LACP_UNLOCK(portp->lp_grp); } static void start_wait_while_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if (portp->lp_lacp.wait_while_timer.id == 0) { - portp->lp_lacp.wait_while_timer.id = + mutex_enter(&pl->lacp_timer_lock); + if (pl->wait_while_timer.id == 0) { + pl->wait_while_timer.id = timeout(wait_while_timer_pop, portp, drv_usectohz(1000000 * portp->lp_lacp.wait_while_timer.val)); } + mutex_exit(&pl->lacp_timer_lock); } static void -stop_wait_while_timer(portp) -aggr_port_t *portp; +stop_wait_while_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + timeout_id_t id; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if (portp->lp_lacp.wait_while_timer.id != 0) { - AGGR_LACP_UNLOCK(portp->lp_grp); - (void) untimeout(portp->lp_lacp.wait_while_timer.id); - AGGR_LACP_LOCK_WRITER(portp->lp_grp); - portp->lp_lacp.wait_while_timer.id = 0; + mutex_enter(&pl->lacp_timer_lock); + if ((id = pl->wait_while_timer.id) != 0) { + pl->lacp_timer_bits &= ~LACP_WAIT_WHILE_TIMEOUT; + pl->wait_while_timer.id = 0; } + mutex_exit(&pl->lacp_timer_lock); + + if (id != 0) + (void) untimeout(id); } /* @@ -1432,52 +1543,30 @@ aggr_lacp_port_attached(aggr_port_t *portp) aggr_grp_t *grp = portp->lp_grp; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(portp->lp_mh)); ASSERT(portp->lp_state == AGGR_PORT_STATE_ATTACHED); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); AGGR_LACP_DBG(("aggr_lacp_port_attached: port %d\n", portp->lp_linkid)); portp->lp_lacp.sm.port_enabled = B_TRUE; /* link on */ - if (grp->lg_lacp_mode == AGGR_LACP_OFF) { - pl->ActorAdminPortState.bit.activity = - pl->ActorOperPortState.bit.activity = B_FALSE; - - /* Turn ON Collector_Distributor */ - aggr_set_coll_dist_locked(portp, B_TRUE); - + if (grp->lg_lacp_mode == AGGR_LACP_OFF) return; - } - - pl->ActorAdminPortState.bit.activity = - pl->ActorOperPortState.bit.activity = - (grp->lg_lacp_mode == AGGR_LACP_ACTIVE); - - pl->ActorAdminPortState.bit.timeout = - pl->ActorOperPortState.bit.timeout = - (grp->aggr.PeriodicTimer == AGGR_LACP_TIMER_SHORT); pl->sm.lacp_enabled = B_TRUE; 
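
	/*
	 * (Editorial note) The bring-up order below matters: sm.begin
	 * restarts every state machine from its initial state, the
	 * receive and mux machines are run once, the Slow Protocols
	 * multicast address is joined, and lacp_selection_logic() then
	 * kicks the periodic machine via the receive machine.
	 */
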
pl->ActorOperPortState.bit.aggregation = B_TRUE; pl->sm.begin = B_TRUE; - if (!pl->sm.lacp_on) { - /* Turn OFF Collector_Distributor */ - aggr_set_coll_dist_locked(portp, B_FALSE); - - lacp_on(portp); - } else { - lacp_receive_sm(portp, NULL); - lacp_mux_sm(portp); + lacp_receive_sm(portp, NULL); + lacp_mux_sm(portp); - /* Enable Multicast Slow Protocol address */ - aggr_lacp_mcast_on(portp); + /* Enable Multicast Slow Protocol address */ + aggr_lacp_mcast_on(portp); - /* periodic_sm is started up from the receive machine */ - lacp_selection_logic(portp); - } + /* periodic_sm is started up from the receive machine */ + lacp_selection_logic(portp); } /* @@ -1489,8 +1578,8 @@ aggr_lacp_port_detached(aggr_port_t *portp) { aggr_grp_t *grp = portp->lp_grp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(portp->lp_mh)); AGGR_LACP_DBG(("aggr_lacp_port_detached: port %d\n", portp->lp_linkid)); @@ -1500,34 +1589,35 @@ aggr_lacp_port_detached(aggr_port_t *portp) if (grp->lg_lacp_mode == AGGR_LACP_OFF) return; - /* Disable Slow Protocol PDUs */ - lacp_off(portp); -} - + portp->lp_lacp.sm.lacp_enabled = B_FALSE; + lacp_selection_logic(portp); + lacp_mux_sm(portp); + lacp_periodic_sm(portp); -/* - * Invoked after the outbound port selection policy has been changed. - */ -void -aggr_lacp_policy_changed(aggr_grp_t *grp) -{ - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + /* + * Disable Slow Protocol Timers. + */ + stop_periodic_timer(portp); + stop_current_while_timer(portp); + stop_wait_while_timer(portp); - /* suspend transmission for CollectorMaxDelay time */ - delay(grp->aggr.CollectorMaxDelay * 10); + /* Disable Multicast Slow Protocol address */ + aggr_lacp_mcast_off(portp); + aggr_set_coll_dist(portp, B_FALSE); } - /* * Enable Slow Protocol LACP and Marker PDUs. */ static void lacp_on(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - ASSERT(RW_WRITE_HELD(&portp->lp_grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + mac_perim_handle_t mph; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); + + mac_perim_enter_by_mh(portp->lp_mh, &mph); /* * Reset the state machines and Partner operational @@ -1535,67 +1625,69 @@ lacp_on(aggr_port_t *portp) * our link state. 
*/ lacp_reset_port(portp); - portp->lp_lacp.sm.lacp_on = B_TRUE; + pl->sm.lacp_on = B_TRUE; AGGR_LACP_DBG(("lacp_on:(%d): \n", portp->lp_linkid)); + if (portp->lp_state == AGGR_PORT_STATE_ATTACHED) { + pl->sm.port_enabled = B_TRUE; + pl->sm.lacp_enabled = B_TRUE; + pl->ActorOperPortState.bit.aggregation = B_TRUE; + } + lacp_receive_sm(portp, NULL); lacp_mux_sm(portp); - if (portp->lp_state != AGGR_PORT_STATE_ATTACHED) - return; - - /* Enable Multicast Slow Protocol address */ - aggr_lacp_mcast_on(portp); + if (portp->lp_state == AGGR_PORT_STATE_ATTACHED) { + /* Enable Multicast Slow Protocol address */ + aggr_lacp_mcast_on(portp); - /* periodic_sm is started up from the receive machine */ - lacp_selection_logic(portp); + /* periodic_sm is started up from the receive machine */ + lacp_selection_logic(portp); + } +done: + mac_perim_exit(mph); } /* lacp_on */ - /* Disable Slow Protocol LACP and Marker PDUs */ static void lacp_off(aggr_port_t *portp) { - aggr_grp_t *grp = portp->lp_grp; + aggr_lacp_port_t *pl = &portp->lp_lacp; + mac_perim_handle_t mph; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); + mac_perim_enter_by_mh(portp->lp_mh, &mph); - portp->lp_lacp.sm.lacp_on = B_FALSE; + pl->sm.lacp_on = B_FALSE; AGGR_LACP_DBG(("lacp_off:(%d): \n", portp->lp_linkid)); - /* - * Disable Slow Protocol Timers. We must temporarily release - * the group and port locks to avoid deadlocks. Make sure that - * neither the port nor group are closing after re-acquiring - * their locks. - */ - rw_exit(&portp->lp_lock); - rw_exit(&grp->lg_lock); - - stop_periodic_timer(portp); - stop_current_while_timer(portp); - stop_wait_while_timer(portp); + if (portp->lp_state == AGGR_PORT_STATE_ATTACHED) { + /* + * Disable Slow Protocol Timers. 
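
(A note on why this can now be done synchronously: the stop_*_timer() routines drop lacp_timer_lock before calling untimeout(), and the heavyweight pop handlers run in aggr_port_timer_thread(), which re-checks its pending bit under that lock, so the untimeout()-versus-perimeter deadlock described at the top of this file cannot arise on this path.)
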
+ */ + stop_periodic_timer(portp); + stop_current_while_timer(portp); + stop_wait_while_timer(portp); - rw_enter(&grp->lg_lock, RW_WRITER); - rw_enter(&portp->lp_lock, RW_WRITER); + /* Disable Multicast Slow Protocol address */ + aggr_lacp_mcast_off(portp); - if (!portp->lp_closing && !grp->lg_closing) { - lacp_mux_sm(portp); - lacp_periodic_sm(portp); - lacp_selection_logic(portp); + pl->sm.port_enabled = B_FALSE; + pl->sm.lacp_enabled = B_FALSE; + pl->ActorOperPortState.bit.aggregation = B_FALSE; } - /* Turn OFF Collector_Distributor */ - aggr_set_coll_dist_locked(portp, B_FALSE); + lacp_mux_sm(portp); + lacp_periodic_sm(portp); + lacp_selection_logic(portp); - /* Disable Multicast Slow Protocol address */ - aggr_lacp_mcast_off(portp); + /* Turn OFF Collector_Distributor */ + aggr_set_coll_dist(portp, B_FALSE); lacp_reset_port(portp); + mac_perim_exit(mph); } @@ -1627,61 +1719,71 @@ valid_lacp_pdu(aggr_port_t *portp, lacp_t *lacp) static void start_current_while_timer(aggr_port_t *portp, uint_t time) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - - if (portp->lp_lacp.current_while_timer.id == 0) { - if (time > 0) { - portp->lp_lacp.current_while_timer.val = time; - } else if (portp->lp_lacp.ActorOperPortState.bit.timeout) { - portp->lp_lacp.current_while_timer.val = - SHORT_TIMEOUT_TIME; - } else { - portp->lp_lacp.current_while_timer.val = - LONG_TIMEOUT_TIME; - } + aggr_lacp_port_t *pl = &portp->lp_lacp; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); + + mutex_enter(&pl->lacp_timer_lock); + if (pl->current_while_timer.id == 0) { + if (time > 0) + pl->current_while_timer.val = time; + else if (pl->ActorOperPortState.bit.timeout) + pl->current_while_timer.val = SHORT_TIMEOUT_TIME; + else + pl->current_while_timer.val = LONG_TIMEOUT_TIME; - portp->lp_lacp.current_while_timer.id = + pl->current_while_timer.id = timeout(current_while_timer_pop, portp, drv_usectohz((clock_t)1000000 * (clock_t)portp->lp_lacp.current_while_timer.val)); } + mutex_exit(&pl->lacp_timer_lock); } static void stop_current_while_timer(aggr_port_t *portp) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + aggr_lacp_port_t *pl = &portp->lp_lacp; + timeout_id_t id; + + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); - if (portp->lp_lacp.current_while_timer.id != 0) { - AGGR_LACP_UNLOCK(portp->lp_grp); - (void) untimeout(portp->lp_lacp.current_while_timer.id); - AGGR_LACP_LOCK_WRITER(portp->lp_grp); - portp->lp_lacp.current_while_timer.id = 0; + mutex_enter(&pl->lacp_timer_lock); + if ((id = pl->current_while_timer.id) != 0) { + pl->lacp_timer_bits &= ~LACP_CURRENT_WHILE_TIMEOUT; + pl->current_while_timer.id = 0; } -} + mutex_exit(&pl->lacp_timer_lock); + if (id != 0) + (void) untimeout(id); +} static void current_while_timer_pop(void *data) { aggr_port_t *portp = (aggr_port_t *)data; + aggr_lacp_port_t *pl = &portp->lp_lacp; - if (portp->lp_closing) - return; + mutex_enter(&pl->lacp_timer_lock); + pl->lacp_timer_bits |= LACP_CURRENT_WHILE_TIMEOUT; + cv_broadcast(&pl->lacp_timer_cv); + mutex_exit(&pl->lacp_timer_lock); +} - AGGR_LACP_LOCK_WRITER(portp->lp_grp); +static void +current_while_timer_pop_handler(aggr_port_t *portp) +{ + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); AGGR_LACP_DBG(("trunk link:(%d): current_while_timer " "pop id=%p\n", portp->lp_linkid, portp->lp_lacp.current_while_timer.id)); - portp->lp_lacp.current_while_timer.id = 0; lacp_receive_sm(portp, NULL); - AGGR_LACP_UNLOCK(portp->lp_grp); } - /* * record_Default - Simply copies over administrative values * to the partner operational 
values, and sets our state to indicate we @@ -1692,7 +1794,7 @@ record_Default(aggr_port_t *portp) { aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); pl->PartnerOperPortNum = pl->PartnerAdminPortNum; pl->PartnerOperPortPriority = pl->PartnerAdminPortPriority; @@ -1713,7 +1815,7 @@ record_PDU(aggr_port_t *portp, lacp_t *lacp) aggr_lacp_port_t *pl = &portp->lp_lacp; uint8_t save_sync; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); /* * Partner Information @@ -1780,7 +1882,7 @@ update_selected(aggr_port_t *portp, lacp_t *lacp) { aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); if ((pl->PartnerOperPortNum != ntohs(lacp->actor_info.port)) || (pl->PartnerOperPortPriority != @@ -1814,7 +1916,7 @@ update_default_selected(aggr_port_t *portp) { aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); if ((pl->PartnerAdminPortNum != pl->PartnerOperPortNum) || (pl->PartnerOperPortPriority != pl->PartnerAdminPortPriority) || @@ -1844,7 +1946,7 @@ update_NTT(aggr_port_t *portp, lacp_t *lacp) aggr_grp_t *aggrp = portp->lp_grp; aggr_lacp_port_t *pl = &portp->lp_lacp; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(aggrp->lg_mh)); if ((pl->ActorPortNumber != ntohs(lacp->partner_info.port)) || (pl->ActorPortPriority != @@ -1890,7 +1992,7 @@ lacp_receive_sm(aggr_port_t *portp, lacp_t *lacp) aggr_lacp_port_t *pl = &portp->lp_lacp; lacp_receive_state_t oldstate = pl->sm.receive_state; - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); + ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); /* LACP_OFF state not in specification so check here. */ if (!pl->sm.lacp_on) @@ -1918,7 +2020,6 @@ lacp_receive_sm(aggr_port_t *portp, lacp_t *lacp) pl->sm.receive_state = LACP_DEFAULTED; } - if (!((lacp && (oldstate == LACP_CURRENT) && (pl->sm.receive_state == LACP_CURRENT)))) { AGGR_LACP_DBG(("lacp_receive_sm(%d):%s--->%s\n", @@ -2068,28 +2169,19 @@ lacp_receive_sm(aggr_port_t *portp, lacp_t *lacp) static void aggr_set_coll_dist(aggr_port_t *portp, boolean_t enable) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - rw_enter(&portp->lp_lock, RW_WRITER); - aggr_set_coll_dist_locked(portp, enable); - rw_exit(&portp->lp_lock); -} - -static void -aggr_set_coll_dist_locked(aggr_port_t *portp, boolean_t enable) -{ - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(portp->lp_grp)); - ASSERT(RW_WRITE_HELD(&portp->lp_lock)); + mac_perim_handle_t mph; AGGR_LACP_DBG(("AGGR_SET_COLL_DIST_TYPE: (%d) %s\n", portp->lp_linkid, enable ? "ENABLED" : "DISABLED")); + mac_perim_enter_by_mh(portp->lp_mh, &mph); if (!enable) { /* * Turn OFF Collector_Distributor. */ portp->lp_collector_enabled = B_FALSE; aggr_send_port_disable(portp); - return; + goto done; } /* @@ -2102,14 +2194,21 @@ aggr_set_coll_dist_locked(aggr_port_t *portp, boolean_t enable) portp->lp_collector_enabled = B_TRUE; aggr_send_port_enable(portp); } + +done: + mac_perim_exit(mph); } /* - * Process a received Marker or LACPDU. + * Because the LACP packet processing needs to enter the aggr's mac perimeter + * and that would potentially cause a deadlock with the thread in which the + * grp/port is deleted, we defer the packet process to a worker thread. Here + * we only enqueue the received Marker or LACPDU for later processing. 
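
A userland analogue of this hand-off, in plain POSIX threads (hypothetical and self-contained; it models only the queue/flag/condvar shape, while the real code additionally parks a refheld port pointer in b_prev and processes each message under the aggr's perimeter):

	#include <pthread.h>
	#include <stdlib.h>

	struct msg { struct msg *next; /* payload elided */ };

	static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t q_cv = PTHREAD_COND_INITIALIZER;
	static struct msg *q_head, *q_tail;
	static int q_done;

	static void
	enqueue(struct msg *m)
	{
		pthread_mutex_lock(&q_lock);
		if (q_done) {			/* being torn down: drop */
			pthread_mutex_unlock(&q_lock);
			free(m);
			return;
		}
		m->next = NULL;
		if (q_tail == NULL)
			q_head = m;
		else
			q_tail->next = m;
		q_tail = m;
		pthread_cond_broadcast(&q_cv);
		pthread_mutex_unlock(&q_lock);
	}

	static void *
	worker(void *arg)
	{
		struct msg *m, *next;

		(void) arg;
		pthread_mutex_lock(&q_lock);
		while (!q_done) {
			if ((m = q_head) == NULL) {
				pthread_cond_wait(&q_cv, &q_lock);
				continue;
			}
			q_head = q_tail = NULL;
			pthread_mutex_unlock(&q_lock);
			for (; m != NULL; m = next) {	/* lock dropped */
				next = m->next;
				free(m);		/* "process" */
			}
			pthread_mutex_lock(&q_lock);
		}
		for (m = q_head; m != NULL; m = next) {	/* drain on exit */
			next = m->next;
			free(m);
		}
		q_head = q_tail = NULL;
		pthread_mutex_unlock(&q_lock);
		return (NULL);
	}

Processing with the lock dropped is what lets the producer (here, the receive path) keep enqueueing without blocking behind a slow consumer.
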
*/ void -aggr_lacp_rx(aggr_port_t *portp, mblk_t *dmp) +aggr_lacp_rx_enqueue(aggr_port_t *portp, mblk_t *dmp) { + aggr_grp_t *grp = portp->lp_grp; lacp_t *lacp; dmp->b_rptr += sizeof (struct ether_header); @@ -2120,34 +2219,143 @@ aggr_lacp_rx(aggr_port_t *portp, mblk_t *dmp) } lacp = (lacp_t *)dmp->b_rptr; + if (lacp->subtype != LACP_SUBTYPE && lacp->subtype != MARKER_SUBTYPE) { + AGGR_LACP_DBG(("aggr_lacp_rx_enqueue: (%d): " + "Unknown Slow Protocol type %d\n", + portp->lp_linkid, lacp->subtype)); + freemsg(dmp); + return; + } + + mutex_enter(&grp->lg_lacp_lock); + + /* + * If the lg_lacp_done is set, this aggregation is in the process of + * being deleted, return directly. + */ + if (grp->lg_lacp_done) { + mutex_exit(&grp->lg_lacp_lock); + freemsg(dmp); + return; + } + + if (grp->lg_lacp_tail == NULL) { + grp->lg_lacp_head = grp->lg_lacp_tail = dmp; + } else { + grp->lg_lacp_tail->b_next = dmp; + grp->lg_lacp_tail = dmp; + } + + /* + * Hold a reference of the port so that the port won't be freed when it + * is removed from the aggr. The b_prev field is borrowed to save the + * port information. + */ + AGGR_PORT_REFHOLD(portp); + dmp->b_prev = (mblk_t *)portp; + cv_broadcast(&grp->lg_lacp_cv); + mutex_exit(&grp->lg_lacp_lock); +} +static void +aggr_lacp_rx(mblk_t *dmp) +{ + aggr_port_t *portp = (aggr_port_t *)dmp->b_prev; + mac_perim_handle_t mph; + lacp_t *lacp; + + dmp->b_prev = NULL; + + mac_perim_enter_by_mh(portp->lp_grp->lg_mh, &mph); + if (portp->lp_closing) + goto done; + + lacp = (lacp_t *)dmp->b_rptr; switch (lacp->subtype) { case LACP_SUBTYPE: AGGR_LACP_DBG(("aggr_lacp_rx:(%d): LACPDU received.\n", portp->lp_linkid)); - AGGR_LACP_LOCK_WRITER(portp->lp_grp); if (!portp->lp_lacp.sm.lacp_on) { - AGGR_LACP_UNLOCK(portp->lp_grp); break; } lacp_receive_sm(portp, lacp); - AGGR_LACP_UNLOCK(portp->lp_grp); break; case MARKER_SUBTYPE: AGGR_LACP_DBG(("aggr_lacp_rx:(%d): Marker Packet received.\n", portp->lp_linkid)); - (void) receive_marker_pdu(portp, dmp); - break; + if (receive_marker_pdu(portp, dmp) != 0) + break; - default: - AGGR_LACP_DBG(("aggr_lacp_rx: (%d): " - "Unknown Slow Protocol type %d\n", - portp->lp_linkid, lacp->subtype)); - break; + (void) mac_tx(portp->lp_mch, dmp, 0, MAC_DROP_ON_NO_DESC, NULL); + mac_perim_exit(mph); + AGGR_PORT_REFRELE(portp); + return; } +done: + mac_perim_exit(mph); + AGGR_PORT_REFRELE(portp); freemsg(dmp); } + +void +aggr_lacp_rx_thread(void *arg) +{ + callb_cpr_t cprinfo; + aggr_grp_t *grp = (aggr_grp_t *)arg; + aggr_port_t *port; + mblk_t *mp, *nextmp; + + CALLB_CPR_INIT(&cprinfo, &grp->lg_lacp_lock, callb_generic_cpr, + "aggr_lacp_rx_thread"); + + mutex_enter(&grp->lg_lacp_lock); + + /* + * Quit the thread if the grp is deleted. + */ + while (!grp->lg_lacp_done) { + if ((mp = grp->lg_lacp_head) == NULL) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); + CALLB_CPR_SAFE_END(&cprinfo, &grp->lg_lacp_lock); + continue; + } + + grp->lg_lacp_head = grp->lg_lacp_tail = NULL; + mutex_exit(&grp->lg_lacp_lock); + + while (mp != NULL) { + nextmp = mp->b_next; + mp->b_next = NULL; + aggr_lacp_rx(mp); + mp = nextmp; + } + mutex_enter(&grp->lg_lacp_lock); + } + + /* + * The grp is being destroyed, simply free all of the LACP messages + * left in the queue which did not have the chance to be processed. + * We cannot use freemsgchain() here since we need to clear the + * b_prev field. 
+ */ + while ((mp = grp->lg_lacp_head) != NULL) { + port = (aggr_port_t *)mp->b_prev; + AGGR_PORT_REFRELE(port); + nextmp = mp->b_next; + mp->b_next = NULL; + mp->b_prev = NULL; + freemsg(mp); + mp = nextmp; + } + + grp->lg_lacp_head = grp->lg_lacp_tail = NULL; + grp->lg_lacp_rx_thread = NULL; + cv_broadcast(&grp->lg_lacp_cv); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index cad61f559f..a84c4a5c2a 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -46,6 +46,7 @@ #include <sys/stat.h> #include <sys/sdt.h> #include <sys/dlpi.h> +#include <sys/dls.h> #include <sys/aggr.h> #include <sys/aggr_impl.h> @@ -58,11 +59,7 @@ static void aggr_port_notify_cb(void *, mac_notify_type_t); static int aggr_port_constructor(void *buf, void *arg, int kmflag) { - aggr_port_t *port = buf; - bzero(buf, sizeof (aggr_port_t)); - rw_init(&port->lp_lock, NULL, RW_DRIVER, NULL); - return (0); } @@ -72,7 +69,10 @@ aggr_port_destructor(void *buf, void *arg) { aggr_port_t *port = buf; - rw_destroy(&port->lp_lock); + ASSERT(port->lp_mnh == NULL); + ASSERT(port->lp_mphp == NULL); + ASSERT(!port->lp_grp_added); + ASSERT(port->lp_hwgh == NULL); } void @@ -103,31 +103,37 @@ aggr_port_fini(void) id_space_destroy(aggr_portids); } -mac_resource_handle_t -aggr_port_resource_add(void *arg, mac_resource_t *mrp) -{ - aggr_port_t *port = (aggr_port_t *)arg; - aggr_grp_t *grp = port->lp_grp; - - return (mac_resource_add(grp->lg_mh, mrp)); -} - +/* ARGSUSED */ void aggr_port_init_callbacks(aggr_port_t *port) { /* add the port's receive callback */ - port->lp_mnh = mac_notify_add(port->lp_mh, aggr_port_notify_cb, - (void *)port); - - /* set port's resource_add callback */ - mac_resource_set(port->lp_mh, aggr_port_resource_add, (void *)port); + port->lp_mnh = mac_notify_add(port->lp_mh, aggr_port_notify_cb, port); + /* + * Hold a reference of the grp and the port and this reference will + * be release when the thread exits. + * + * The reference on the port is used for aggr_port_delete() to + * continue without waiting for the thread to exit; the reference + * on the grp is used for aggr_grp_delete() to wait for the thread + * to exit before calling mac_unregister(). + * + * Note that these references will be released either in + * aggr_port_delete() when mac_notify_remove() succeeds, or in + * the aggr_port_notify_cb() callback when the port is deleted + * (lp_closing is set). 
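+ *
+ * [Editor's note: an illustrative summary, not part of the original
+ * patch; it condenses the hold/release pairing described above from
+ * the code in this file.]
+ *
+ *	aggr_grp_port_hold(port);		here, at notify-add time
+ *
+ *	in aggr_port_delete(), when the non-blocking remove succeeds:
+ *		if (mac_notify_remove(port->lp_mnh, B_FALSE) == 0)
+ *			aggr_grp_port_rele(port);
+ *
+ *	otherwise in aggr_port_notify_cb(), once the port is closing:
+ *		if (port->lp_closing)
+ *			aggr_grp_port_rele(port);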
+ */ + aggr_grp_port_hold(port); } +/* ARGSUSED */ int -aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) +aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, + aggr_port_t **pp) { int err; mac_handle_t mh; + mac_client_handle_t mch = NULL; aggr_port_t *port; uint16_t portid; uint_t i; @@ -135,6 +141,11 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) const mac_info_t *mip; uint32_t note; uint32_t margin; + char client_name[MAXNAMELEN]; + char aggr_name[MAXNAMELEN]; + char port_name[MAXNAMELEN]; + mac_diag_t diag; + mac_unicast_handle_t mah; *pp = NULL; @@ -165,6 +176,20 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) } } + if (((err = dls_mgmt_get_linkinfo(grp->lg_linkid, + aggr_name, NULL, NULL, NULL)) != 0) || + ((err = dls_mgmt_get_linkinfo(linkid, port_name, + NULL, NULL, NULL)) != 0)) { + goto fail; + } + + (void) snprintf(client_name, MAXNAMELEN, "%s-%s", aggr_name, port_name); + if ((err = mac_client_open(mh, &mch, client_name, + MAC_OPEN_FLAGS_IS_AGGR_PORT | MAC_OPEN_FLAGS_EXCLUSIVE | + MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK)) != 0) { + goto fail; + } + if ((portid = (uint16_t)id_alloc(aggr_portids)) == 0) { err = ENOMEM; goto fail; @@ -180,10 +205,9 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) goto fail; } - if (!mac_active_set(mh)) { + if ((err = mac_unicast_primary_add(mch, &mah, &diag)) != 0) { VERIFY(mac_margin_remove(mh, margin) == 0); id_free(aggr_portids, portid); - err = EBUSY; goto fail; } @@ -192,15 +216,14 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) port->lp_refs = 1; port->lp_next = NULL; port->lp_mh = mh; + port->lp_mch = mch; port->lp_mip = mip; port->lp_linkid = linkid; - port->lp_closing = 0; + port->lp_closing = B_FALSE; + port->lp_mah = mah; /* get the port's original MAC address */ - mac_unicst_get(port->lp_mh, port->lp_addr); - - /* set port's transmit information */ - port->lp_txinfo = mac_tx_get(port->lp_mh); + mac_unicast_primary_get(port->lp_mh, port->lp_addr); /* initialize state */ port->lp_state = AGGR_PORT_STATE_STANDBY; @@ -213,6 +236,7 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) port->lp_no_link_update = no_link_update; port->lp_portid = portid; port->lp_margin = margin; + port->lp_prom_addr = NULL; /* * Save the current statistics of the port. They will be used @@ -235,6 +259,8 @@ aggr_port_create(const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) return (0); fail: + if (mch != NULL) + mac_client_close(mch, MAC_CLOSE_FLAGS_EXCLUSIVE); mac_close(mh); return (err); } @@ -242,19 +268,48 @@ fail: void aggr_port_delete(aggr_port_t *port) { + aggr_lacp_port_t *pl = &port->lp_lacp; + + ASSERT(port->lp_mphp == NULL); + ASSERT(!port->lp_promisc_on); + + port->lp_closing = B_TRUE; + VERIFY(mac_margin_remove(port->lp_mh, port->lp_margin) == 0); - mac_rx_remove_wait(port->lp_mh); - mac_resource_set(port->lp_mh, NULL, NULL); - mac_notify_remove(port->lp_mh, port->lp_mnh); - mac_active_clear(port->lp_mh); + mac_rx_clear(port->lp_mch); + /* + * If the notification callback is already in process and waiting for + * the aggr grp's mac perimeter, don't wait (otherwise there would be + * deadlock). Otherwise, if mac_notify_remove() succeeds, we can + * release the reference held when mac_notify_add() is called. 
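+ *
+ * [Editor's note: an illustrative diagram, not part of the original
+ * patch; it assumes the second argument of mac_notify_remove()
+ * selects whether to wait for a running callback, as the text above
+ * describes.] The cycle that the non-blocking remove avoids:
+ *
+ *	thread A: aggr_port_delete()	holds the aggr's mac perimeter
+ *					and would wait for the callback
+ *	thread B: aggr_port_notify_cb()	waits to enter the aggr's mac
+ *					perimeter
+ *
+ * Waiting (B_TRUE) would leave A blocked on B and B blocked on A;
+ * passing B_FALSE breaks the cycle and defers the release to B.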
+ */ + if ((port->lp_mnh != NULL) && + (mac_notify_remove(port->lp_mnh, B_FALSE) == 0)) { + aggr_grp_port_rele(port); + } + port->lp_mnh = NULL; + + /* + * Inform the the port lacp timer thread to exit. Note that waiting + * for the thread to exit may cause deadlock since that thread may + * need to enter into the mac perimeter which we are currently in. + * It is fine to continue without waiting though since that thread + * is holding a reference of the port. + */ + mutex_enter(&pl->lacp_timer_lock); + pl->lacp_timer_bits |= LACP_THREAD_EXIT; + cv_broadcast(&pl->lacp_timer_cv); + mutex_exit(&pl->lacp_timer_lock); /* * Restore the port MAC address. Note it is called after the * port's notification callback being removed. This prevent * port's MAC_NOTE_UNICST notify callback function being called. */ - (void) mac_unicst_set(port->lp_mh, port->lp_addr); + (void) mac_unicast_primary_set(port->lp_mh, port->lp_addr); + (void) mac_unicast_remove(port->lp_mch, port->lp_mah); + mac_client_close(port->lp_mch, MAC_CLOSE_FLAGS_EXCLUSIVE); mac_close(port->lp_mh); AGGR_PORT_REFRELE(port); } @@ -268,6 +323,8 @@ aggr_port_free(aggr_port_t *port) port->lp_grp = NULL; id_free(aggr_portids, port->lp_portid); port->lp_portid = 0; + mutex_destroy(&port->lp_lacp.lacp_timer_lock); + cv_destroy(&port->lp_lacp.lacp_timer_cv); kmem_cache_free(aggr_port_cache, port); } @@ -276,7 +333,7 @@ aggr_port_free(aggr_port_t *port) * one of the constituent ports. */ boolean_t -aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port, boolean_t dolock) +aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port) { boolean_t do_attach = B_FALSE; boolean_t do_detach = B_FALSE; @@ -284,16 +341,10 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port, boolean_t dolock) uint64_t ifspeed; link_state_t link_state; link_duplex_t link_duplex; + mac_perim_handle_t mph; - if (dolock) { - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - } else { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); - } - - rw_enter(&port->lp_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + mac_perim_enter_by_mh(port->lp_mh, &mph); /* * link state change? For links that do not support link state @@ -334,15 +385,10 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port, boolean_t dolock) link_state_changed = aggr_grp_attach_port(grp, port); } else if (do_detach) { /* detach the port from the aggregation */ - link_state_changed = aggr_grp_detach_port(grp, port, B_TRUE); + link_state_changed = aggr_grp_detach_port(grp, port); } - rw_exit(&port->lp_lock); - - if (dolock) { - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); - } + mac_perim_exit(mph); return (link_state_changed); } @@ -357,21 +403,20 @@ aggr_port_notify_unicst(aggr_grp_t *grp, aggr_port_t *port, boolean_t mac_addr_changed = B_FALSE; boolean_t link_state_changed = B_FALSE; uint8_t mac_addr[ETHERADDRL]; + mac_perim_handle_t mph; + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); ASSERT(mac_addr_changedp != NULL); ASSERT(link_state_changedp != NULL); - AGGR_LACP_LOCK_WRITER(grp); - rw_enter(&grp->lg_lock, RW_WRITER); - - rw_enter(&port->lp_lock, RW_WRITER); + mac_perim_enter_by_mh(port->lp_mh, &mph); /* * If it is called when setting the MAC address to the * aggregation group MAC address, do nothing. 
*/ - mac_unicst_get(port->lp_mh, mac_addr); + mac_unicast_primary_get(port->lp_mh, mac_addr); if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { - rw_exit(&port->lp_lock); + mac_perim_exit(mph); goto done; } @@ -381,10 +426,7 @@ aggr_port_notify_unicst(aggr_grp_t *grp, aggr_port_t *port, aggr_grp_port_mac_changed(grp, port, &mac_addr_changed, &link_state_changed); - rw_exit(&port->lp_lock); - - if (grp->lg_closing) - goto done; + mac_perim_exit(mph); /* * If this port was used to determine the MAC address of @@ -397,8 +439,6 @@ aggr_port_notify_unicst(aggr_grp_t *grp, aggr_port_t *port, done: *mac_addr_changedp = mac_addr_changed; *link_state_changedp = link_state_changed; - rw_exit(&grp->lg_lock); - AGGR_LACP_UNLOCK(grp); } /* @@ -411,22 +451,26 @@ aggr_port_notify_cb(void *arg, mac_notify_type_t type) aggr_port_t *port = arg; aggr_grp_t *grp = port->lp_grp; boolean_t mac_addr_changed, link_state_changed; + mac_perim_handle_t mph; - /* - * Do nothing if the aggregation or the port is in the deletion - * process. Note that this is necessary to avoid deadlock. - */ - if ((grp->lg_closing) || (port->lp_closing)) - return; + mac_perim_enter_by_mh(grp->lg_mh, &mph); + if (port->lp_closing) { + mac_perim_exit(mph); - AGGR_PORT_REFHOLD(port); + /* + * Release the reference so it is safe for aggr to call + * mac_unregister() now. + */ + aggr_grp_port_rele(port); + return; + } switch (type) { case MAC_NOTE_TX: mac_tx_update(grp->lg_mh); break; case MAC_NOTE_LINK: - if (aggr_port_notify_link(grp, port, B_TRUE)) + if (aggr_port_notify_link(grp, port)) mac_link_update(grp->lg_mh, grp->lg_link_state); break; case MAC_NOTE_UNICST: @@ -437,46 +481,34 @@ aggr_port_notify_cb(void *arg, mac_notify_type_t type) if (link_state_changed) mac_link_update(grp->lg_mh, grp->lg_link_state); break; - case MAC_NOTE_PROMISC: - port->lp_txinfo = mac_tx_get(port->lp_mh); - break; default: break; } - AGGR_PORT_REFRELE(port); + mac_perim_exit(mph); } int aggr_port_start(aggr_port_t *port) { - int rc; - - ASSERT(RW_WRITE_HELD(&port->lp_lock)); - - if (port->lp_started) - return (0); - - if ((rc = mac_start(port->lp_mh)) != 0) - return (rc); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); - /* update the port state */ - port->lp_started = B_TRUE; + if (!port->lp_started) + port->lp_started = B_TRUE; - return (rc); + return (0); } void aggr_port_stop(aggr_port_t *port) { - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (!port->lp_started) return; - aggr_grp_multicst_port(port, B_FALSE); - - mac_stop(port->lp_mh); + if (port->lp_state == AGGR_PORT_STATE_ATTACHED) + aggr_grp_multicst_port(port, B_FALSE); /* update the port state */ port->lp_started = B_FALSE; @@ -487,33 +519,46 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) { int rc; - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (on == port->lp_promisc_on) /* already in desired promiscous mode */ return (0); - rc = mac_promisc_set(port->lp_mh, on, MAC_DEVPROMISC); + if (on) { + mac_rx_clear(port->lp_mch); + rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, + aggr_recv_cb, port, &port->lp_mphp, + MAC_PROMISC_FLAGS_NO_TX_LOOP); + if (rc != 0) { + mac_rx_set(port->lp_mch, aggr_recv_cb, port); + return (rc); + } + } else { + rc = mac_promisc_remove(port->lp_mphp); + if (rc != 0) + return (rc); + port->lp_mphp = NULL; + mac_rx_set(port->lp_mch, aggr_recv_cb, port); + } - if (rc == 0) - port->lp_promisc_on = on; + port->lp_promisc_on = on; - return (rc); + return (0); } /* * Set the MAC address of a 
port. */ int -aggr_port_unicst(aggr_port_t *port, uint8_t *macaddr) +aggr_port_unicst(aggr_port_t *port) { - int rc; - - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + aggr_grp_t *grp = port->lp_grp; - rc = mac_unicst_set(port->lp_mh, macaddr); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); - return (rc); + return (mac_unicast_primary_set(port->lp_mh, grp->lg_addr)); } /* @@ -524,8 +569,12 @@ aggr_port_multicst(void *arg, boolean_t add, const uint8_t *addrp) { aggr_port_t *port = arg; - return (add ? mac_multicst_add(port->lp_mh, addrp) : - mac_multicst_remove(port->lp_mh, addrp)); + if (add) { + return (mac_multicast_add(port->lp_mch, addrp)); + } else { + mac_multicast_remove(port->lp_mch, addrp); + return (0); + } } uint64_t @@ -533,3 +582,101 @@ aggr_port_stat(aggr_port_t *port, uint_t stat) { return (mac_stat_get(port->lp_mh, stat)); } + +/* + * Add a non-primary unicast address to the underlying port. If the port + * supports HW Rx group, try to add the address into the HW Rx group of + * the port first. If that fails, or if the port does not support HW Rx + * group, enable the port's promiscous mode. + */ +int +aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) +{ + aggr_unicst_addr_t *addr, **pprev; + mac_perim_handle_t pmph; + int err; + + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* + * If the underlying port support HW Rx group, add the mac to its + * RX group directly. + */ + if ((port->lp_hwgh != NULL) && + ((mac_hwgroup_addmac(port->lp_hwgh, mac_addr)) == 0)) { + mac_perim_exit(pmph); + return (0); + } + + /* + * If that fails, or if the port does not support HW Rx group, enable + * the port's promiscous mode. (Note that we turn on the promiscous + * mode only if the port is already started. + */ + if (port->lp_started && + ((err = aggr_port_promisc(port, B_TRUE)) != 0)) { + mac_perim_exit(pmph); + return (err); + } + + /* + * Walk through the unicast addresses that requires promiscous mode + * enabled on this port, and add this address to the end of the list. + */ + pprev = &port->lp_prom_addr; + while ((addr = *pprev) != NULL) { + ASSERT(bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0); + pprev = &addr->aua_next; + } + addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); + bcopy(mac_addr, addr->aua_addr, ETHERADDRL); + addr->aua_next = NULL; + *pprev = addr; + mac_perim_exit(pmph); + return (0); +} + +/* + * Remove a non-primary unicast address from the underlying port. This address + * must has been added by aggr_port_addmac(). As a result, we probably need to + * remove the address from the port's HW Rx group, or to disable the port's + * promiscous mode. + */ +void +aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) +{ + aggr_grp_t *grp = port->lp_grp; + aggr_unicst_addr_t *addr, **pprev; + mac_perim_handle_t pmph; + + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* + * See whether this address is in the list of addresses that requires + * the port being promiscous mode. + */ + pprev = &port->lp_prom_addr; + while ((addr = *pprev) != NULL) { + if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) + break; + pprev = &addr->aua_next; + } + if (addr != NULL) { + /* + * This unicast address put the port into the promiscous mode, + * delete this address from the lp_prom_addr list. If this is + * the last address in that list, disable the promiscous mode + * if the aggregation is not in promiscous mode. 
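+ *
+ * [Editor's note: an illustrative sketch, not part of the original
+ * patch; match() is a hypothetical stand-in for the bcmp() test
+ * below.] Both this walk and the one in aggr_port_addmac() use the
+ * pointer-to-pointer idiom, so unlinking needs no head-of-list
+ * special case:
+ *
+ *	pprev = &port->lp_prom_addr;
+ *	while ((addr = *pprev) != NULL) {
+ *		if (match(addr, mac_addr))
+ *			break;
+ *		pprev = &addr->aua_next;
+ *	}
+ *	if (addr != NULL)
+ *		*pprev = addr->aua_next;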
+ */ + *pprev = addr->aua_next; + kmem_free(addr, sizeof (aggr_unicst_addr_t)); + if (port->lp_prom_addr == NULL && !grp->lg_promisc) + (void) aggr_port_promisc(port, B_FALSE); + } else { + ASSERT(port->lp_hwgh != NULL); + (void) mac_hwgroup_remmac(port->lp_hwgh, mac_addr); + } + mac_perim_exit(pmph); +} diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index bf98e65ee3..2bdb7872e3 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * IEEE 802.3ad Link Aggregation - Receive * @@ -42,7 +40,18 @@ #include <sys/aggr_impl.h> static void -aggr_recv_lacp(aggr_port_t *port, mblk_t *mp) +aggr_mac_rx(mac_handle_t lg_mh, mac_resource_handle_t mrh, mblk_t *mp) +{ + if (mrh == NULL) { + mac_rx(lg_mh, mrh, mp); + } else { + aggr_pseudo_rx_ring_t *ring = (aggr_pseudo_rx_ring_t *)mrh; + mac_rx_ring(lg_mh, ring->arr_rh, mp, ring->arr_gen); + } +} + +void +aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) { aggr_grp_t *grp = port->lp_grp; @@ -51,35 +60,26 @@ aggr_recv_lacp(aggr_port_t *port, mblk_t *mp) mblk_t *nmp = copymsg(mp); if (nmp != NULL) - mac_rx(grp->lg_mh, NULL, nmp); + aggr_mac_rx(grp->lg_mh, mrh, nmp); } - aggr_lacp_rx(port, mp); + aggr_lacp_rx_enqueue(port, mp); } /* * Callback function invoked by MAC service module when packets are * made available by a MAC port. */ +/* ARGSUSED */ void -aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { aggr_port_t *port = (aggr_port_t *)arg; aggr_grp_t *grp = port->lp_grp; - /* - * If this message is looped back from the legacy devices, drop - * it as the Nemo framework will be responsible for looping it - * back by the mac_txloop() function. 
- */ - if (mp->b_flag & MSGNOLOOP) { - ASSERT(mp->b_next == NULL); - freemsg(mp); - return; - } - if (grp->lg_lacp_mode == AGGR_LACP_OFF) { - mac_rx(grp->lg_mh, mrh, mp); + aggr_mac_rx(grp->lg_mh, mrh, mp); } else { mblk_t *cmp, *last, *head; struct ether_header *ehp; @@ -100,10 +100,12 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) } else { /* send up accumulated packets */ last->b_next = NULL; - if (port->lp_collector_enabled) - mac_rx(grp->lg_mh, mrh, head); - else + if (port->lp_collector_enabled) { + aggr_mac_rx(grp->lg_mh, mrh, + head); + } else { freemsgchain(head); + } head = cmp->b_next; cmp->b_next = NULL; freemsg(cmp); @@ -126,21 +128,23 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) ASSERT(last == NULL); head = cmp->b_next; cmp->b_next = NULL; - aggr_recv_lacp(port, cmp); + aggr_recv_lacp(port, mrh, cmp); cmp = head; } else { /* previously accumulated packets */ ASSERT(last != NULL); /* send up non-LACP packets */ last->b_next = NULL; - if (port->lp_collector_enabled) - mac_rx(grp->lg_mh, mrh, head); - else + if (port->lp_collector_enabled) { + aggr_mac_rx(grp->lg_mh, mrh, + head); + } else { freemsgchain(head); + } /* unlink and pass up LACP packets */ head = cmp->b_next; cmp->b_next = NULL; - aggr_recv_lacp(port, cmp); + aggr_recv_lacp(port, mrh, cmp); cmp = head; last = NULL; } @@ -151,7 +155,7 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) } if (head != NULL) { if (port->lp_collector_enabled) - mac_rx(grp->lg_mh, mrh, head); + aggr_mac_rx(grp->lg_mh, mrh, head); else freemsgchain(head); } diff --git a/usr/src/uts/common/io/aggr/aggr_send.c b/usr/src/uts/common/io/aggr/aggr_send.c index 467f8541a3..9b4ad24621 100644 --- a/usr/src/uts/common/io/aggr/aggr_send.c +++ b/usr/src/uts/common/io/aggr/aggr_send.c @@ -55,18 +55,19 @@ static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *); -static uint_t -aggr_send_port(aggr_grp_t *grp, mblk_t *mp) +static uint64_t +aggr_send_hash(aggr_grp_t *grp, mblk_t *mp) { struct ether_header *ehp; uint16_t sap; uint_t skip_len; uint8_t proto; uint32_t policy = grp->lg_tx_policy; - uint32_t hash = 0; + uint64_t hash = 0; ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); + ASSERT(RW_READ_HELD(&grp->lg_tx_lock)); /* compute MAC hash */ @@ -207,7 +208,7 @@ again: } done: - return (hash % grp->lg_ntx_ports); + return (hash); } /* @@ -216,8 +217,7 @@ done: void aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy) { - ASSERT(AGGR_LACP_LOCK_HELD_WRITER(grp)); - ASSERT(RW_WRITE_HELD(&grp->lg_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); grp->lg_tx_policy = policy; } @@ -231,35 +231,63 @@ aggr_m_tx(void *arg, mblk_t *mp) aggr_grp_t *grp = arg; aggr_port_t *port; mblk_t *nextp; - const mac_txinfo_t *mtp; + mac_tx_cookie_t cookie; + uint64_t hash; + void *mytx_handle; for (;;) { - AGGR_LACP_LOCK_READER(grp) + rw_enter(&grp->lg_tx_lock, RW_READER); if (grp->lg_ntx_ports == 0) { /* * We could have returned from aggr_m_start() before * the ports were actually attached. Drop the chain. */ - AGGR_LACP_UNLOCK(grp) + rw_exit(&grp->lg_tx_lock); freemsgchain(mp); return (NULL); } + nextp = mp->b_next; mp->b_next = NULL; - port = grp->lg_tx_ports[aggr_send_port(grp, mp)]; - ASSERT(port->lp_state == AGGR_PORT_STATE_ATTACHED); + hash = aggr_send_hash(grp, mp); + port = grp->lg_tx_ports[hash % grp->lg_ntx_ports]; /* - * We store the transmit info pointer locally in case it - * changes between loading mt_fn and mt_arg. 
+ * Bump the active Tx ref count so that the port won't + * be deleted. The reference count will be dropped in mac_tx(). */ - mtp = port->lp_txinfo; - AGGR_LACP_UNLOCK(grp) + mytx_handle = mac_tx_hold(port->lp_mch); + rw_exit(&grp->lg_tx_lock); - if ((mp = mtp->mt_fn(mtp->mt_arg, mp)) != NULL) { - mp->b_next = nextp; - break; + if (mytx_handle == NULL) { + /* + * The port is quiesced. + */ + freemsg(mp); + } else { + mblk_t *ret_mp; + + /* + * It is fine that the port state changes now. + * Set MAC_TX_NO_HOLD to inform mac_tx() not to bump + * the active Tx ref again. Use hash as the hint so + * to direct traffic to different TX rings. Note below + * bit operation is needed to get the most benefit + * from the mac_tx() hash algorithm. + */ + hash = (hash << 24 | hash << 16 | hash); + hash = (hash << 32 | hash); + cookie = mac_tx(port->lp_mch, mp, (uintptr_t)hash, + MAC_TX_NO_ENQUEUE | MAC_TX_NO_HOLD, &ret_mp); + + mac_tx_rele(port->lp_mch, mytx_handle); + + if (cookie != NULL) { + ret_mp->b_next = nextp; + mp = ret_mp; + break; + } } if ((mp = nextp) == NULL) @@ -276,6 +304,8 @@ aggr_send_port_enable(aggr_port_t *port) { aggr_grp_t *grp = port->lp_grp; + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + if (port->lp_tx_enabled || (port->lp_state != AGGR_PORT_STATE_ATTACHED)) { /* already enabled or port not yet attached */ @@ -285,6 +315,7 @@ aggr_send_port_enable(aggr_port_t *port) /* * Add to group's array of tx ports. */ + rw_enter(&grp->lg_tx_lock, RW_WRITER); if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) { /* current array too small */ aggr_port_t **new_ports; @@ -308,6 +339,7 @@ aggr_send_port_enable(aggr_port_t *port) grp->lg_tx_ports[grp->lg_ntx_ports++] = port; port->lp_tx_idx = grp->lg_ntx_ports-1; + rw_exit(&grp->lg_tx_lock); port->lp_tx_enabled = B_TRUE; } @@ -321,13 +353,15 @@ aggr_send_port_disable(aggr_port_t *port) uint_t idx, ntx; aggr_grp_t *grp = port->lp_grp; - ASSERT(RW_WRITE_HELD(&port->lp_lock)); + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (!port->lp_tx_enabled) { /* not yet enabled */ return; } + rw_enter(&grp->lg_tx_lock, RW_WRITER); idx = port->lp_tx_idx; ntx = grp->lg_ntx_ports; ASSERT(idx < ntx); @@ -347,6 +381,7 @@ aggr_send_port_disable(aggr_port_t *port) port->lp_tx_idx = 0; grp->lg_ntx_ports--; + rw_exit(&grp->lg_tx_lock); port->lp_tx_enabled = B_FALSE; } diff --git a/usr/src/uts/common/io/ath/ath_main.c b/usr/src/uts/common/io/ath/ath_main.c index b18451e570..451f827415 100644 --- a/usr/src/uts/common/io/ath/ath_main.c +++ b/usr/src/uts/common/io/ath/ath_main.c @@ -132,7 +132,7 @@ #include <sys/sunddi.h> #include <sys/pci.h> #include <sys/errno.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/dlpi.h> #include <sys/ethernet.h> #include <sys/list.h> @@ -232,7 +232,6 @@ static mac_callbacks_t ath_m_callbacks = { ath_m_multicst, ath_m_unicst, ath_m_tx, - NULL, /* mc_resources; */ ath_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/bge/bge.conf b/usr/src/uts/common/io/bge/bge.conf index 71a44f851a..edabf29ab1 100644 --- a/usr/src/uts/common/io/bge/bge.conf +++ b/usr/src/uts/common/io/bge/bge.conf @@ -171,6 +171,6 @@ bge-known-subsystems = 0x108e1647, # For BCM5705, BCM5782, etc, there are only 1 receive ring and 1 send ring. # Otherwise, there can be up to 16 receive rings and 4 send rings. 
# -bge-rx-rings = 1; +bge-rx-rings = 16; bge-tx-rings = 1; diff --git a/usr/src/uts/common/io/bge/bge_chip2.c b/usr/src/uts/common/io/bge/bge_chip2.c index 4c17aaa5a9..d91ac5f0f6 100644 --- a/usr/src/uts/common/io/bge/bge_chip2.c +++ b/usr/src/uts/common/io/bge/bge_chip2.c @@ -1838,29 +1838,13 @@ bge_nvmem_id(bge_t *bgep) static void bge_init_recv_rule(bge_t *bgep) { - bge_recv_rule_t *rulep; + bge_recv_rule_t *rulep = bgep->recv_rules; uint32_t i; /* - * receive rule: direct all TCP traffic to ring RULE_MATCH_TO_RING - * 1. to direct UDP traffic, set: - * rulep->control = RULE_PROTO_CONTROL; - * rulep->mask_value = RULE_UDP_MASK_VALUE; - * 2. to direct ICMP traffic, set: - * rulep->control = RULE_PROTO_CONTROL; - * rulep->mask_value = RULE_ICMP_MASK_VALUE; - * 3. to direct traffic by source ip, set: - * rulep->control = RULE_SIP_CONTROL; - * rulep->mask_value = RULE_SIP_MASK_VALUE; + * Initialize receive rule registers. + * Note that rules may persist across each bge_m_start/stop() call. */ - rulep = bgep->recv_rules; - rulep->control = RULE_PROTO_CONTROL; - rulep->mask_value = RULE_TCP_MASK_VALUE; - - /* - * set receive rule registers - */ - rulep = bgep->recv_rules; for (i = 0; i < RECV_RULES_NUM_MAX; i++, rulep++) { bge_reg_put32(bgep, RECV_RULE_MASK_REG(i), rulep->mask_value); bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(i), rulep->control); @@ -2871,10 +2855,11 @@ bge_chip_sync(bge_t *bgep) } bge_reg_put32(bgep, MAC_TX_RANDOM_BACKOFF_REG, fill); bge_reg_put64(bgep, MAC_ADDRESS_REG(j), macaddr); - } - BGE_DEBUG(("bge_chip_sync($%p) setting MAC address %012llx", - (void *)bgep, macaddr)); + BGE_DEBUG(("bge_chip_sync($%p) " + "setting MAC address %012llx", + (void *)bgep, macaddr)); + } #ifdef BGE_IPMI_ASF } #endif @@ -5515,14 +5500,25 @@ bge_chip_ioctl(bge_t *bgep, queue_t *wq, mblk_t *mp, struct iocblk *iocp) /* NOTREACHED */ } +/* ARGSUSED */ void -bge_chip_blank(void *arg, time_t ticks, uint_t count) +bge_chip_blank(void *arg, time_t ticks, uint_t count, int flag) { - bge_t *bgep = arg; + recv_ring_t *rrp = arg; + bge_t *bgep = rrp->bgep; mutex_enter(bgep->genlock); + rrp->poll_flag = flag; +#ifdef NOT_YET + /* + * XXX-Sunay: Since most broadcom cards support only one + * interrupt but multiple rx rings, we can't disable the + * physical interrupt. This need to be done via capability + * negotiation depending on the NIC. + */ bge_reg_put32(bgep, RCV_COALESCE_TICKS_REG, ticks); bge_reg_put32(bgep, RCV_COALESCE_MAX_BD_REG, count); +#endif if (bge_check_acc_handle(bgep, bgep->io_handle) != DDI_FM_OK) ddi_fm_service_impact(bgep->devinfo, DDI_SERVICE_UNAFFECTED); mutex_exit(bgep->genlock); diff --git a/usr/src/uts/common/io/bge/bge_hw.h b/usr/src/uts/common/io/bge/bge_hw.h index 2ebdc1a7a3..1974faea88 100644 --- a/usr/src/uts/common/io/bge/bge_hw.h +++ b/usr/src/uts/common/io/bge/bge_hw.h @@ -858,30 +858,53 @@ extern "C" { /* * Receive Rules definition */ -#define RULE_MATCH_TO_RING 2 - /* ring that traffic will go into when recv rule matches. 
*/ - /* value is between 1 and 16, not 0 and 15 */ - +#define ETHERHEADER_DEST_OFFSET 0x00 #define IPHEADER_PROTO_OFFSET 0x08 #define IPHEADER_SIP_OFFSET 0x0c +#define IPHEADER_DIP_OFFSET 0x10 +#define TCPHEADER_SPORT_OFFSET 0x00 +#define TCPHEADER_DPORT_OFFSET 0x02 +#define UDPHEADER_SPORT_OFFSET 0x00 +#define UDPHEADER_DPORT_OFFSET 0x02 + +#define RULE_MATCH(ring) (RECV_RULE_CTL_ENABLE | RECV_RULE_CTL_OP_EQ | \ + RECV_RULE_CTL_CLASS((ring))) + +#define RULE_MATCH_MASK(ring) (RULE_MATCH(ring) | RECV_RULE_CTL_MASK) + +#define RULE_DEST_MAC_1(ring) (RULE_MATCH(ring) | \ + RECV_RULE_CTL_HEADER_FRAME | \ + ETHERHEADER_DEST_OFFSET) + +#define RULE_DEST_MAC_2(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_FRAME | \ + ETHERHEADER_DEST_OFFSET + 4) + +#define RULE_LOCAL_IP(ring) (RULE_MATCH(ring) | RECV_RULE_CTL_HEADER_IP | \ + IPHEADER_DIP_OFFSET) + +#define RULE_REMOTE_IP(ring) (RULE_MATCH(ring) | RECV_RULE_CTL_HEADER_IP | \ + IPHEADER_SIP_OFFSET) -#define RULE_PROTO_CONTROL (RECV_RULE_CTL_ENABLE | RECV_RULE_CTL_MASK | \ - RECV_RULE_CTL_OP_EQ | \ +#define RULE_IP_PROTO(ring) (RULE_MATCH_MASK(ring) | \ RECV_RULE_CTL_HEADER_IP | \ - RECV_RULE_CTL_CLASS(RULE_MATCH_TO_RING) | \ IPHEADER_PROTO_OFFSET) -#define RULE_TCP_MASK_VALUE 0x00ff0006 -#define RULE_UDP_MASK_VALUE 0x00ff0011 -#define RULE_ICMP_MASK_VALUE 0x00ff0001 -#define RULE_SIP_ADDR 0x0a000001 - /* ip address in 32-bit integer,such as, 0x0a000001 is "10.0.0.1" */ +#define RULE_TCP_SPORT(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_TCP | \ + TCPHEADER_SPORT_OFFSET) -#define RULE_SIP_CONTROL (RECV_RULE_CTL_ENABLE | RECV_RULE_CTL_OP_EQ | \ - RECV_RULE_CTL_HEADER_IP | \ - RECV_RULE_CTL_CLASS(RULE_MATCH_TO_RING) | \ - IPHEADER_SIP_OFFSET) -#define RULE_SIP_MASK_VALUE RULE_SIP_ADDR +#define RULE_TCP_DPORT(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_TCP | \ + TCPHEADER_DPORT_OFFSET) + +#define RULE_UDP_SPORT(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_UDP | \ + UDPHEADER_SPORT_OFFSET) + +#define RULE_UDP_DPORT(ring) (RULE_MATCH_MASK(ring) | \ + RECV_RULE_CTL_HEADER_UDP | \ + UDPHEADER_DPORT_OFFSET) /* * 1000BaseX low-level access registers @@ -1686,6 +1709,14 @@ typedef struct { } bge_recv_rule_t; /* + * This describes which sub-rule slots are used by a particular rule. + */ +typedef struct { + int start; + int count; +} bge_rule_info_t; + +/* * Indexes into the <buff_cons_index> array */ #ifdef _BIG_ENDIAN diff --git a/usr/src/uts/common/io/bge/bge_impl.h b/usr/src/uts/common/io/bge/bge_impl.h index 961bf14064..3d2b73f325 100644 --- a/usr/src/uts/common/io/bge/bge_impl.h +++ b/usr/src/uts/common/io/bge/bge_impl.h @@ -71,7 +71,7 @@ extern "C" { #include <sys/fm/util.h> #include <sys/fm/io/ddi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #ifdef __amd64 @@ -397,6 +397,13 @@ typedef struct buff_ring { void *spare[4]; /* padding */ } buff_ring_t; /* 0x100 (256) bytes */ +typedef struct bge_multi_mac { + int naddr; /* total supported addresses */ + int naddrfree; /* free addresses slots */ + ether_addr_t mac_addr[MAC_ADDRESS_REGS_MAX]; + boolean_t mac_addr_set[MAC_ADDRESS_REGS_MAX]; +} bge_multi_mac_t; + /* * Software Receive (Return) Ring Control Block * There's one of these for each receiver return ring (up to 16). 
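 *
 * [Editor's note: an illustrative summary, not part of the original
 * patch.] The mac-layer ring fields added to this structure in the
 * hunk below are wired up elsewhere in this changeset as follows:
 *
 *	rx_ring->ring_handle = rh;		bge_fill_ring()
 *	rx_ring->ring_gen_num = mr_gen_num;	bge_ring_start()
 *	rrp->poll_flag = flag;			bge_chip_blank()
 *	mac_rx_ring(bgep->mh, rrp->ring_handle,
 *	    mp, rrp->ring_gen_num);		bge_receive()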
@@ -418,7 +425,6 @@ typedef struct recv_ring { volatile uint16_t *prod_index_p; /* (const) ptr to h/w */ /* "producer index" */ /* (in status block) */ - /* * The rx_lock must be held when updating the h/w consumer index * mailbox register (*chip_mbox_reg), or the s/w consumer index @@ -428,10 +434,16 @@ typedef struct recv_ring { /* index mailbox offset */ kmutex_t rx_lock[1]; /* serialize receive */ uint64_t rx_next; /* next slot to examine */ - mac_resource_handle_t handle; /* per ring cookie */ - /* ("producer index") */ + + mac_ring_handle_t ring_handle; + mac_group_handle_t ring_group_handle; + uint64_t ring_gen_num; + bge_rule_info_t *mac_addr_rule; + uint8_t mac_addr_val[ETHERADDRL]; + int poll_flag; /* Polling flag */ } recv_ring_t; /* 0x90 (144) bytes */ + /* * Send packet structure */ @@ -528,6 +540,7 @@ typedef struct send_ring { sw_sbd_t *sw_sbds; /* software descriptors */ uint64_t mac_resid; /* special per resource id */ + uint64_t pushed_bytes; } send_ring_t; /* 0x100 (256) bytes */ typedef struct { @@ -760,6 +773,8 @@ typedef struct bge { * Note: they're not necessarily all used. */ buff_ring_t buff[BGE_BUFF_RINGS_MAX]; /* 3*0x0100 */ + + /* may be obsoleted */ recv_ring_t recv[BGE_RECV_RINGS_MAX]; /* 16*0x0090 */ send_ring_t send[BGE_SEND_RINGS_MAX]; /* 16*0x0100 */ @@ -1158,7 +1173,8 @@ int bge_chip_sync(bge_t *bgep, boolean_t asf_keeplive); int bge_chip_reset(bge_t *bgep, boolean_t enable_dma); int bge_chip_sync(bge_t *bgep); #endif -void bge_chip_blank(void *arg, time_t ticks, uint_t count); +void bge_chip_blank(void *arg, time_t ticks, uint_t count, int flag); +extern mblk_t *bge_poll_ring(void *, int); uint_t bge_chip_factotum(caddr_t arg); void bge_chip_cyclic(void *arg); enum ioc_reply bge_chip_ioctl(bge_t *bgep, queue_t *wq, mblk_t *mp, @@ -1222,6 +1238,7 @@ void bge_receive(bge_t *bgep, bge_status_t *bsp); /* bge_send.c */ mblk_t *bge_m_tx(void *arg, mblk_t *mp); +mblk_t *bge_ring_tx(void *arg, mblk_t *mp); void bge_recycle(bge_t *bgep, bge_status_t *bsp); uint_t bge_send_drain(caddr_t arg); diff --git a/usr/src/uts/common/io/bge/bge_main2.c b/usr/src/uts/common/io/bge/bge_main2.c index fc4407214e..c8cef32365 100644 --- a/usr/src/uts/common/io/bge/bge_main2.c +++ b/usr/src/uts/common/io/bge/bge_main2.c @@ -26,7 +26,9 @@ #include "bge_impl.h" #include <sys/sdt.h> +#include <sys/mac_provider.h> #include <sys/mac.h> +#include <sys/mac_flow.h> /* * This is the string displayed by modinfo, etc. 
@@ -52,6 +54,7 @@ static char default_mtu[] = "default_mtu"; static int bge_add_intrs(bge_t *, int); static void bge_rem_intrs(bge_t *); +static int bge_unicst_set(void *, const uint8_t *, int); /* * Describes the chip's DMA engine @@ -104,16 +107,10 @@ static int bge_m_start(void *); static void bge_m_stop(void *); static int bge_m_promisc(void *, boolean_t); static int bge_m_multicst(void *, boolean_t, const uint8_t *); -static int bge_m_unicst(void *, const uint8_t *); -static void bge_m_resources(void *); static void bge_m_ioctl(void *, queue_t *, mblk_t *); static boolean_t bge_m_getcapab(void *, mac_capab_t, void *); static int bge_unicst_set(void *, const uint8_t *, - mac_addr_slot_t); -static int bge_m_unicst_add(void *, mac_multi_addr_t *); -static int bge_m_unicst_remove(void *, mac_addr_slot_t); -static int bge_m_unicst_modify(void *, mac_multi_addr_t *); -static int bge_m_unicst_get(void *, mac_multi_addr_t *); + int); static int bge_m_setprop(void *, const char *, mac_prop_id_t, uint_t, const void *); static int bge_m_getprop(void *, const char *, mac_prop_id_t, @@ -123,8 +120,7 @@ static int bge_set_priv_prop(bge_t *, const char *, uint_t, static int bge_get_priv_prop(bge_t *, const char *, uint_t, uint_t, void *); -#define BGE_M_CALLBACK_FLAGS\ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) +#define BGE_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) static mac_callbacks_t bge_m_callbacks = { BGE_M_CALLBACK_FLAGS, @@ -133,9 +129,8 @@ static mac_callbacks_t bge_m_callbacks = { bge_m_stop, bge_m_promisc, bge_m_multicst, - bge_m_unicst, + NULL, bge_m_tx, - bge_m_resources, bge_m_ioctl, bge_m_getcapab, NULL, @@ -152,6 +147,7 @@ mac_priv_prop_t bge_priv_prop[] = { #define BGE_MAX_PRIV_PROPS \ (sizeof (bge_priv_prop) / sizeof (mac_priv_prop_t)) +uint8_t zero_addr[6] = {0, 0, 0, 0, 0, 0}; /* * ========== Transmit and receive ring reinitialisation ========== */ @@ -590,23 +586,10 @@ bge_m_start(void *arg) } /* - * bge_m_unicst() -- set the physical network address - */ -static int -bge_m_unicst(void *arg, const uint8_t *macaddr) -{ - /* - * Request to set address in - * address slot 0, i.e., default address - */ - return (bge_unicst_set(arg, macaddr, 0)); -} - -/* * bge_unicst_set() -- set the physical network address */ static int -bge_unicst_set(void *arg, const uint8_t *macaddr, mac_addr_slot_t slot) +bge_unicst_set(void *arg, const uint8_t *macaddr, int slot) { bge_t *bgep = arg; /* private device info */ @@ -693,160 +676,6 @@ bge_unicst_set(void *arg, const uint8_t *macaddr, mac_addr_slot_t slot) return (0); } -/* - * The following four routines are used as callbacks for multiple MAC - * address support: - * - bge_m_unicst_add(void *, mac_multi_addr_t *); - * - bge_m_unicst_remove(void *, mac_addr_slot_t); - * - bge_m_unicst_modify(void *, mac_multi_addr_t *); - * - bge_m_unicst_get(void *, mac_multi_addr_t *); - */ - -/* - * bge_m_unicst_add() - will find an unused address slot, set the - * address value to the one specified, reserve that slot and enable - * the NIC to start filtering on the new MAC address. - * address slot. Returns 0 on success. 
- */ -static int -bge_m_unicst_add(void *arg, mac_multi_addr_t *maddr) -{ - bge_t *bgep = arg; /* private device info */ - mac_addr_slot_t slot; - int err; - - if (mac_unicst_verify(bgep->mh, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) - return (EINVAL); - - mutex_enter(bgep->genlock); - if (bgep->unicst_addr_avail == 0) { - /* no slots available */ - mutex_exit(bgep->genlock); - return (ENOSPC); - } - - /* - * Primary/default address is in slot 0. The next three - * addresses are the multiple MAC addresses. So multiple - * MAC address 0 is in slot 1, 1 in slot 2, and so on. - * So the first multiple MAC address resides in slot 1. - */ - for (slot = 1; slot < bgep->unicst_addr_total; slot++) { - if (bgep->curr_addr[slot].set == B_FALSE) { - bgep->curr_addr[slot].set = B_TRUE; - break; - } - } - - ASSERT(slot < bgep->unicst_addr_total); - bgep->unicst_addr_avail--; - mutex_exit(bgep->genlock); - maddr->mma_slot = slot; - - if ((err = bge_unicst_set(bgep, maddr->mma_addr, slot)) != 0) { - mutex_enter(bgep->genlock); - bgep->curr_addr[slot].set = B_FALSE; - bgep->unicst_addr_avail++; - mutex_exit(bgep->genlock); - } - return (err); -} - -/* - * bge_m_unicst_remove() - removes a MAC address that was added by a - * call to bge_m_unicst_add(). The slot number that was returned in - * add() is passed in the call to remove the address. - * Returns 0 on success. - */ -static int -bge_m_unicst_remove(void *arg, mac_addr_slot_t slot) -{ - bge_t *bgep = arg; /* private device info */ - - if (slot <= 0 || slot >= bgep->unicst_addr_total) - return (EINVAL); - - mutex_enter(bgep->genlock); - if (bgep->curr_addr[slot].set == B_TRUE) { - bgep->curr_addr[slot].set = B_FALSE; - bgep->unicst_addr_avail++; - mutex_exit(bgep->genlock); - /* - * Copy the default address to the passed slot - */ - return (bge_unicst_set(bgep, bgep->curr_addr[0].addr, slot)); - } - mutex_exit(bgep->genlock); - return (EINVAL); -} - -/* - * bge_m_unicst_modify() - modifies the value of an address that - * has been added by bge_m_unicst_add(). The new address, address - * length and the slot number that was returned in the call to add - * should be passed to bge_m_unicst_modify(). mma_flags should be - * set to 0. Returns 0 on success. - */ -static int -bge_m_unicst_modify(void *arg, mac_multi_addr_t *maddr) -{ - bge_t *bgep = arg; /* private device info */ - mac_addr_slot_t slot; - - if (mac_unicst_verify(bgep->mh, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) - return (EINVAL); - - slot = maddr->mma_slot; - - if (slot <= 0 || slot >= bgep->unicst_addr_total) - return (EINVAL); - - mutex_enter(bgep->genlock); - if (bgep->curr_addr[slot].set == B_TRUE) { - mutex_exit(bgep->genlock); - return (bge_unicst_set(bgep, maddr->mma_addr, slot)); - } - mutex_exit(bgep->genlock); - - return (EINVAL); -} - -/* - * bge_m_unicst_get() - will get the MAC address and all other - * information related to the address slot passed in mac_multi_addr_t. - * mma_flags should be set to 0 in the call. 
- * On return, mma_flags can take the following values: - * 1) MMAC_SLOT_UNUSED - * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR - * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR - * 4) MMAC_SLOT_USED - */ -static int -bge_m_unicst_get(void *arg, mac_multi_addr_t *maddr) -{ - bge_t *bgep = arg; /* private device info */ - mac_addr_slot_t slot; - - slot = maddr->mma_slot; - - if (slot <= 0 || slot >= bgep->unicst_addr_total) - return (EINVAL); - - mutex_enter(bgep->genlock); - if (bgep->curr_addr[slot].set == B_TRUE) { - ethaddr_copy(bgep->curr_addr[slot].addr, - maddr->mma_addr); - maddr->mma_flags = MMAC_SLOT_USED; - } else { - maddr->mma_flags = MMAC_SLOT_UNUSED; - } - mutex_exit(bgep->genlock); - - return (0); -} - extern void bge_wake_factotum(bge_t *); static boolean_t @@ -1576,6 +1405,295 @@ bge_m_promisc(void *arg, boolean_t on) return (0); } +/* + * Find the slot for the specified unicast address + */ +int +bge_unicst_find(bge_t *bgep, const uint8_t *mac_addr) +{ + int slot; + + ASSERT(mutex_owned(bgep->genlock)); + + for (slot = 0; slot < bgep->unicst_addr_total; slot++) { + if (bcmp(bgep->curr_addr[slot].addr, mac_addr, ETHERADDRL) == 0) + return (slot); + } + + return (-1); +} + +/* + * Programs the classifier to start steering packets matching 'mac_addr' to the + * specified ring 'arg'. + */ +static int +bge_addmac(void *arg, const uint8_t *mac_addr) +{ + recv_ring_t *rrp = (recv_ring_t *)arg; + bge_t *bgep = rrp->bgep; + bge_recv_rule_t *rulep = bgep->recv_rules; + bge_rule_info_t *rinfop = NULL; + uint8_t ring = (uint8_t)(rrp - bgep->recv) + 1; + int i; + uint16_t tmp16; + uint32_t tmp32; + int slot; + int err; + + mutex_enter(bgep->genlock); + if (bgep->unicst_addr_avail == 0) { + mutex_exit(bgep->genlock); + return (ENOSPC); + } + + /* + * First add the unicast address to a available slot. + */ + slot = bge_unicst_find(bgep, mac_addr); + ASSERT(slot == -1); + + for (slot = 0; slot < bgep->unicst_addr_total; slot++) { + if (!bgep->curr_addr[slot].set) { + bgep->curr_addr[slot].set = B_TRUE; + break; + } + } + + ASSERT(slot < bgep->unicst_addr_total); + bgep->unicst_addr_avail--; + mutex_exit(bgep->genlock); + + if ((err = bge_unicst_set(bgep, mac_addr, slot)) != 0) + goto fail; + + /* A rule is already here. Deny this. */ + if (rrp->mac_addr_rule != NULL) { + err = ether_cmp(mac_addr, rrp->mac_addr_val) ? EEXIST : EBUSY; + goto fail; + } + + /* + * Allocate a bge_rule_info_t to keep track of which rule slots + * are being used. + */ + rinfop = kmem_zalloc(sizeof (bge_rule_info_t), KM_NOSLEEP); + if (rinfop == NULL) { + err = ENOMEM; + goto fail; + } + + /* + * Look for the starting slot to place the rules. + * The two slots we reserve must be contiguous. 
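+ *
+ * [Editor's note: an illustrative worked example, not part of the
+ * original patch.] For MAC address 00:11:22:33:44:55 the code below
+ * loads the pair of slots as:
+ *
+ *	rulep[i].mask_value   = 0x00112233	first four octets
+ *	rulep[i+1].mask_value = 0xffff4455	16-bit mask | last two
+ *
+ * Slot i carries RECV_RULE_CTL_AND, which evidently chains it to slot
+ * i+1 so that all six octets must match; hence the requirement that
+ * the two slots be contiguous.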
+ */ + for (i = 0; i + 1 < RECV_RULES_NUM_MAX; i++) + if ((rulep[i].control & RECV_RULE_CTL_ENABLE) == 0 && + (rulep[i+1].control & RECV_RULE_CTL_ENABLE) == 0) + break; + + ASSERT(i + 1 < RECV_RULES_NUM_MAX); + + bcopy(mac_addr, &tmp32, sizeof (tmp32)); + rulep[i].mask_value = ntohl(tmp32); + rulep[i].control = RULE_DEST_MAC_1(ring) | RECV_RULE_CTL_AND; + bge_reg_put32(bgep, RECV_RULE_MASK_REG(i), rulep[i].mask_value); + bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(i), rulep[i].control); + + bcopy(mac_addr + 4, &tmp16, sizeof (tmp16)); + rulep[i+1].mask_value = 0xffff0000 | ntohs(tmp16); + rulep[i+1].control = RULE_DEST_MAC_2(ring); + bge_reg_put32(bgep, RECV_RULE_MASK_REG(i+1), rulep[i+1].mask_value); + bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(i+1), rulep[i+1].control); + rinfop->start = i; + rinfop->count = 2; + + rrp->mac_addr_rule = rinfop; + bcopy(mac_addr, rrp->mac_addr_val, ETHERADDRL); + + return (0); + +fail: + /* Clear the address just set */ + (void) bge_unicst_set(bgep, zero_addr, slot); + mutex_enter(bgep->genlock); + bgep->curr_addr[slot].set = B_FALSE; + bgep->unicst_addr_avail++; + mutex_exit(bgep->genlock); + + return (err); +} + +/* + * Stop classifying packets matching the MAC address to the specified ring. + */ +static int +bge_remmac(void *arg, const uint8_t *mac_addr) +{ + recv_ring_t *rrp = (recv_ring_t *)arg; + bge_t *bgep = rrp->bgep; + bge_recv_rule_t *rulep = bgep->recv_rules; + bge_rule_info_t *rinfop = rrp->mac_addr_rule; + int start; + int slot; + int err; + + /* + * Remove the MAC address from its slot. + */ + mutex_enter(bgep->genlock); + slot = bge_unicst_find(bgep, mac_addr); + if (slot == -1) { + mutex_exit(bgep->genlock); + return (EINVAL); + } + + ASSERT(bgep->curr_addr[slot].set); + mutex_exit(bgep->genlock); + + if ((err = bge_unicst_set(bgep, zero_addr, slot)) != 0) + return (err); + + if (rinfop == NULL || ether_cmp(mac_addr, rrp->mac_addr_val) != 0) + return (EINVAL); + + start = rinfop->start; + rulep[start].mask_value = 0; + rulep[start].control = 0; + bge_reg_put32(bgep, RECV_RULE_MASK_REG(start), rulep[start].mask_value); + bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(start), rulep[start].control); + start++; + rulep[start].mask_value = 0; + rulep[start].control = 0; + bge_reg_put32(bgep, RECV_RULE_MASK_REG(start), rulep[start].mask_value); + bge_reg_put32(bgep, RECV_RULE_CONTROL_REG(start), rulep[start].control); + + kmem_free(rinfop, sizeof (bge_rule_info_t)); + rrp->mac_addr_rule = NULL; + bzero(rrp->mac_addr_val, ETHERADDRL); + + mutex_enter(bgep->genlock); + bgep->curr_addr[slot].set = B_FALSE; + bgep->unicst_addr_avail++; + mutex_exit(bgep->genlock); + + return (0); +} + +static int +bge_flag_intr_enable(mac_intr_handle_t ih) +{ + recv_ring_t *rrp = (recv_ring_t *)ih; + bge_t *bgep = rrp->bgep; + + mutex_enter(bgep->genlock); + rrp->poll_flag = 0; + mutex_exit(bgep->genlock); + + return (0); +} + +static int +bge_flag_intr_disable(mac_intr_handle_t ih) +{ + recv_ring_t *rrp = (recv_ring_t *)ih; + bge_t *bgep = rrp->bgep; + + mutex_enter(bgep->genlock); + rrp->poll_flag = 1; + mutex_exit(bgep->genlock); + + return (0); +} + +static int +bge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num) +{ + recv_ring_t *rx_ring; + + rx_ring = (recv_ring_t *)rh; + mutex_enter(rx_ring->rx_lock); + rx_ring->ring_gen_num = mr_gen_num; + mutex_exit(rx_ring->rx_lock); + return (0); +} + + +/* + * Callback funtion for MAC layer to register all rings + * for given ring_group, noted by rg_index. 
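+ *
+ * [Editor's note: an illustrative sketch, not part of the original
+ * patch; the framework-side loop shown is an assumption inferred from
+ * the MAC_CAPAB_RINGS data filled in by bge_m_getcapab().] The mac
+ * layer is expected to call back roughly as:
+ *
+ *	for (g = 0; g < cap_rings->mr_gnum; g++) {
+ *		cap_rings->mr_gget(arg, MAC_RING_TYPE_RX, g, &ginfo, gh);
+ *		for (r = 0; r < ginfo.mgi_count; r++)
+ *			cap_rings->mr_rget(arg, MAC_RING_TYPE_RX, g, r,
+ *			    &rinfo, rh);
+ *	}
+ *
+ * Since bge advertises static groups of one ring each (mgi_count is
+ * set to 1 in bge_fill_group()), the ring index here is always 0.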
+ */ +void +bge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + bge_t *bgep = arg; + mac_intr_t *mintr; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + recv_ring_t *rx_ring; + ASSERT(rg_index >= 0 && rg_index < MIN(bgep->chipid.rx_rings, + MAC_ADDRESS_REGS_MAX) && index == 0); + + rx_ring = &bgep->recv[rg_index]; + rx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)rx_ring; + infop->mri_start = bge_ring_start; + infop->mri_stop = NULL; + infop->mri_poll = bge_poll_ring; + + mintr = &infop->mri_intr; + mintr->mi_handle = (mac_intr_handle_t)rx_ring; + mintr->mi_enable = bge_flag_intr_enable; + mintr->mi_disable = bge_flag_intr_disable; + + break; + } + case MAC_RING_TYPE_TX: + default: + ASSERT(0); + break; + } +} + +/* + * Fill infop passed as argument + * fill in respective ring_group info + * Each group has a single ring in it. We keep it simple + * and use the same internal handle for rings and groups. + */ +void +bge_fill_group(void *arg, mac_ring_type_t rtype, const int rg_index, + mac_group_info_t *infop, mac_group_handle_t gh) +{ + bge_t *bgep = arg; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + recv_ring_t *rx_ring; + + ASSERT(rg_index >= 0 && rg_index < MIN(bgep->chipid.rx_rings, + MAC_ADDRESS_REGS_MAX)); + rx_ring = &bgep->recv[rg_index]; + rx_ring->ring_group_handle = gh; + + infop->mgi_driver = (mac_group_driver_t)rx_ring; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = bge_addmac; + infop->mgi_remmac = bge_remmac; + infop->mgi_count = 1; + break; + } + case MAC_RING_TYPE_TX: + default: + ASSERT(0); + break; + } +} + /*ARGSUSED*/ static boolean_t bge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) @@ -1589,38 +1707,20 @@ bge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) *txflags = HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM; break; } + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE stating that we support polling is sufficient. - */ - break; - - case MAC_CAPAB_MULTIADDRESS: { - multiaddress_capab_t *mmacp = cap_data; + /* Temporarily disable multiple tx rings. */ + if (cap_rings->mr_type != MAC_RING_TYPE_RX) + return (B_FALSE); - mutex_enter(bgep->genlock); - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. 
- */ - mmacp->maddr_naddr = bgep->unicst_addr_total - 1; - mmacp->maddr_naddrfree = bgep->unicst_addr_avail; - /* No multiple factory addresses, set mma_flag to 0 */ - mmacp->maddr_flag = 0; - mmacp->maddr_handle = bgep; - mmacp->maddr_add = bge_m_unicst_add; - mmacp->maddr_remove = bge_m_unicst_remove; - mmacp->maddr_modify = bge_m_unicst_modify; - mmacp->maddr_get = bge_m_unicst_get; - mmacp->maddr_reserve = NULL; - mutex_exit(bgep->genlock); + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = cap_rings->mr_gnum = + MIN(bgep->chipid.rx_rings, MAC_ADDRESS_REGS_MAX); + cap_rings->mr_rget = bge_fill_ring; + cap_rings->mr_gget = bge_fill_group; break; } - default: return (B_FALSE); } @@ -1889,43 +1989,6 @@ bge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) } } -static void -bge_resources_add(bge_t *bgep, time_t time, uint_t pkt_cnt) -{ - - recv_ring_t *rrp; - mac_rx_fifo_t mrf; - int ring; - - /* - * Register Rx rings as resources and save mac - * resource id for future reference - */ - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = bge_chip_blank; - mrf.mrf_arg = (void *)bgep; - mrf.mrf_normal_blank_time = time; - mrf.mrf_normal_pkt_count = pkt_cnt; - - for (ring = 0; ring < bgep->chipid.rx_rings; ring++) { - rrp = &bgep->recv[ring]; - rrp->handle = mac_resource_add(bgep->mh, - (mac_resource_t *)&mrf); - } -} - -static void -bge_m_resources(void *arg) -{ - bge_t *bgep = arg; - - mutex_enter(bgep->genlock); - - bge_resources_add(bgep, bgep->chipid.rx_ticks_norm, - bgep->chipid.rx_count_norm); - mutex_exit(bgep->genlock); -} - /* * ========== Per-instance setup/teardown code ========== */ @@ -3404,29 +3467,23 @@ bge_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) * Determine whether to override the chip's own MAC address */ bge_find_mac_address(bgep, cidp); - ethaddr_copy(cidp->vendor_addr.addr, bgep->curr_addr[0].addr); - bgep->curr_addr[0].set = B_TRUE; bgep->unicst_addr_total = MAC_ADDRESS_REGS_MAX; - /* - * Address available is one less than MAX - * as primary address is not advertised - * as a multiple MAC address. - */ - bgep->unicst_addr_avail = MAC_ADDRESS_REGS_MAX - 1; + bgep->unicst_addr_avail = MAC_ADDRESS_REGS_MAX; if ((macp = mac_alloc(MAC_VERSION)) == NULL) goto attach_fail; macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; macp->m_driver = bgep; macp->m_dip = devinfo; - macp->m_src_addr = bgep->curr_addr[0].addr; + macp->m_src_addr = cidp->vendor_addr.addr; macp->m_callbacks = &bge_m_callbacks; macp->m_min_sdu = 0; macp->m_max_sdu = cidp->ethmax_size - sizeof (struct ether_header); macp->m_margin = VLAN_TAGSZ; macp->m_priv_props = bge_priv_prop; macp->m_priv_prop_count = BGE_MAX_PRIV_PROPS; + macp->m_v12n = MAC_VIRT_LEVEL1; /* * Finally, we're ready to register ourselves with the MAC layer diff --git a/usr/src/uts/common/io/bge/bge_recv2.c b/usr/src/uts/common/io/bge/bge_recv2.c index 60df201711..2c8bb20f71 100644 --- a/usr/src/uts/common/io/bge/bge_recv2.c +++ b/usr/src/uts/common/io/bge/bge_recv2.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "bge_impl.h" #define U32TOPTR(x) ((void *)(uintptr_t)(uint32_t)(x)) @@ -274,7 +272,9 @@ error: * the chip to indicate the packets it has accepted from the ring. */ static mblk_t *bge_receive_ring(bge_t *bgep, recv_ring_t *rrp); +#ifndef DEBUG #pragma inline(bge_receive_ring) +#endif static mblk_t * bge_receive_ring(bge_t *bgep, recv_ring_t *rrp) @@ -328,36 +328,61 @@ bge_receive_ring(bge_t *bgep, recv_ring_t *rrp) } /* - * Receive all packets in all rings. 
- * - * To give priority to low-numbered rings, whenever we have received any - * packets in any ring except 0, we restart scanning again from ring 0. - * Thus, for example, if rings 0, 3, and 10 are carrying traffic, the - * pattern of receives might go 0, 3, 10, 3, 0, 10, 0: - * - * 0 found some - receive them - * 1..2 none found - * 3 found some - receive them and restart scan - * 0..9 none found - * 10 found some - receive them and restart scan - * 0..2 none found - * 3 found some more - receive them and restart scan - * 0 found some more - receive them - * 1..9 none found - * 10 found some more - receive them and restart scan - * 0 found some more - receive them - * 1..15 none found - * - * The routine returns only when a complete scan has been performed either - * without finding any packets to receive or BGE_MAXPKT_RCVED packets were - * received from ring 0 and other rings (if used) are empty. + * XXX: Poll a particular ring. The implementation is incomplete. + * Once the ring interrupts are disabled, we need to do bge_recyle() + * for the ring as well and re enable the ring interrupt automatically + * if the poll doesn't find any packets in the ring. We need to + * have MSI-X interrupts support for this. * - * Note that driver-defined locks may *NOT* be held across calls - * to gld_recv(). - * - * Note: the expression (BGE_RECV_RINGS_USED > 1), yields a compile-time - * constant and allows the compiler to optimise away the outer do-loop - * if only one receive ring is being used. + * The basic poll policy is that rings that are dealing with explicit + * flows (like TCP or some service) and are marked as such should + * have their own MSI-X interrupt per ring. bge_intr() should leave + * that interrupt disabled after an upcall. The ring is in poll mode. + * When a poll thread comes down and finds nothing, the MSI-X interrupt + * is automatically enabled. Squeue needs to deal with the race of + * a new interrupt firing and reaching before poll thread returns. + */ +mblk_t * +bge_poll_ring(void *arg, int bytes_to_pickup) +{ + recv_ring_t *rrp = arg; + bge_t *bgep = rrp->bgep; + bge_rbd_t *hw_rbd_p; + uint64_t slot; + mblk_t *head; + mblk_t **tail; + mblk_t *mp; + size_t sz = 0; + + mutex_enter(rrp->rx_lock); + + /* + * Sync (all) the receive ring descriptors + * before accepting the packets they describe + */ + DMA_SYNC(rrp->desc, DDI_DMA_SYNC_FORKERNEL); + hw_rbd_p = DMA_VPTR(rrp->desc); + head = NULL; + tail = &head; + slot = rrp->rx_next; + + /* Note: volatile */ + while ((slot != *rrp->prod_index_p) && (sz <= bytes_to_pickup)) { + if ((mp = bge_receive_packet(bgep, &hw_rbd_p[slot])) != NULL) { + *tail = mp; + sz += msgdsize(mp); + tail = &mp->b_next; + } + rrp->rx_next = slot = NEXT(slot, rrp->desc.nslots); + } + + bge_mbx_put(bgep, rrp->chip_mbx_reg, rrp->rx_next); + mutex_exit(rrp->rx_lock); + return (head); +} + +/* + * Receive all packets in all rings. */ void bge_receive(bge_t *bgep, bge_status_t *bsp); #pragma no_inline(bge_receive) @@ -366,41 +391,31 @@ void bge_receive(bge_t *bgep, bge_status_t *bsp) { recv_ring_t *rrp; - uint64_t ring; - uint64_t rx_rings = bgep->chipid.rx_rings; + uint64_t index; mblk_t *mp; -restart: - ring = 0; - rrp = &bgep->recv[ring]; - do { + for (index = 0; index < bgep->chipid.rx_rings; index++) { + /* + * Start from the first ring. 
+ */ + rrp = &bgep->recv[index]; + /* * For each ring, (rrp->prod_index_p) points to the * proper index within the status block (which has * already been sync'd by the caller) */ - ASSERT(rrp->prod_index_p == RECV_INDEX_P(bsp, ring)); + ASSERT(rrp->prod_index_p == RECV_INDEX_P(bsp, index)); - if (*rrp->prod_index_p == rrp->rx_next) + if (*rrp->prod_index_p == rrp->rx_next || rrp->poll_flag) continue; /* no packets */ if (mutex_tryenter(rrp->rx_lock) == 0) continue; /* already in process */ mp = bge_receive_ring(bgep, rrp); mutex_exit(rrp->rx_lock); - if (mp != NULL) { - mac_rx(bgep->mh, rrp->handle, mp); - - /* - * Restart from ring 0, if the driver is compiled - * with multiple rings and we're not on ring 0 now - */ - if (rx_rings > 1 && ring > 0) - goto restart; - } - - /* - * Loop over all rings (if there *are* multiple rings) - */ - } while (++rrp, ++ring < rx_rings); + if (mp != NULL) + mac_rx_ring(bgep->mh, rrp->ring_handle, mp, + rrp->ring_gen_num); + } } diff --git a/usr/src/uts/common/io/bge/bge_send.c b/usr/src/uts/common/io/bge/bge_send.c index a8c6f16ac2..01b70fd13d 100644 --- a/usr/src/uts/common/io/bge/bge_send.c +++ b/usr/src/uts/common/io/bge/bge_send.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "bge_impl.h" @@ -484,11 +482,11 @@ start_tx: mutex_exit(srp->tx_lock); } -static boolean_t -bge_send(bge_t *bgep, mblk_t *mp) +mblk_t * +bge_ring_tx(void *arg, mblk_t *mp) { - uint_t ring = 0; /* use ring 0 */ - send_ring_t *srp; + send_ring_t *srp = arg; + bge_t *bgep = srp->bgep; struct ether_vlan_header *ehp; bge_queue_item_t *txbuf_item; sw_txbuf_t *txbuf; @@ -499,7 +497,6 @@ bge_send(bge_t *bgep, mblk_t *mp) char *pbuf; ASSERT(mp->b_next == NULL); - srp = &bgep->send[ring]; /* * Get a s/w tx buffer first @@ -510,7 +507,7 @@ bge_send(bge_t *bgep, mblk_t *mp) srp->tx_nobuf++; bgep->tx_resched_needed = B_TRUE; bge_send_serial(bgep, srp); - return (B_FALSE); + return (mp); } /* @@ -564,12 +561,23 @@ bge_send(bge_t *bgep, mblk_t *mp) */ bge_send_serial(bgep, srp); + srp->pushed_bytes += MBLKL(mp); + /* * We've copied the contents, the message can be freed right away */ freemsg(mp); + return (NULL); +} + +static mblk_t * +bge_send(bge_t *bgep, mblk_t *mp) +{ + send_ring_t *ring; + + ring = &bgep->send[0]; /* ring 0 */ - return (B_TRUE); + return (bge_ring_tx(ring, mp)); } uint_t @@ -621,7 +629,7 @@ bge_m_tx(void *arg, mblk_t *mp) next = mp->b_next; mp->b_next = NULL; - if (!bge_send(bgep, mp)) { + if ((mp = bge_send(bgep, mp)) != NULL) { mp->b_next = next; break; } diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index 615006d86e..55e4d161db 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -31,14 +31,17 @@ #include <sys/mkdev.h> #include <sys/modctl.h> #include <sys/stat.h> -#include <sys/vlan.h> -#include <sys/mac.h> #include <sys/dld_impl.h> #include <sys/dls_impl.h> #include <sys/softmac.h> -#include <sys/vlan.h> -#include <sys/policy.h> +#include <sys/mac.h> +#include <sys/mac_ether.h> +#include <sys/mac_client.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_client_priv.h> #include <inet/common.h> +#include <sys/policy.h> +#include <sys/priv_names.h> static void drv_init(void); static int drv_fini(void); @@ -150,6 +153,7 @@ drv_init(void) { drv_secobj_init(); dld_str_init(); + /* * Create a hash table for autopush configuration. 
*/ @@ -179,7 +183,6 @@ drv_fini(void) rw_enter(&dld_ap_hash_lock, RW_READER); mod_hash_walk(dld_ap_hashp, drv_ap_exist, &exist); rw_exit(&dld_ap_hash_lock); - if (exist) return (EBUSY); @@ -314,24 +317,33 @@ drv_open(dev_t *devp, int flag, int sflag, cred_t *credp) */ /* ARGSUSED */ static int -drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_attr_t *diap = karg; dls_dl_handle_t dlh; - dls_vlan_t *dvp; + dls_link_t *dlp; int err; + mac_perim_handle_t mph; if ((err = dls_devnet_hold_tmp(diap->dia_linkid, &dlh)) != 0) return (err); - if ((err = dls_vlan_hold(dls_devnet_mac(dlh), - dls_devnet_vid(dlh), &dvp, B_FALSE, B_FALSE)) != 0) { + if ((err = mac_perim_enter_by_macname( + dls_devnet_mac(dlh), &mph)) != 0) { dls_devnet_rele_tmp(dlh); return (err); } - mac_sdu_get(dvp->dv_dlp->dl_mh, NULL, &diap->dia_max_sdu); - dls_vlan_rele(dvp); + if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) { + mac_perim_exit(mph); + dls_devnet_rele_tmp(dlh); + return (err); + } + + mac_sdu_get(dlp->dl_mh, NULL, &diap->dia_max_sdu); + + dls_link_rele(dlp); + mac_perim_exit(mph); dls_devnet_rele_tmp(dlh); return (0); @@ -342,7 +354,7 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred) */ /* ARGSUSED */ static int -drv_ioc_phys_attr(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_phys_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_phys_attr_t *dipp = karg; int err; @@ -387,64 +399,184 @@ drv_ioc_phys_attr(void *karg, intptr_t arg, int mode, cred_t *cred) return (0); } +/* ARGSUSED */ +static int +drv_ioc_hwgrpget(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_hwgrpget_t *hwgrpp = karg; + dld_hwgrpinfo_t hwgrp, *hip; + mac_handle_t mh = NULL; + int i, err, grpnum; + uint_t bytes_left; + + hwgrpp->dih_n_groups = 0; + err = mac_open_by_linkid(hwgrpp->dih_linkid, &mh); + if (err != 0) + goto done; + + hip = (dld_hwgrpinfo_t *) + ((uchar_t *)arg + sizeof (dld_ioc_hwgrpget_t)); + bytes_left = hwgrpp->dih_size; + grpnum = mac_hwgrp_num(mh); + for (i = 0; i < grpnum; i++) { + if (sizeof (dld_hwgrpinfo_t) > bytes_left) { + err = ENOSPC; + goto done; + } + + bzero(&hwgrp, sizeof (hwgrp)); + bcopy(mac_name(mh), hwgrp.dhi_link_name, + sizeof (hwgrp.dhi_link_name)); + mac_get_hwgrp_info(mh, i, &hwgrp.dhi_grp_num, + &hwgrp.dhi_n_rings, &hwgrp.dhi_grp_type, + &hwgrp.dhi_n_clnts, hwgrp.dhi_clnts); + if (copyout(&hwgrp, hip, sizeof (hwgrp)) != 0) { + err = EFAULT; + goto done; + } + + hip++; + bytes_left -= sizeof (dld_hwgrpinfo_t); + } + +done: + if (mh != NULL) + dld_mac_close(mh); + if (err == 0) + hwgrpp->dih_n_groups = grpnum; + return (err); +} + +/* ARGSUSED */ +static int +drv_ioc_macaddrget(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_macaddrget_t *magp = karg; + dld_macaddrinfo_t mai, *maip; + mac_handle_t mh = NULL; + int i, err; + uint_t bytes_left; + boolean_t is_used; + + magp->dig_count = 0; + err = mac_open_by_linkid(magp->dig_linkid, &mh); + if (err != 0) + goto done; + + maip = (dld_macaddrinfo_t *) + ((uchar_t *)arg + sizeof (dld_ioc_macaddrget_t)); + bytes_left = magp->dig_size; + + for (i = 0; i < mac_addr_factory_num(mh) + 1; i++) { + if (sizeof (dld_macaddrinfo_t) > bytes_left) { + err = ENOSPC; + goto done; + } + + bzero(&mai, sizeof (mai)); + + if (i == 0) { + /* primary MAC address */ + mac_unicast_primary_get(mh, mai.dmi_addr); + mai.dmi_addrlen = mac_addr_len(mh); + 
mac_unicast_primary_info(mh, mai.dmi_client_name, + &is_used); + } else { + /* factory MAC address slot */ + mac_addr_factory_value(mh, i, mai.dmi_addr, + &mai.dmi_addrlen, mai.dmi_client_name, &is_used); + } + + mai.dmi_slot = i; + if (is_used) + mai.dmi_flags |= DLDIOCMACADDR_USED; + + if (copyout(&mai, maip, sizeof (mai)) != 0) { + err = EFAULT; + goto done; + } + + maip++; + bytes_left -= sizeof (dld_macaddrinfo_t); + } + +done: + if (mh != NULL) + dld_mac_close(mh); + if (err == 0) + magp->dig_count = mac_addr_factory_num(mh) + 1; + return (err); +} + /* - * DLDIOC_SETPROP + * DLDIOC_SET/GETPROP */ static int -drv_ioc_prop_common(dld_ioc_macprop_t *dipp, intptr_t arg, boolean_t set, +drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, int mode) { - int err = EINVAL; - size_t dsize; - dld_ioc_macprop_t *kdipp; - dls_dl_handle_t dlh; - dls_vlan_t *dvp; - datalink_id_t linkid; + int err = EINVAL; + dls_dl_handle_t dlh = NULL; + dls_link_t *dlp = NULL; + mac_perim_handle_t mph = NULL; mac_prop_t macprop; - uchar_t *cp; - struct dlautopush *dlap; - dld_ioc_zid_t *dzp; + dld_ioc_macprop_t *kprop; + datalink_id_t linkid; + uint_t dsize; + /* - * We only use pr_valsize from dipp, as the caller only did a + * We only use pr_valsize from prop, as the caller only did a * copyin() for sizeof (dld_ioc_prop_t), which doesn't cover * the property data. We copyin the full dld_ioc_prop_t - * including the data into kdipp down below. + * including the data into kprop down below. */ - dsize = sizeof (dld_ioc_macprop_t) + dipp->pr_valsize - 1; - if (dsize < dipp->pr_valsize) + dsize = sizeof (dld_ioc_macprop_t) + prop->pr_valsize - 1; + if (dsize < prop->pr_valsize) return (EINVAL); /* * The property data is variable size, so we need to allocate * a buffer for kernel use as this data was not part of the - * dipp allocation and copyin() done by the framework. + * prop allocation and copyin() done by the framework. 
+	 * prop allocation and copyin() done by the framework.
*/ - if ((kdipp = kmem_alloc(dsize, KM_NOSLEEP)) == NULL) + if ((kprop = kmem_alloc(dsize, KM_NOSLEEP)) == NULL) return (ENOMEM); - if (ddi_copyin((void *)arg, kdipp, dsize, mode) != 0) { + + if (ddi_copyin((void *)arg, kprop, dsize, mode) != 0) { err = EFAULT; goto done; } - linkid = kdipp->pr_linkid; + linkid = kprop->pr_linkid; + if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0) + goto done; + + if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh), + &mph)) != 0) { + goto done; + } - switch (dipp->pr_num) { - case MAC_PROP_ZONE: + switch (kprop->pr_num) { + case MAC_PROP_ZONE: { if (set) { - dzp = (dld_ioc_zid_t *)kdipp->pr_val; + dld_ioc_zid_t *dzp = (dld_ioc_zid_t *)kprop->pr_val; + err = dls_devnet_setzid(dzp->diz_link, dzp->diz_zid); goto done; } else { - kdipp->pr_perm_flags = MAC_PROP_PERM_RW; - cp = (uchar_t *)kdipp->pr_val; - err = dls_devnet_getzid(linkid, (zoneid_t *)cp); + kprop->pr_perm_flags = MAC_PROP_PERM_RW; + err = dls_devnet_getzid(linkid, + (zoneid_t *)kprop->pr_val); goto done; } - case MAC_PROP_AUTOPUSH: + } + case MAC_PROP_AUTOPUSH: { + struct dlautopush *dlap = + (struct dlautopush *)kprop->pr_val; + if (set) { - if (dipp->pr_valsize != 0) { - dlap = (struct dlautopush *)kdipp->pr_val; + if (kprop->pr_valsize != 0) { err = drv_ioc_setap(linkid, dlap); goto done; } else { @@ -452,125 +584,73 @@ drv_ioc_prop_common(dld_ioc_macprop_t *dipp, intptr_t arg, boolean_t set, goto done; } } else { - kdipp->pr_perm_flags = MAC_PROP_PERM_RW; - dlap = (struct dlautopush *)kdipp->pr_val; + kprop->pr_perm_flags = MAC_PROP_PERM_RW; err = drv_ioc_getap(linkid, dlap); goto done; } - + } default: break; } - if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0) - goto done; - - if ((err = dls_vlan_hold(dls_devnet_mac(dlh), - dls_devnet_vid(dlh), &dvp, B_FALSE, B_FALSE)) != 0) { - dls_devnet_rele_tmp(dlh); + if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; - } - macprop.mp_name = kdipp->pr_name; - macprop.mp_id = kdipp->pr_num; - macprop.mp_flags = kdipp->pr_flags; + macprop.mp_name = kprop->pr_name; + macprop.mp_id = kprop->pr_num; + macprop.mp_flags = kprop->pr_flags; if (set) { - err = mac_set_prop(dvp->dv_dlp->dl_mh, &macprop, - kdipp->pr_val, kdipp->pr_valsize); + err = mac_set_prop(dlp->dl_mh, &macprop, kprop->pr_val, + kprop->pr_valsize); } else { - kdipp->pr_perm_flags = MAC_PROP_PERM_RW; - err = mac_get_prop(dvp->dv_dlp->dl_mh, &macprop, - kdipp->pr_val, kdipp->pr_valsize, &kdipp->pr_perm_flags); + kprop->pr_perm_flags = MAC_PROP_PERM_RW; + err = mac_get_prop(dlp->dl_mh, &macprop, kprop->pr_val, + kprop->pr_valsize, &kprop->pr_perm_flags); } - dls_vlan_rele(dvp); - dls_devnet_rele_tmp(dlh); done: if (!set && err == 0 && - ddi_copyout(kdipp, (void *)arg, dsize, mode) != 0) + ddi_copyout(kprop, (void *)arg, dsize, mode) != 0) err = EFAULT; - kmem_free(kdipp, dsize); - return (err); -} -/* ARGSUSED */ -static int -drv_ioc_setprop(void *karg, intptr_t arg, int mode, cred_t *cred) -{ - return (drv_ioc_prop_common(karg, arg, B_TRUE, mode)); -} + if (dlp != NULL) + dls_link_rele(dlp); -/* ARGSUSED */ -static int -drv_ioc_getprop(void *karg, intptr_t arg, int mode, cred_t *cred) -{ - return (drv_ioc_prop_common(karg, arg, B_FALSE, mode)); -} + if (mph != NULL) { + int32_t cpuid; + void *mdip = NULL; -/* - * DLDIOC_CREATE_VLAN - */ -/* ARGSUSED */ -static int -drv_ioc_create_vlan(void *karg, intptr_t arg, int mode, cred_t *cred) -{ - dld_ioc_create_vlan_t *dicp = karg; + if (dlp != NULL && set && err == 0) { + cpuid = mac_client_intr_cpu(dlp->dl_mch); + mdip 
= mac_get_devinfo(dlp->dl_mh); + } - return (dls_devnet_create_vlan(dicp->dic_vlanid, dicp->dic_linkid, - dicp->dic_vid, dicp->dic_force)); + mac_perim_exit(mph); + + if (mdip != NULL) + mac_client_set_intr_cpu(mdip, dlp->dl_mch, cpuid); + } + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); + + if (kprop != NULL) + kmem_free(kprop, dsize); + return (err); } -/* - * DLDIOC_DELETE_VLAN - */ /* ARGSUSED */ static int -drv_ioc_delete_vlan(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { - dld_ioc_delete_vlan_t *didp = karg; - - return (dls_devnet_destroy_vlan(didp->did_linkid)); + return (drv_ioc_prop_common(karg, arg, B_TRUE, mode)); } -/* - * DLDIOC_VLAN_ATTR - */ /* ARGSUSED */ static int -drv_ioc_vlan_attr(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { - dld_ioc_vlan_attr_t *divp = karg; - dls_dl_handle_t dlh; - uint16_t vid; - dls_vlan_t *dvp; - int err; - - /* - * Hold this link to prevent it from being deleted. - */ - if ((err = dls_devnet_hold_tmp(divp->div_vlanid, &dlh)) != 0) - return (err); - - if ((vid = dls_devnet_vid(dlh)) == VLAN_ID_NONE) { - dls_devnet_rele_tmp(dlh); - return (EINVAL); - } - - err = dls_vlan_hold(dls_devnet_mac(dlh), vid, &dvp, B_FALSE, B_FALSE); - if (err != 0) { - dls_devnet_rele_tmp(dlh); - return (err); - } - - divp->div_linkid = dls_devnet_linkid(dlh); - divp->div_implicit = !dls_devnet_is_explicit(dlh); - divp->div_vid = vid; - divp->div_force = dvp->dv_force; - - dls_vlan_rele(dvp); - dls_devnet_rele_tmp(dlh); - return (0); + return (drv_ioc_prop_common(karg, arg, B_FALSE, mode)); } /* @@ -581,7 +661,7 @@ drv_ioc_vlan_attr(void *karg, intptr_t arg, int mode, cred_t *cred) */ /* ARGSUSED */ static int -drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_rename_t *dir = karg; mod_hash_key_t key; @@ -719,7 +799,7 @@ drv_ioc_clrap(datalink_id_t linkid) */ /* ARGSUSED */ static int -drv_ioc_doorserver(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_doorserver(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_door_t *did = karg; @@ -727,6 +807,76 @@ drv_ioc_doorserver(void *karg, intptr_t arg, int mode, cred_t *cred) } /* + * DLDIOC_USAGELOG + */ +/* ARGSUSED */ +static int +drv_ioc_usagelog(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + dld_ioc_usagelog_t *log_info = (dld_ioc_usagelog_t *)karg; + + if (log_info->ul_type < MAC_LOGTYPE_LINK || + log_info->ul_type > MAC_LOGTYPE_FLOW) + return (EINVAL); + + if (log_info->ul_onoff) + mac_start_logusage(log_info->ul_type, log_info->ul_interval); + else + mac_stop_logusage(log_info->ul_type); + return (0); +} + +/* + * Process a DLDIOC_ADDFLOW request. + */ +/* ARGSUSED */ +static int +drv_ioc_addflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_addflow_t *afp = karg; + + return (dld_add_flow(afp->af_linkid, afp->af_name, + &afp->af_flow_desc, &afp->af_resource_props)); +} + +/* + * Process a DLDIOC_REMOVEFLOW request. + */ +/* ARGSUSED */ +static int +drv_ioc_removeflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_removeflow_t *rfp = karg; + + return (dld_remove_flow(rfp->rf_name)); +} + +/* + * Process a DLDIOC_MODIFYFLOW request. 
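+ * Only the resource properties (mac_resource_props_t) of the named
+ * flow are updated; its flow descriptor is left untouched.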
+ */ +/* ARGSUSED */ +static int +drv_ioc_modifyflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_modifyflow_t *mfp = karg; + + return (dld_modify_flow(mfp->mf_name, &mfp->mf_resource_props)); +} + +/* + * Process a DLDIOC_WALKFLOW request. + */ +/* ARGSUSED */ +static int +drv_ioc_walkflow(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + dld_ioc_walkflow_t *wfp = karg; + + return (dld_walk_flow(wfp, arg)); +} + +/* * Check for GLDv3 autopush information. There are three cases: * * 1. If devp points to a GLDv3 datalink and it has autopush configuration, @@ -809,7 +959,7 @@ drv_secobj_fini(void) /* ARGSUSED */ static int -drv_ioc_secobj_set(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_secobj_set(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_secobj_set_t *ssp = karg; dld_secobj_t *sobjp, *objp; @@ -885,14 +1035,13 @@ drv_secobj_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) /* ARGSUSED */ static int -drv_ioc_secobj_get(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_secobj_get(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { dld_ioc_secobj_get_t *sgp = karg; dld_secobj_t *sobjp, *objp; int err; sobjp = &sgp->sg_obj; - if (sobjp->so_name[DLD_SECOBJ_NAME_MAX - 1] != '\0') return (EINVAL); @@ -932,7 +1081,8 @@ drv_ioc_secobj_get(void *karg, intptr_t arg, int mode, cred_t *cred) /* ARGSUSED */ static int -drv_ioc_secobj_unset(void *karg, intptr_t arg, int mode, cred_t *cred) +drv_ioc_secobj_unset(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) { dld_ioc_secobj_unset_t *sup = karg; dld_secobj_t *objp; @@ -959,32 +1109,56 @@ drv_ioc_secobj_unset(void *karg, intptr_t arg, int mode, cred_t *cred) return (0); } +static int +drv_check_policy(dld_ioc_info_t *info, cred_t *cred) +{ + int i, err = 0; + + for (i = 0; info->di_priv[i] != NULL && i < DLD_MAX_PRIV; i++) { + if ((err = secpolicy_dld_ioctl(cred, info->di_priv[i], + "dld ioctl")) != 0) { + break; + } + } + if (err == 0) + return (0); + + return (secpolicy_net_config(cred, B_FALSE)); +} + static dld_ioc_info_t drv_ioc_list[] = { {DLDIOC_ATTR, DLDCOPYINOUT, sizeof (dld_ioc_attr_t), - drv_ioc_attr}, + drv_ioc_attr, {NULL}}, {DLDIOC_PHYS_ATTR, DLDCOPYINOUT, sizeof (dld_ioc_phys_attr_t), - drv_ioc_phys_attr}, - {DLDIOC_SECOBJ_SET, DLDCOPYIN | DLDDLCONFIG, - sizeof (dld_ioc_secobj_set_t), drv_ioc_secobj_set}, - {DLDIOC_SECOBJ_GET, DLDCOPYINOUT | DLDDLCONFIG, - sizeof (dld_ioc_secobj_get_t), drv_ioc_secobj_get}, - {DLDIOC_SECOBJ_UNSET, DLDCOPYIN | DLDDLCONFIG, - sizeof (dld_ioc_secobj_unset_t), drv_ioc_secobj_unset}, - {DLDIOC_CREATE_VLAN, DLDCOPYIN | DLDDLCONFIG, - sizeof (dld_ioc_create_vlan_t), drv_ioc_create_vlan}, - {DLDIOC_DELETE_VLAN, DLDCOPYIN | DLDDLCONFIG, - sizeof (dld_ioc_delete_vlan_t), - drv_ioc_delete_vlan}, - {DLDIOC_VLAN_ATTR, DLDCOPYINOUT, sizeof (dld_ioc_vlan_attr_t), - drv_ioc_vlan_attr}, - {DLDIOC_DOORSERVER, DLDCOPYIN | DLDDLCONFIG, sizeof (dld_ioc_door_t), - drv_ioc_doorserver}, - {DLDIOC_RENAME, DLDCOPYIN | DLDDLCONFIG, sizeof (dld_ioc_rename_t), - drv_ioc_rename}, + drv_ioc_phys_attr, {NULL}}, + {DLDIOC_SECOBJ_SET, DLDCOPYIN, sizeof (dld_ioc_secobj_set_t), + drv_ioc_secobj_set, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_SECOBJ_GET, DLDCOPYINOUT, sizeof (dld_ioc_secobj_get_t), + drv_ioc_secobj_get, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_SECOBJ_UNSET, DLDCOPYIN, sizeof (dld_ioc_secobj_unset_t), + drv_ioc_secobj_unset, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_DOORSERVER, DLDCOPYIN, sizeof (dld_ioc_door_t), + 
drv_ioc_doorserver, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_RENAME, DLDCOPYIN, sizeof (dld_ioc_rename_t), + drv_ioc_rename, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_MACADDRGET, DLDCOPYINOUT, sizeof (dld_ioc_macaddrget_t), + drv_ioc_macaddrget, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_ADDFLOW, DLDCOPYIN, sizeof (dld_ioc_addflow_t), + drv_ioc_addflow, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_REMOVEFLOW, DLDCOPYIN, sizeof (dld_ioc_removeflow_t), + drv_ioc_removeflow, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_MODIFYFLOW, DLDCOPYIN, sizeof (dld_ioc_modifyflow_t), + drv_ioc_modifyflow, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_WALKFLOW, DLDCOPYINOUT, sizeof (dld_ioc_walkflow_t), + drv_ioc_walkflow, {NULL}}, + {DLDIOC_USAGELOG, DLDCOPYIN, sizeof (dld_ioc_usagelog_t), + drv_ioc_usagelog, {PRIV_SYS_DL_CONFIG}}, + {DLDIOC_SETMACPROP, DLDCOPYIN, sizeof (dld_ioc_macprop_t), + drv_ioc_setprop, {PRIV_SYS_DL_CONFIG}}, {DLDIOC_GETMACPROP, DLDCOPYIN, sizeof (dld_ioc_macprop_t), - drv_ioc_getprop}, - {DLDIOC_SETMACPROP, DLDCOPYIN | DLDDLCONFIG, sizeof (dld_ioc_macprop_t), - drv_ioc_setprop} + drv_ioc_getprop, {NULL}}, + {DLDIOC_GETHWGRP, DLDCOPYINOUT, sizeof (dld_ioc_hwgrpget_t), + drv_ioc_hwgrpget, {PRIV_SYS_DL_CONFIG}}, }; typedef struct dld_ioc_modentry { @@ -1090,11 +1264,8 @@ drv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp) } info = &dim->dim_list[i]; - - if ((info->di_flags & DLDDLCONFIG) && secpolicy_dl_config(cred) != 0) { - err = EPERM; + if ((err = drv_check_policy(info, cred)) != 0) goto done; - } sz = info->di_argsize; if ((buf = kmem_zalloc(sz, KM_NOSLEEP)) == NULL) { @@ -1108,7 +1279,7 @@ drv_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp) goto done; } - err = info->di_func(buf, arg, mode, cred); + err = info->di_func(buf, arg, mode, cred, rvalp); if ((info->di_flags & DLDCOPYOUT) && ddi_copyout(buf, (void *)arg, sz, mode) != 0 && err == 0) diff --git a/usr/src/uts/common/io/dld/dld_flow.c b/usr/src/uts/common/io/dld/dld_flow.c new file mode 100644 index 0000000000..b57368484f --- /dev/null +++ b/usr/src/uts/common/io/dld/dld_flow.c @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Flows ioctls implementation. + */ + +#include <sys/dld.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> + +/* + * Implements flow add, remove, modify ioctls. 
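+ * Each is a thin wrapper around the corresponding mac_link_flow_*()
+ * entry point.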
+ */
+int
+dld_add_flow(datalink_id_t linkid, char *flow_name, flow_desc_t *flow_desc,
+    mac_resource_props_t *mrp)
+{
+	return (mac_link_flow_add(linkid, flow_name, flow_desc, mrp));
+}
+
+int
+dld_remove_flow(char *flow_name)
+{
+	return (mac_link_flow_remove(flow_name));
+}
+
+int
+dld_modify_flow(char *flow_name, mac_resource_props_t *mrp)
+{
+	return (mac_link_flow_modify(flow_name, mrp));
+}
+
+
+/*
+ * Callback function and structure used by dld_walk_flow().
+ */
+typedef struct flowinfo_state_s {
+	int		fi_bufsize;
+	int		fi_nflows;
+	uchar_t		*fi_fl;
+} flowinfo_state_t;
+
+static int
+dld_walk_flow_cb(mac_flowinfo_t *finfo, void *arg)
+{
+	flowinfo_state_t	*statep = arg;
+	dld_flowinfo_t		fi;
+
+	if (statep->fi_bufsize < sizeof (dld_flowinfo_t))
+		return (ENOSPC);
+
+	(void) strlcpy(fi.fi_flowname, finfo->fi_flow_name,
+	    sizeof (fi.fi_flowname));
+	fi.fi_linkid = finfo->fi_link_id;
+	fi.fi_flow_desc = finfo->fi_flow_desc;
+	fi.fi_resource_props = finfo->fi_resource_props;
+
+	if (copyout(&fi, statep->fi_fl, sizeof (fi)) != 0) {
+		return (EFAULT);
+	}
+	statep->fi_nflows++;
+	statep->fi_bufsize -= sizeof (dld_flowinfo_t);
+	statep->fi_fl += sizeof (dld_flowinfo_t);
+	return (0);
+}
+
+/*
+ * Implements the flow walk ioctl.
+ * Retrieves a specific flow or a list of flows from the specified link.
+ * ENOSPC is returned if a bigger buffer is needed.
+ */
+int
+dld_walk_flow(dld_ioc_walkflow_t *wf, intptr_t uaddr)
+{
+	flowinfo_state_t	state;
+	mac_flowinfo_t		finfo;
+	int			err = 0;
+
+	state.fi_bufsize = wf->wf_len;
+	state.fi_fl = (uchar_t *)uaddr + sizeof (*wf);
+	state.fi_nflows = 0;
+
+	if (wf->wf_name[0] == '\0') {
+		err = mac_link_flow_walk(wf->wf_linkid, dld_walk_flow_cb,
+		    &state);
+	} else {
+		err = mac_link_flow_info(wf->wf_name, &finfo);
+		if (err != 0)
+			return (err);
+
+		err = dld_walk_flow_cb(&finfo, &state);
+	}
+	wf->wf_nflows = state.fi_nflows;
+	return (err);
+}
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index 5bc1fc5322..2c3d0f7ecb 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -23,32 +23,19 @@
  * Use is subject to license terms.
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Driver */ - -#include <sys/types.h> -#include <sys/debug.h> #include <sys/sysmacros.h> -#include <sys/stream.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/strsun.h> -#include <sys/cpuvar.h> -#include <sys/dlpi.h> -#include <netinet/in.h> -#include <sys/sdt.h> #include <sys/strsubr.h> +#include <sys/strsun.h> #include <sys/vlan.h> -#include <sys/mac.h> -#include <sys/dls.h> -#include <sys/dld.h> #include <sys/dld_impl.h> -#include <sys/dls_soft_ring.h> +#include <sys/mac_client.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_client_priv.h> -typedef boolean_t proto_reqfunc_t(dld_str_t *, union DL_primitives *, mblk_t *); +typedef void proto_reqfunc_t(dld_str_t *, mblk_t *); static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req, @@ -56,13 +43,8 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req, proto_notify_req, proto_passive_req; -static void proto_poll_disable(dld_str_t *); -static boolean_t proto_poll_enable(dld_str_t *, dl_capab_dls_t *); - -static void proto_soft_ring_disable(dld_str_t *); -static boolean_t proto_soft_ring_enable(dld_str_t *, dl_capab_dls_t *); -static boolean_t proto_capability_advertise(dld_str_t *, mblk_t *); -static void proto_change_soft_ring_fanout(dld_str_t *, int); +static void proto_capability_advertise(dld_str_t *, mblk_t *); +static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *); #define DL_ACK_PENDING(state) \ ((state) == DL_ATTACH_PENDING || \ @@ -79,70 +61,72 @@ static void proto_change_soft_ring_fanout(dld_str_t *, int); * by the above primitives. 
*/ void -dld_wput_proto_nondata(dld_str_t *dsp, mblk_t *mp) +dld_proto(dld_str_t *dsp, mblk_t *mp) { - union DL_primitives *udlp; t_uscalar_t prim; - ASSERT(MBLKL(mp) >= sizeof (t_uscalar_t)); - - udlp = (union DL_primitives *)mp->b_rptr; - prim = udlp->dl_primitive; + if (MBLKL(mp) < sizeof (t_uscalar_t)) { + freemsg(mp); + return; + } + prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; switch (prim) { case DL_INFO_REQ: - (void) proto_info_req(dsp, udlp, mp); + proto_info_req(dsp, mp); break; case DL_BIND_REQ: - (void) proto_bind_req(dsp, udlp, mp); + proto_bind_req(dsp, mp); break; case DL_UNBIND_REQ: - (void) proto_unbind_req(dsp, udlp, mp); + proto_unbind_req(dsp, mp); + break; + case DL_UNITDATA_REQ: + proto_unitdata_req(dsp, mp); break; case DL_UDQOS_REQ: - (void) proto_udqos_req(dsp, udlp, mp); + proto_udqos_req(dsp, mp); break; case DL_ATTACH_REQ: - (void) proto_attach_req(dsp, udlp, mp); + proto_attach_req(dsp, mp); break; case DL_DETACH_REQ: - (void) proto_detach_req(dsp, udlp, mp); + proto_detach_req(dsp, mp); break; case DL_ENABMULTI_REQ: - (void) proto_enabmulti_req(dsp, udlp, mp); + proto_enabmulti_req(dsp, mp); break; case DL_DISABMULTI_REQ: - (void) proto_disabmulti_req(dsp, udlp, mp); + proto_disabmulti_req(dsp, mp); break; case DL_PROMISCON_REQ: - (void) proto_promiscon_req(dsp, udlp, mp); + proto_promiscon_req(dsp, mp); break; case DL_PROMISCOFF_REQ: - (void) proto_promiscoff_req(dsp, udlp, mp); + proto_promiscoff_req(dsp, mp); break; case DL_PHYS_ADDR_REQ: - (void) proto_physaddr_req(dsp, udlp, mp); + proto_physaddr_req(dsp, mp); break; case DL_SET_PHYS_ADDR_REQ: - (void) proto_setphysaddr_req(dsp, udlp, mp); + proto_setphysaddr_req(dsp, mp); break; case DL_NOTIFY_REQ: - (void) proto_notify_req(dsp, udlp, mp); + proto_notify_req(dsp, mp); break; case DL_CAPABILITY_REQ: - (void) proto_capability_req(dsp, udlp, mp); + proto_capability_req(dsp, mp); break; case DL_PASSIVE_REQ: - (void) proto_passive_req(dsp, udlp, mp); + proto_passive_req(dsp, mp); break; default: - (void) proto_req(dsp, udlp, mp); + proto_req(dsp, mp); break; } } #define NEG(x) -(x) - typedef struct dl_info_ack_wrapper { dl_info_ack_t dl_info; uint8_t dl_addr[MAXMACADDRLEN + sizeof (uint16_t)]; @@ -154,9 +138,8 @@ typedef struct dl_info_ack_wrapper { /* * DL_INFO_REQ */ -/*ARGSUSED*/ -static boolean_t -proto_info_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_info_req(dld_str_t *dsp, mblk_t *mp) { dl_info_ack_wrapper_t *dlwp; dl_info_ack_t *dlp; @@ -176,9 +159,7 @@ proto_info_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) */ if ((mp = mexchange(q, mp, sizeof (dl_info_ack_wrapper_t), M_PCPROTO, 0)) == NULL) - return (B_FALSE); - - rw_enter(&dsp->ds_lock, RW_READER); + return; bzero(mp->b_rptr, sizeof (dl_info_ack_wrapper_t)); dlwp = (dl_info_ack_wrapper_t *)mp->b_rptr; @@ -307,7 +288,8 @@ proto_info_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) */ dlp->dl_addr_offset = (uintptr_t)addr - (uintptr_t)dlp; if (addr_length > 0) - bcopy(dsp->ds_curr_addr, addr, addr_length); + mac_unicast_primary_get(dsp->ds_mh, addr); + *(uint16_t *)(addr + addr_length) = dsp->ds_sap; } @@ -319,25 +301,20 @@ done: ASSERT(IMPLY(dlp->dl_brdcst_addr_offset != 0, dlp->dl_brdcst_addr_length != 0)); - rw_exit(&dsp->ds_lock); - qreply(q, mp); - return (B_TRUE); } /* * DL_ATTACH_REQ */ -static boolean_t -proto_attach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_attach_req(dld_str_t *dsp, mblk_t *mp) { - dl_attach_req_t *dlp = 
(dl_attach_req_t *)udlp; + dl_attach_req_t *dlp = (dl_attach_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; - rw_enter(&dsp->ds_lock, RW_WRITER); - if (MBLKL(mp) < sizeof (dl_attach_req_t) || dlp->dl_ppa < 0 || dsp->ds_style == DL_STYLE1) { dl_err = DL_BADPRIM; @@ -366,25 +343,22 @@ proto_attach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } ASSERT(dsp->ds_dlstate == DL_UNBOUND); - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_ATTACH_REQ); - return (B_TRUE); + return; + failed: - rw_exit(&dsp->ds_lock); dlerrorack(q, mp, DL_ATTACH_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } -/*ARGSUSED*/ -static boolean_t -proto_detach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +/* + * DL_DETACH_REQ + */ +static void +proto_detach_req(dld_str_t *dsp, mblk_t *mp) { queue_t *q = dsp->ds_wq; t_uscalar_t dl_err; - rw_enter(&dsp->ds_lock, RW_WRITER); - if (MBLKL(mp) < sizeof (dl_detach_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -400,37 +374,34 @@ proto_detach_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + ASSERT(dsp->ds_datathr_cnt == 0); dsp->ds_dlstate = DL_DETACH_PENDING; - dld_str_detach(dsp); - rw_exit(&dsp->ds_lock); + dld_str_detach(dsp); dlokack(dsp->ds_wq, mp, DL_DETACH_REQ); - return (B_TRUE); + return; + failed: - rw_exit(&dsp->ds_lock); dlerrorack(q, mp, DL_DETACH_REQ, dl_err, 0); - return (B_FALSE); } /* * DL_BIND_REQ */ -static boolean_t -proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_bind_req(dld_str_t *dsp, mblk_t *mp) { - dl_bind_req_t *dlp = (dl_bind_req_t *)udlp; + dl_bind_req_t *dlp = (dl_bind_req_t *)mp->b_rptr; int err = 0; uint8_t dlsap_addr[MAXMACADDRLEN + sizeof (uint16_t)]; uint_t dlsap_addr_length; t_uscalar_t dl_err; t_scalar_t sap; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; + void *mdip; + int32_t intr_cpu; - /* - * Because control message processing is serialized, we don't need - * to hold any locks to read any fields of dsp; we only need ds_lock - * to update the ds_dlstate, ds_sap and ds_passivestate fields. - */ if (MBLKL(mp) < sizeof (dl_bind_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -451,24 +422,26 @@ proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED && - !dls_active_set(dsp->ds_dc)) { + ((err = dls_active_set(dsp)) != 0)) { dl_err = DL_SYSERR; - err = EBUSY; - goto failed; + goto failed2; } + dsp->ds_dlstate = DL_BIND_PENDING; /* * Set the receive callback. */ - dls_rx_set(dsp->ds_dc, (dsp->ds_mode == DLD_RAW) ? + dls_rx_set(dsp, (dsp->ds_mode == DLD_RAW) ? dld_str_rx_raw : dld_str_rx_unitdata, dsp); /* * Bind the channel such that it can receive packets. */ sap = dlp->dl_sap; - err = dls_bind(dsp->ds_dc, sap); + err = dls_bind(dsp, sap); if (err != 0) { switch (err) { case EINVAL: @@ -480,17 +453,28 @@ proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) break; } + dsp->ds_dlstate = DL_UNBOUND; if (dsp->ds_passivestate == DLD_UNINITIALIZED) - dls_active_clear(dsp->ds_dc); - - goto failed; + dls_active_clear(dsp); + goto failed2; } + intr_cpu = mac_client_intr_cpu(dsp->ds_mch); + mdip = mac_get_devinfo(dsp->ds_mh); + mac_perim_exit(mph); + + /* + * We do this after we get out of the perim to avoid deadlocks + * etc. since part of mac_client_retarget_intr is to walk the + * device tree in order to find and retarget the interrupts. 
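+	 * Hence mac_client_set_intr_cpu() is called below only after
+	 * mac_perim_exit().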
+ */ + mac_client_set_intr_cpu(mdip, dsp->ds_mch, intr_cpu); + /* * Copy in MAC address. */ dlsap_addr_length = dsp->ds_mip->mi_addr_length; - bcopy(dsp->ds_curr_addr, dlsap_addr, dlsap_addr_length); + mac_unicast_primary_get(dsp->ds_mh, dlsap_addr); /* * Copy in the SAP. @@ -498,37 +482,28 @@ proto_bind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) *(uint16_t *)(dlsap_addr + dlsap_addr_length) = sap; dlsap_addr_length += sizeof (uint16_t); - rw_enter(&dsp->ds_lock, RW_WRITER); - dsp->ds_dlstate = DL_IDLE; if (dsp->ds_passivestate == DLD_UNINITIALIZED) dsp->ds_passivestate = DLD_ACTIVE; - dsp->ds_sap = sap; - - if (dsp->ds_mode == DLD_FASTPATH) - dsp->ds_tx = str_mdata_fastpath_put; - else if (dsp->ds_mode == DLD_RAW) - dsp->ds_tx = str_mdata_raw_put; - dsp->ds_unitdata_tx = dld_wput_proto_data; - - rw_exit(&dsp->ds_lock); dlbindack(q, mp, sap, dlsap_addr, dlsap_addr_length, 0, 0); - return (B_TRUE); + return; + +failed2: + mac_perim_exit(mph); failed: dlerrorack(q, mp, DL_BIND_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_UNBIND_REQ */ -/*ARGSUSED*/ -static boolean_t -proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_unbind_req(dld_str_t *dsp, mblk_t *mp) { queue_t *q = dsp->ds_wq; t_uscalar_t dl_err; + mac_perim_handle_t mph; if (MBLKL(mp) < sizeof (dl_unbind_req_t)) { dl_err = DL_BADPRIM; @@ -540,32 +515,27 @@ proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } - /* - * Flush any remaining packets scheduled for transmission. - */ - dld_tx_flush(dsp); + mutex_enter(&dsp->ds_lock); + while (dsp->ds_datathr_cnt != 0) + cv_wait(&dsp->ds_datathr_cv, &dsp->ds_lock); - /* - * Unbind the channel to stop packets being received. - */ - dls_unbind(dsp->ds_dc); + dsp->ds_dlstate = DL_UNBIND_PENDING; + mutex_exit(&dsp->ds_lock); + mac_perim_enter_by_mh(dsp->ds_mh, &mph); /* - * Clear the receive callback. + * Unbind the channel to stop packets being received. */ - dls_rx_set(dsp->ds_dc, NULL, NULL); - - rw_enter(&dsp->ds_lock, RW_WRITER); + if (dls_unbind(dsp) != 0) { + dl_err = DL_OUTSTATE; + mac_perim_exit(mph); + goto failed; + } /* * Disable polling mode, if it is enabled. */ - proto_poll_disable(dsp); - - /* - * If soft rings were enabled, the workers should be quiesced. - */ - dls_soft_ring_disable(dsp->ds_dc); + (void) dld_capab_poll_disable(dsp, NULL); /* * Clear LSO flags. @@ -574,38 +544,37 @@ proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) dsp->ds_lso_max = 0; /* + * Clear the receive callback. + */ + dls_rx_set(dsp, NULL, NULL); + dsp->ds_direct = B_FALSE; + + /* * Set the mode back to the default (unitdata). 
*/ dsp->ds_mode = DLD_UNITDATA; dsp->ds_dlstate = DL_UNBOUND; - DLD_TX_QUIESCE(dsp); - rw_exit(&dsp->ds_lock); - - dlokack(q, mp, DL_UNBIND_REQ); - return (B_TRUE); + mac_perim_exit(mph); + dlokack(dsp->ds_wq, mp, DL_UNBIND_REQ); + return; failed: dlerrorack(q, mp, DL_UNBIND_REQ, dl_err, 0); - return (B_FALSE); } /* * DL_PROMISCON_REQ */ -static boolean_t -proto_promiscon_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_promiscon_req(dld_str_t *dsp, mblk_t *mp) { - dl_promiscon_req_t *dlp = (dl_promiscon_req_t *)udlp; + dl_promiscon_req_t *dlp = (dl_promiscon_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; - uint32_t promisc; + uint32_t promisc_saved; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control message processing is serialized, we don't need - * to hold any locks to read any fields of dsp; we only need ds_lock - * to update the ds_promisc and ds_passivestate fields. - */ if (MBLKL(mp) < sizeof (dl_promiscon_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -617,70 +586,73 @@ proto_promiscon_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + promisc_saved = dsp->ds_promisc; switch (dlp->dl_level) { case DL_PROMISC_SAP: - promisc = DLS_PROMISC_SAP; + dsp->ds_promisc |= DLS_PROMISC_SAP; break; + case DL_PROMISC_MULTI: - promisc = DLS_PROMISC_MULTI; + dsp->ds_promisc |= DLS_PROMISC_MULTI; break; + case DL_PROMISC_PHYS: - promisc = DLS_PROMISC_PHYS; + dsp->ds_promisc |= DLS_PROMISC_PHYS; break; + default: dl_err = DL_NOTSUPPORTED; goto failed; } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED && - !dls_active_set(dsp->ds_dc)) { + ((err = dls_active_set(dsp)) != 0)) { + dsp->ds_promisc = promisc_saved; dl_err = DL_SYSERR; - err = EBUSY; - goto failed; + goto failed2; } /* * Adjust channel promiscuity. */ - promisc = (dsp->ds_promisc | promisc); - err = dls_promisc(dsp->ds_dc, promisc); + err = dls_promisc(dsp, promisc_saved); + if (err != 0) { dl_err = DL_SYSERR; + dsp->ds_promisc = promisc_saved; if (dsp->ds_passivestate == DLD_UNINITIALIZED) - dls_active_clear(dsp->ds_dc); - goto failed; + dls_active_clear(dsp); + goto failed2; } - rw_enter(&dsp->ds_lock, RW_WRITER); + mac_perim_exit(mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED) dsp->ds_passivestate = DLD_ACTIVE; - dsp->ds_promisc = promisc; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_PROMISCON_REQ); - return (B_TRUE); + return; + +failed2: + mac_perim_exit(mph); failed: dlerrorack(q, mp, DL_PROMISCON_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_PROMISCOFF_REQ */ -static boolean_t -proto_promiscoff_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp) { - dl_promiscoff_req_t *dlp = (dl_promiscoff_req_t *)udlp; + dl_promiscoff_req_t *dlp = (dl_promiscoff_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; - uint32_t promisc; + uint32_t promisc_saved; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control messages processing is serialized, we don't need - * to hold any lock to read any field of dsp; we hold ds_lock to - * update the ds_promisc field. 
- */ if (MBLKL(mp) < sizeof (dl_promiscoff_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -692,60 +664,66 @@ proto_promiscoff_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + promisc_saved = dsp->ds_promisc; switch (dlp->dl_level) { case DL_PROMISC_SAP: - promisc = DLS_PROMISC_SAP; + if (!(dsp->ds_promisc & DLS_PROMISC_SAP)) { + dl_err = DL_NOTENAB; + goto failed; + } + dsp->ds_promisc &= ~DLS_PROMISC_SAP; break; + case DL_PROMISC_MULTI: - promisc = DLS_PROMISC_MULTI; + if (!(dsp->ds_promisc & DLS_PROMISC_MULTI)) { + dl_err = DL_NOTENAB; + goto failed; + } + dsp->ds_promisc &= ~DLS_PROMISC_MULTI; break; + case DL_PROMISC_PHYS: - promisc = DLS_PROMISC_PHYS; + if (!(dsp->ds_promisc & DLS_PROMISC_PHYS)) { + dl_err = DL_NOTENAB; + goto failed; + } + dsp->ds_promisc &= ~DLS_PROMISC_PHYS; break; + default: dl_err = DL_NOTSUPPORTED; goto failed; } - if (!(dsp->ds_promisc & promisc)) { - dl_err = DL_NOTENAB; - goto failed; - } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + /* + * Adjust channel promiscuity. + */ + err = dls_promisc(dsp, promisc_saved); + mac_perim_exit(mph); - promisc = (dsp->ds_promisc & ~promisc); - err = dls_promisc(dsp->ds_dc, promisc); if (err != 0) { dl_err = DL_SYSERR; goto failed; } - - rw_enter(&dsp->ds_lock, RW_WRITER); - dsp->ds_promisc = promisc; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_PROMISCOFF_REQ); - return (B_TRUE); + return; failed: dlerrorack(q, mp, DL_PROMISCOFF_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_ENABMULTI_REQ */ -static boolean_t -proto_enabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_enabmulti_req(dld_str_t *dsp, mblk_t *mp) { - dl_enabmulti_req_t *dlp = (dl_enabmulti_req_t *)udlp; + dl_enabmulti_req_t *dlp = (dl_enabmulti_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control messages processing is serialized, we don't need - * to hold any lock to read any field of dsp; we hold ds_lock to - * update the ds_passivestate field. 
- */ if (dsp->ds_dlstate == DL_UNATTACHED || DL_ACK_PENDING(dsp->ds_dlstate)) { dl_err = DL_OUTSTATE; @@ -759,14 +737,16 @@ proto_enabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED && - !dls_active_set(dsp->ds_dc)) { + ((err = dls_active_set(dsp)) != 0)) { dl_err = DL_SYSERR; - err = EBUSY; - goto failed; + goto failed2; } - err = dls_multicst_add(dsp->ds_dc, mp->b_rptr + dlp->dl_addr_offset); + err = dls_multicst_add(dsp, mp->b_rptr + dlp->dl_addr_offset); + if (err != 0) { switch (err) { case EINVAL: @@ -781,40 +761,37 @@ proto_enabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) dl_err = DL_SYSERR; break; } - if (dsp->ds_passivestate == DLD_UNINITIALIZED) - dls_active_clear(dsp->ds_dc); + dls_active_clear(dsp); - goto failed; + goto failed2; } - rw_enter(&dsp->ds_lock, RW_WRITER); + mac_perim_exit(mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED) dsp->ds_passivestate = DLD_ACTIVE; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_ENABMULTI_REQ); - return (B_TRUE); + return; + +failed2: + mac_perim_exit(mph); failed: dlerrorack(q, mp, DL_ENABMULTI_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_DISABMULTI_REQ */ -static boolean_t -proto_disabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_disabmulti_req(dld_str_t *dsp, mblk_t *mp) { - dl_disabmulti_req_t *dlp = (dl_disabmulti_req_t *)udlp; + dl_disabmulti_req_t *dlp = (dl_disabmulti_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control messages processing is serialized, we don't need - * to hold any lock to read any field of dsp. - */ if (dsp->ds_dlstate == DL_UNATTACHED || DL_ACK_PENDING(dsp->ds_dlstate)) { dl_err = DL_OUTSTATE; @@ -828,45 +805,46 @@ proto_disabmulti_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } - err = dls_multicst_remove(dsp->ds_dc, mp->b_rptr + dlp->dl_addr_offset); + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + err = dls_multicst_remove(dsp, mp->b_rptr + dlp->dl_addr_offset); + mac_perim_exit(mph); + if (err != 0) { - switch (err) { + switch (err) { case EINVAL: dl_err = DL_BADADDR; err = 0; break; + case ENOENT: dl_err = DL_NOTENAB; err = 0; break; + default: dl_err = DL_SYSERR; break; } goto failed; } - dlokack(q, mp, DL_DISABMULTI_REQ); - return (B_TRUE); + return; failed: dlerrorack(q, mp, DL_DISABMULTI_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_PHYS_ADDR_REQ */ -static boolean_t -proto_physaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_physaddr_req(dld_str_t *dsp, mblk_t *mp) { - dl_phys_addr_req_t *dlp = (dl_phys_addr_req_t *)udlp; + dl_phys_addr_req_t *dlp = (dl_phys_addr_req_t *)mp->b_rptr; queue_t *q = dsp->ds_wq; t_uscalar_t dl_err; char *addr; uint_t addr_length; - rw_enter(&dsp->ds_lock, RW_READER); - if (MBLKL(mp) < sizeof (dl_phys_addr_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -886,50 +864,34 @@ proto_physaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) addr_length = dsp->ds_mip->mi_addr_length; if (addr_length > 0) { - addr = kmem_alloc(addr_length, KM_NOSLEEP); - if (addr == NULL) { - rw_exit(&dsp->ds_lock); - merror(q, mp, ENOSR); - return (B_FALSE); - } - - /* - * Copy out the address before we drop the lock; we don't - * want to call dlphysaddrack() while holding ds_lock. - */ - bcopy((dlp->dl_addr_type == DL_CURR_PHYS_ADDR) ? 
- dsp->ds_curr_addr : dsp->ds_fact_addr, addr, addr_length); + addr = kmem_alloc(addr_length, KM_SLEEP); + if (dlp->dl_addr_type == DL_CURR_PHYS_ADDR) + mac_unicast_primary_get(dsp->ds_mh, (uint8_t *)addr); + else + bcopy(dsp->ds_mip->mi_unicst_addr, addr, addr_length); - rw_exit(&dsp->ds_lock); dlphysaddrack(q, mp, addr, (t_uscalar_t)addr_length); kmem_free(addr, addr_length); } else { - rw_exit(&dsp->ds_lock); dlphysaddrack(q, mp, NULL, 0); } - return (B_TRUE); + return; failed: - rw_exit(&dsp->ds_lock); dlerrorack(q, mp, DL_PHYS_ADDR_REQ, dl_err, 0); - return (B_FALSE); } /* * DL_SET_PHYS_ADDR_REQ */ -static boolean_t -proto_setphysaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_setphysaddr_req(dld_str_t *dsp, mblk_t *mp) { - dl_set_phys_addr_req_t *dlp = (dl_set_phys_addr_req_t *)udlp; + dl_set_phys_addr_req_t *dlp = (dl_set_phys_addr_req_t *)mp->b_rptr; int err = 0; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; - /* - * Because control message processing is serialized, we don't need - * to hold any locks to read any fields of dsp; we only need ds_lock - * to update the ds_passivestate field. - */ if (dsp->ds_dlstate == DL_UNATTACHED || DL_ACK_PENDING(dsp->ds_dlstate)) { dl_err = DL_OUTSTATE; @@ -943,14 +905,16 @@ proto_setphysaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED && - !dls_active_set(dsp->ds_dc)) { + ((err = dls_active_set(dsp)) != 0)) { dl_err = DL_SYSERR; - err = EBUSY; - goto failed; + goto failed2; } - err = mac_unicst_set(dsp->ds_mh, mp->b_rptr + dlp->dl_addr_offset); + err = mac_unicast_primary_set(dsp->ds_mh, + mp->b_rptr + dlp->dl_addr_offset); if (err != 0) { switch (err) { case EINVAL: @@ -962,32 +926,33 @@ proto_setphysaddr_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) dl_err = DL_SYSERR; break; } - if (dsp->ds_passivestate == DLD_UNINITIALIZED) - dls_active_clear(dsp->ds_dc); + dls_active_clear(dsp); + + goto failed2; - goto failed; } - rw_enter(&dsp->ds_lock, RW_WRITER); + mac_perim_exit(mph); + if (dsp->ds_passivestate == DLD_UNINITIALIZED) dsp->ds_passivestate = DLD_ACTIVE; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_SET_PHYS_ADDR_REQ); - return (B_TRUE); + return; + +failed2: + mac_perim_exit(mph); failed: dlerrorack(q, mp, DL_SET_PHYS_ADDR_REQ, dl_err, (t_uscalar_t)err); - return (B_FALSE); } /* * DL_UDQOS_REQ */ -static boolean_t -proto_udqos_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_udqos_req(dld_str_t *dsp, mblk_t *mp) { - dl_udqos_req_t *dlp = (dl_udqos_req_t *)udlp; + dl_udqos_req_t *dlp = (dl_udqos_req_t *)mp->b_rptr; dl_qos_cl_sel1_t *selp; int off, len; t_uscalar_t dl_err; @@ -1013,21 +978,11 @@ proto_udqos_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) goto failed; } - if (dsp->ds_dlstate == DL_UNATTACHED || - DL_ACK_PENDING(dsp->ds_dlstate)) { - dl_err = DL_OUTSTATE; - goto failed; - } - - rw_enter(&dsp->ds_lock, RW_WRITER); dsp->ds_pri = selp->dl_priority; - rw_exit(&dsp->ds_lock); - dlokack(q, mp, DL_UDQOS_REQ); - return (B_TRUE); + return; failed: dlerrorack(q, mp, DL_UDQOS_REQ, dl_err, 0); - return (B_FALSE); } static boolean_t @@ -1047,19 +1002,16 @@ check_ip_above(queue_t *q) /* * DL_CAPABILITY_REQ */ -/*ARGSUSED*/ -static boolean_t -proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_capability_req(dld_str_t *dsp, mblk_t *mp) { - dl_capability_req_t *dlp = 
(dl_capability_req_t *)udlp; + dl_capability_req_t *dlp = (dl_capability_req_t *)mp->b_rptr; dl_capability_sub_t *sp; size_t size, len; offset_t off, end; t_uscalar_t dl_err; queue_t *q = dsp->ds_wq; - rw_enter(&dsp->ds_lock, RW_WRITER); - if (MBLKL(mp) < sizeof (dl_capability_req_t)) { dl_err = DL_BADPRIM; goto failed; @@ -1077,8 +1029,8 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) * support. Otherwise we enable the set of capabilities requested. */ if (dlp->dl_sub_length == 0) { - /* callee drops lock */ - return (proto_capability_advertise(dsp, mp)); + proto_capability_advertise(dsp, mp); + return; } if (!MBLKIN(mp, dlp->dl_sub_offset, dlp->dl_sub_length)) { @@ -1122,137 +1074,37 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) break; } - /* - * Large segment offload. (LSO) - */ - case DL_CAPAB_LSO: { - dl_capab_lso_t *lsop; - dl_capab_lso_t lso; - - lsop = (dl_capab_lso_t *)&sp[1]; - /* - * Copy for alignment. - */ - bcopy(lsop, &lso, sizeof (dl_capab_lso_t)); - dlcapabsetqid(&(lso.lso_mid), dsp->ds_rq); - bcopy(&lso, lsop, sizeof (dl_capab_lso_t)); - break; - } - - /* - * IP polling interface. - */ - case DL_CAPAB_POLL: { - dl_capab_dls_t *pollp; - dl_capab_dls_t poll; - - pollp = (dl_capab_dls_t *)&sp[1]; - /* - * Copy for alignment. - */ - bcopy(pollp, &poll, sizeof (dl_capab_dls_t)); - - switch (poll.dls_flags) { - default: - /*FALLTHRU*/ - case POLL_DISABLE: - proto_poll_disable(dsp); - break; - - case POLL_ENABLE: - ASSERT(!(dld_opt & DLD_OPT_NO_POLL)); - - /* - * Make sure polling is disabled. - */ - proto_poll_disable(dsp); - - /* - * Note that only IP should enable POLL. - */ - if (check_ip_above(dsp->ds_rq) && - proto_poll_enable(dsp, &poll)) { - bzero(&poll, sizeof (dl_capab_dls_t)); - poll.dls_flags = POLL_ENABLE; - } else { - bzero(&poll, sizeof (dl_capab_dls_t)); - poll.dls_flags = POLL_DISABLE; - } - break; - } - - dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq); - bcopy(&poll, pollp, sizeof (dl_capab_dls_t)); - break; - } - case DL_CAPAB_SOFT_RING: { - dl_capab_dls_t *soft_ringp; - dl_capab_dls_t soft_ring; + case DL_CAPAB_DLD: { + dl_capab_dld_t *dldp; + dl_capab_dld_t dld; - soft_ringp = (dl_capab_dls_t *)&sp[1]; + dldp = (dl_capab_dld_t *)&sp[1]; /* * Copy for alignment. */ - bcopy(soft_ringp, &soft_ring, - sizeof (dl_capab_dls_t)); - - switch (soft_ring.dls_flags) { - default: - /*FALLTHRU*/ - case SOFT_RING_DISABLE: - proto_soft_ring_disable(dsp); - break; - - case SOFT_RING_ENABLE: - ASSERT(!(dld_opt & DLD_OPT_NO_SOFTRING)); - /* - * Make sure soft_ring is disabled. - */ - proto_soft_ring_disable(dsp); - - /* - * Note that only IP can enable soft ring. 
-			 */
-			if (check_ip_above(dsp->ds_rq) &&
-			    proto_soft_ring_enable(dsp, &soft_ring)) {
-				bzero(&soft_ring,
-				    sizeof (dl_capab_dls_t));
-				soft_ring.dls_flags = SOFT_RING_ENABLE;
-			} else {
-				bzero(&soft_ring,
-				    sizeof (dl_capab_dls_t));
-				soft_ring.dls_flags = SOFT_RING_DISABLE;
-			}
-			break;
-		}
-
-		dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq);
-		bcopy(&soft_ring, soft_ringp,
-		    sizeof (dl_capab_dls_t));
+		bcopy(dldp, &dld, sizeof (dl_capab_dld_t));
+		dlcapabsetqid(&(dld.dld_mid), dsp->ds_rq);
+		bcopy(&dld, dldp, sizeof (dl_capab_dld_t));
 		break;
 	}
 
 	default:
 		break;
 	}
-
 		off += size;
 	}
-	rw_exit(&dsp->ds_lock);
 	qreply(q, mp);
-	return (B_TRUE);
+	return;
 failed:
-	rw_exit(&dsp->ds_lock);
 	dlerrorack(q, mp, DL_CAPABILITY_REQ, dl_err, 0);
-	return (B_FALSE);
 }
 
 /*
  * DL_NOTIFY_REQ
  */
-static boolean_t
-proto_notify_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
+static void
+proto_notify_req(dld_str_t *dsp, mblk_t *mp)
 {
-	dl_notify_req_t *dlp = (dl_notify_req_t *)udlp;
+	dl_notify_req_t *dlp = (dl_notify_req_t *)mp->b_rptr;
 	t_uscalar_t dl_err;
 	queue_t *q = dsp->ds_wq;
 	uint_t note =
@@ -1264,8 +1116,6 @@
 	    DL_NOTE_CAPAB_RENEG |
 	    DL_NOTE_SPEED;
 
-	rw_enter(&dsp->ds_lock, RW_WRITER);
-
 	if (MBLKL(mp) < sizeof (dl_notify_req_t)) {
 		dl_err = DL_BADPRIM;
 		goto failed;
@@ -1283,7 +1133,6 @@
 	 * Cache the notifications that are being enabled.
 	 */
 	dsp->ds_notifications = dlp->dl_notifications & note;
-	rw_exit(&dsp->ds_lock);
 	/*
 	 * The ACK carries all notifications regardless of which set is
 	 * being enabled.
@@ -1291,27 +1140,21 @@
 	dlnotifyack(q, mp, note);
 
 	/*
-	 * Solicit DL_NOTIFY_IND messages for each enabled notification.
+	 * Generate DL_NOTIFY_IND messages for each enabled notification.
 	 */
-	rw_enter(&dsp->ds_lock, RW_READER);
 	if (dsp->ds_notifications != 0) {
-		rw_exit(&dsp->ds_lock);
 		dld_str_notify_ind(dsp);
-	} else {
-		rw_exit(&dsp->ds_lock);
 	}
-	return (B_TRUE);
+	return;
 failed:
-	rw_exit(&dsp->ds_lock);
 	dlerrorack(q, mp, DL_NOTIFY_REQ, dl_err, 0);
-	return (B_FALSE);
 }
 
 /*
  * DL_UNITDATA_REQ
  */
 void
-dld_wput_proto_data(dld_str_t *dsp, mblk_t *mp)
+proto_unitdata_req(dld_str_t *dsp, mblk_t *mp)
 {
 	queue_t *q = dsp->ds_wq;
 	dl_unitdata_req_t *dlp = (dl_unitdata_req_t *)mp->b_rptr;
@@ -1326,10 +1169,19 @@
 	uint_t max_sdu;
 
 	if (MBLKL(mp) < sizeof (dl_unitdata_req_t) || mp->b_cont == NULL) {
-		dl_err = DL_BADPRIM;
-		goto failed;
+		dlerrorack(q, mp, DL_UNITDATA_REQ, DL_BADPRIM, 0);
+		return;
 	}
 
+	mutex_enter(&dsp->ds_lock);
+	if (dsp->ds_dlstate != DL_IDLE) {
+		mutex_exit(&dsp->ds_lock);
+		dlerrorack(q, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0);
+		return;
+	}
+	DLD_DATATHR_INC(dsp);
+	mutex_exit(&dsp->ds_lock);
+
 	addr_length = dsp->ds_mip->mi_addr_length;
 
 	off = dlp->dl_dest_addr_offset;
@@ -1367,7 +1219,7 @@
 	/*
 	 * Build a packet header.
*/ - if ((bp = dls_header(dsp->ds_dc, addr, sap, dlp->dl_priority.dl_max, + if ((bp = dls_header(dsp, addr, sap, dlp->dl_priority.dl_max, &payload)) == NULL) { dl_err = DL_BADADDR; goto failed; @@ -1390,32 +1242,37 @@ dld_wput_proto_data(dld_str_t *dsp, mblk_t *mp) */ ASSERT(bp->b_cont == NULL); bp->b_cont = payload; - dld_tx_single(dsp, bp); + + /* + * No lock can be held across modules and putnext()'s, + * which can happen here with the call from DLD_TX(). + */ + if (DLD_TX(dsp, bp, 0, 0) != NULL) { + /* flow-controlled */ + DLD_SETQFULL(dsp); + } + DLD_DATATHR_DCR(dsp); return; + failed: dlerrorack(q, mp, DL_UNITDATA_REQ, dl_err, 0); + DLD_DATATHR_DCR(dsp); return; baddata: dluderrorind(q, mp, (void *)addr, len, DL_BADDATA, 0); + DLD_DATATHR_DCR(dsp); } /* * DL_PASSIVE_REQ */ -/* ARGSUSED */ -static boolean_t -proto_passive_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) +static void +proto_passive_req(dld_str_t *dsp, mblk_t *mp) { t_uscalar_t dl_err; /* - * READER lock is enough because ds_passivestate can only be changed - * as the result of non-data message processing. - */ - rw_enter(&dsp->ds_lock, RW_READER); - - /* * If we've already become active by issuing an active primitive, * then it's too late to try to become passive. */ @@ -1430,209 +1287,281 @@ proto_passive_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) } dsp->ds_passivestate = DLD_PASSIVE; - rw_exit(&dsp->ds_lock); dlokack(dsp->ds_wq, mp, DL_PASSIVE_REQ); - return (B_TRUE); + return; failed: - rw_exit(&dsp->ds_lock); dlerrorack(dsp->ds_wq, mp, DL_PASSIVE_REQ, dl_err, 0); - return (B_FALSE); } + /* * Catch-all handler. */ -static boolean_t -proto_req(dld_str_t *dsp, union DL_primitives *dlp, mblk_t *mp) +static void +proto_req(dld_str_t *dsp, mblk_t *mp) { + union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr; + dlerrorack(dsp->ds_wq, mp, dlp->dl_primitive, DL_UNSUPPORTED, 0); - return (B_FALSE); } -static void -proto_poll_disable(dld_str_t *dsp) +static int +dld_capab_perim(dld_str_t *dsp, void *data, uint_t flags) { - mac_handle_t mh; + switch (flags) { + case DLD_ENABLE: + mac_perim_enter_by_mh(dsp->ds_mh, (mac_perim_handle_t *)data); + return (0); - ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); + case DLD_DISABLE: + mac_perim_exit((mac_perim_handle_t)data); + return (0); - if (!dsp->ds_polling) - return; + case DLD_QUERY: + return (mac_perim_held(dsp->ds_mh)); + } + return (0); +} - /* - * It should be impossible to enable raw mode if polling is turned on. - */ - ASSERT(dsp->ds_mode != DLD_RAW); +static int +dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) +{ + dld_capab_direct_t *direct = data; - /* - * Reset the resource_add callback. - */ - mh = dls_mac(dsp->ds_dc); - mac_resource_set(mh, NULL, NULL); - mac_resources(mh); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); - /* - * Set receive function back to default. - */ - dls_rx_set(dsp->ds_dc, (dsp->ds_mode == DLD_FASTPATH) ? - dld_str_rx_fastpath : dld_str_rx_unitdata, dsp); + switch (flags) { + case DLD_ENABLE: + dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf, + direct->di_rx_ch); + /* + * TODO: XXXGopi + * + * Direct pointer to functions in the MAC layer + * should be passed here: + * + * 1) pass mac_tx() and mac_client_handle instead + * of str_mdata_fastpath_put() and dld_str_t. But + * not done presently because of some VLAN + * processing stuff in str_mdata_fastpath_put(). + * + * 2) pass a MAC layer callback instead of + * dld_flow_ctl_callb(). 
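+		 *
+		 * Until then, str_mdata_fastpath_put() and the dld_str_t
+		 * itself are handed out directly below.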
+ */
+ direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put;
+ direct->di_tx_dh = dsp;
- /*
- * Note that polling is disabled.
- */
- dsp->ds_polling = B_FALSE;
+ direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify;
+ direct->di_tx_cb_dh = dsp->ds_mch;
+ dsp->ds_direct = B_TRUE;
+
+ return (0);
+
+ case DLD_DISABLE:
+ dls_rx_set(dsp, (dsp->ds_mode == DLD_FASTPATH) ?
+ dld_str_rx_fastpath : dld_str_rx_unitdata, (void *)dsp);
+ dsp->ds_direct = B_FALSE;
+
+ return (0);
+ }
+ return (ENOTSUP);
}
-static boolean_t
-proto_poll_enable(dld_str_t *dsp, dl_capab_dls_t *pollp)
+/*
+ * dld_capab_poll_enable()
+ *
+ * This function is misnamed. All polling and fanouts are run out of the
+ * lower mac (in case of VNIC and the only mac in case of NICs). The
+ * availability of Rx ring and promiscuous mode is all taken care of
+ * between the soft ring set (mac_srs), the Rx ring, and S/W classifier.
+ * Any fanout necessary is done by the soft rings that are part of the
+ * mac_srs (by default mac_srs sends the packets up via a TCP and
+ * non-TCP soft ring).
+ *
+ * The mac_srs (or its associated soft rings) always store the ill_rx_ring
+ * (the cookie returned when they registered with IP during plumb) as their
+ * 2nd argument which is passed up as mac_resource_handle_t. The upcall
+ * function and 1st argument is what the caller registered when they
+ * called mac_rx_classify_flow_add() to register the flow. For VNIC,
+ * the function is vnic_rx and the argument is vnic_t. For the regular
+ * NIC case, it is mac_rx_default and mac_handle_t. As explained above, the
+ * mac_srs (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t)
+ * from its stored 2nd argument.
+ */
+static int
+dld_capab_poll_enable(dld_str_t *dsp, dld_capab_poll_t *poll)
{
- mac_handle_t mh;
+ if (dsp->ds_polling)
+ return (EINVAL);
- ASSERT(RW_WRITE_HELD(&dsp->ds_lock));
- ASSERT(!dsp->ds_polling);
+ if ((dld_opt & DLD_OPT_NO_POLL) != 0 || dsp->ds_mode == DLD_RAW)
+ return (ENOTSUP);
 /*
- * We cannot enable polling if raw mode
- * has been enabled.
+ * Enable client polling if and only if DLS bypass is possible.
+ * Special cases like VLANs need DLS processing in the Rx data path.
+ * In such a case we can neither allow the client (IP) to directly
+ * poll the softring (since DLS processing hasn't been done) nor can
+ * we allow DLS bypass.
 */
- if (dsp->ds_mode == DLD_RAW)
- return (B_FALSE);
-
- mh = dls_mac(dsp->ds_dc);
+ if (!mac_rx_bypass_set(dsp->ds_mch, dsp->ds_rx, dsp->ds_rx_arg))
+ return (ENOTSUP);
 /*
- * Register resources.
+ * Register soft ring resources. This will come in handy later if
+ * the user decides to modify CPU bindings to use more CPUs for the
+ * device, in which case we will switch to fanout using soft rings.
 */
- mac_resource_set(mh, (mac_resource_add_t)pollp->dls_ring_add,
- (void *)pollp->dls_rx_handle);
-
- mac_resources(mh);
+ mac_resource_set_common(dsp->ds_mch,
+ (mac_resource_add_t)poll->poll_ring_add_cf,
+ (mac_resource_remove_t)poll->poll_ring_remove_cf,
+ (mac_resource_quiesce_t)poll->poll_ring_quiesce_cf,
+ (mac_resource_restart_t)poll->poll_ring_restart_cf,
+ (mac_resource_bind_t)poll->poll_ring_bind_cf,
+ poll->poll_ring_ch);
- /*
- * Set the upstream receive function.
- */
- dls_rx_set(dsp->ds_dc, (dls_rx_t)pollp->dls_rx,
- (void *)pollp->dls_rx_handle);
+ mac_client_poll_enable(dsp->ds_mch);
- /*
- * Note that polling is enabled. This prevents further DLIOCHDRINFO
- * ioctls from overwriting the receive function pointer. 
- */ dsp->ds_polling = B_TRUE; - return (B_TRUE); + return (0); } -static void -proto_soft_ring_disable(dld_str_t *dsp) +/* ARGSUSED */ +static int +dld_capab_poll_disable(dld_str_t *dsp, dld_capab_poll_t *poll) { - ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); + if (!dsp->ds_polling) + return (EINVAL); - if (!dsp->ds_soft_ring) - return; + mac_client_poll_disable(dsp->ds_mch); + mac_resource_set(dsp->ds_mch, NULL, NULL); - /* - * It should be impossible to enable raw mode if soft_ring is turned on. - */ - ASSERT(dsp->ds_mode != DLD_RAW); - proto_change_soft_ring_fanout(dsp, SOFT_RING_NONE); - /* - * Note that fanout is disabled. - */ - dsp->ds_soft_ring = B_FALSE; + dsp->ds_polling = B_FALSE; + return (0); } -static boolean_t -proto_soft_ring_enable(dld_str_t *dsp, dl_capab_dls_t *soft_ringp) +static int +dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) { - ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); - ASSERT(!dsp->ds_soft_ring); + dld_capab_poll_t *poll = data; - /* - * We cannot enable soft_ring if raw mode - * has been enabled. - */ - if (dsp->ds_mode == DLD_RAW) - return (B_FALSE); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); - if (dls_soft_ring_enable(dsp->ds_dc, soft_ringp) == B_FALSE) - return (B_FALSE); + switch (flags) { + case DLD_ENABLE: + return (dld_capab_poll_enable(dsp, poll)); + case DLD_DISABLE: + return (dld_capab_poll_disable(dsp, poll)); + } + return (ENOTSUP); +} - dsp->ds_soft_ring = B_TRUE; - return (B_TRUE); +static int +dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags) +{ + dld_capab_lso_t *lso = data; + + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + switch (flags) { + case DLD_ENABLE: { + mac_capab_lso_t mac_lso; + + /* + * Check if LSO is supported on this MAC & enable LSO + * accordingly. + */ + if (mac_capab_get(dsp->ds_mh, MAC_CAPAB_LSO, &mac_lso)) { + lso->lso_max = mac_lso.lso_basic_tcp_ipv4.lso_max; + lso->lso_flags = 0; + /* translate the flag for mac clients */ + if ((mac_lso.lso_flags & LSO_TX_BASIC_TCP_IPV4) != 0) + lso->lso_flags |= DLD_LSO_TX_BASIC_TCP_IPV4; + dsp->ds_lso = B_TRUE; + dsp->ds_lso_max = lso->lso_max; + } else { + dsp->ds_lso = B_FALSE; + dsp->ds_lso_max = 0; + return (ENOTSUP); + } + return (0); + } + case DLD_DISABLE: { + dsp->ds_lso = B_FALSE; + dsp->ds_lso_max = 0; + return (0); + } + } + return (ENOTSUP); } -static void -proto_change_soft_ring_fanout(dld_str_t *dsp, int type) +static int +dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) { - dls_channel_t dc = dsp->ds_dc; + int err; - if (type == SOFT_RING_NONE) { - dls_rx_set(dc, (dsp->ds_mode == DLD_FASTPATH) ? - dld_str_rx_fastpath : dld_str_rx_unitdata, dsp); - } else if (type != SOFT_RING_NONE) { - dls_rx_set(dc, (dls_rx_t)dls_soft_ring_fanout, dc); + /* + * Don't enable direct callback capabilities unless the caller is + * the IP client. When a module is inserted in a stream (_I_INSERT) + * the stack initiates capability disable, but due to races, the + * module insertion may complete before the capability disable + * completes. So we limit the check to DLD_ENABLE case. 
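 *
 * A hypothetical calling sequence, for illustration only (the capab
 * function pointer and its handle come from the dl_capab_dld_t that is
 * advertised further below; the variable names here are invented): an
 * IP client enabling the direct interface would bracket the request
 * with the perimeter capability, roughly:
 *
 *	mac_perim_handle_t mph;
 *
 *	(void) capab(handle, DLD_CAPAB_PERIM, &mph, DLD_ENABLE);
 *	err = capab(handle, DLD_CAPAB_DIRECT, &direct, DLD_ENABLE);
 *	(void) capab(handle, DLD_CAPAB_PERIM, (void *)mph, DLD_DISABLE);
 *
 * Note that DLD_CAPAB_PERIM is exempt from the IP-client check below,
 * which is what makes this bracketing possible.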
+ */ + if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) && + (dsp->ds_sap != ETHERTYPE_IP || !check_ip_above(dsp->ds_rq))) { + return (ENOTSUP); } + + switch (type) { + case DLD_CAPAB_DIRECT: + err = dld_capab_direct(dsp, data, flags); + break; + + case DLD_CAPAB_POLL: + err = dld_capab_poll(dsp, data, flags); + break; + + case DLD_CAPAB_PERIM: + err = dld_capab_perim(dsp, data, flags); + break; + + case DLD_CAPAB_LSO: + err = dld_capab_lso(dsp, data, flags); + break; + + default: + err = ENOTSUP; + break; + } + + return (err); } /* * DL_CAPABILITY_ACK/DL_ERROR_ACK */ -static boolean_t +static void proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) { dl_capability_ack_t *dlap; dl_capability_sub_t *dlsp; size_t subsize; - dl_capab_dls_t poll; - dl_capab_dls_t soft_ring; + dl_capab_dld_t dld; dl_capab_hcksum_t hcksum; - dl_capab_lso_t lso; dl_capab_zerocopy_t zcopy; uint8_t *ptr; queue_t *q = dsp->ds_wq; mblk_t *mp1; - boolean_t is_vlan = (dsp->ds_vid != VLAN_ID_NONE); - boolean_t poll_capable = B_FALSE; - boolean_t soft_ring_capable = B_FALSE; + boolean_t is_vlan; boolean_t hcksum_capable = B_FALSE; boolean_t zcopy_capable = B_FALSE; - boolean_t lso_capable = B_FALSE; - mac_capab_lso_t mac_lso; - - ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); + boolean_t dld_capable = B_FALSE; /* * Initially assume no capabilities. */ subsize = 0; - - /* - * Check if soft ring can be enabled on this interface. Note that we - * do not enable softring on any legacy drivers, because doing that - * would hurt the performance if the legacy driver has its own taskq - * implementation. Further, most high-performance legacy drivers do - * have their own taskq implementation. - * - * If advertising DL_CAPAB_SOFT_RING has not been explicitly disabled, - * reserve space for that capability. - */ - if (!mac_is_legacy(dsp->ds_mh) && !(dld_opt & DLD_OPT_NO_SOFTRING)) { - soft_ring_capable = B_TRUE; - subsize += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_dls_t); - } - - /* - * Check if polling can be enabled on this interface. - * If advertising DL_CAPAB_POLL has not been explicitly disabled - * then reserve space for that capability. - */ - if (mac_capab_get(dsp->ds_mh, MAC_CAPAB_POLL, NULL) && - !(dld_opt & DLD_OPT_NO_POLL) && !is_vlan) { - poll_capable = B_TRUE; - subsize += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_dls_t); - } + is_vlan = (mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE); /* * Check if checksum offload is supported on this MAC. Don't @@ -1652,16 +1581,6 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Check if LSO is supported on this MAC, then reserve space for - * the DL_CAPAB_LSO capability. - */ - if (mac_capab_get(dsp->ds_mh, MAC_CAPAB_LSO, &mac_lso)) { - lso_capable = B_TRUE; - subsize += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_lso_t); - } - - /* * Check if zerocopy is supported on this interface. * If advertising DL_CAPAB_ZEROCOPY has not been explicitly disabled * then reserve space for that capability. @@ -1674,14 +1593,22 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* + * Direct capability negotiation interface between IP and DLD + */ + if (dsp->ds_sap == ETHERTYPE_IP && check_ip_above(dsp->ds_rq)) { + dld_capable = B_TRUE; + subsize += sizeof (dl_capability_sub_t) + + sizeof (dl_capab_dld_t); + } + + /* * If there are no capabilities to advertise or if we * can't allocate a response, send a DL_ERROR_ACK. 
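 *
 * For orientation (an annotation, not patch text): the ack constructed
 * below is laid out contiguously as
 *
 *	dl_capability_ack_t | dl_capability_sub_t | subcap | ...
 *
 * with subsize accumulating the space reserved for each sub_t/subcap
 * pair above; the trailing ASSERT on ptr re-checks this arithmetic.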
*/ if ((mp1 = reallocb(mp, sizeof (dl_capability_ack_t) + subsize, 0)) == NULL) { - rw_exit(&dsp->ds_lock); dlerrorack(q, mp, DL_CAPABILITY_REQ, DL_NOTSUPPORTED, 0); - return (B_FALSE); + return; } mp = mp1; @@ -1695,56 +1622,6 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) ptr = (uint8_t *)&dlap[1]; /* - * IP polling interface. - */ - if (poll_capable) { - /* - * Attempt to disable just in case this is a re-negotiation; - * READER lock is enough because ds_polling can only be - * changed as the result of non-data message processing. - */ - proto_poll_disable(dsp); - - dlsp = (dl_capability_sub_t *)ptr; - - dlsp->dl_cap = DL_CAPAB_POLL; - dlsp->dl_length = sizeof (dl_capab_dls_t); - ptr += sizeof (dl_capability_sub_t); - - bzero(&poll, sizeof (dl_capab_dls_t)); - poll.dls_version = POLL_VERSION_1; - poll.dls_flags = POLL_CAPABLE; - poll.dls_tx_handle = (uintptr_t)dsp; - poll.dls_tx = (uintptr_t)str_mdata_fastpath_put; - dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq); - bcopy(&poll, ptr, sizeof (dl_capab_dls_t)); - ptr += sizeof (dl_capab_dls_t); - } - - - if (soft_ring_capable) { - dlsp = (dl_capability_sub_t *)ptr; - - dlsp->dl_cap = DL_CAPAB_SOFT_RING; - dlsp->dl_length = sizeof (dl_capab_dls_t); - ptr += sizeof (dl_capability_sub_t); - - bzero(&soft_ring, sizeof (dl_capab_dls_t)); - soft_ring.dls_version = SOFT_RING_VERSION_1; - soft_ring.dls_flags = SOFT_RING_CAPABLE; - soft_ring.dls_tx_handle = (uintptr_t)dsp; - soft_ring.dls_tx = (uintptr_t)str_mdata_fastpath_put; - soft_ring.dls_ring_change_status = - (uintptr_t)proto_change_soft_ring_fanout; - soft_ring.dls_ring_bind = (uintptr_t)soft_ring_bind; - soft_ring.dls_ring_unbind = (uintptr_t)soft_ring_unbind; - - dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq); - bcopy(&soft_ring, ptr, sizeof (dl_capab_dls_t)); - ptr += sizeof (dl_capab_dls_t); - } - - /* * TCP/IP checksum offload. */ if (hcksum_capable) { @@ -1761,32 +1638,6 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Large segment offload. (LSO) - */ - if (lso_capable) { - dlsp = (dl_capability_sub_t *)ptr; - - dlsp->dl_cap = DL_CAPAB_LSO; - dlsp->dl_length = sizeof (dl_capab_lso_t); - ptr += sizeof (dl_capability_sub_t); - - lso.lso_version = LSO_VERSION_1; - lso.lso_flags = mac_lso.lso_flags; - lso.lso_max = mac_lso.lso_basic_tcp_ipv4.lso_max; - - /* Simply enable LSO with DLD */ - dsp->ds_lso = B_TRUE; - dsp->ds_lso_max = lso.lso_max; - - dlcapabsetqid(&(lso.lso_mid), dsp->ds_rq); - bcopy(&lso, ptr, sizeof (dl_capab_lso_t)); - ptr += sizeof (dl_capab_lso_t); - } else { - dsp->ds_lso = B_FALSE; - dsp->ds_lso_max = 0; - } - - /* * Zero copy */ if (zcopy_capable) { @@ -1805,11 +1656,28 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) ptr += sizeof (dl_capab_zerocopy_t); } - ASSERT(ptr == mp->b_rptr + sizeof (dl_capability_ack_t) + subsize); + /* + * Direct capability negotiation interface between IP and DLD. + * Refer to dld.h for details. 
+ */ + if (dld_capable) { + dlsp = (dl_capability_sub_t *)ptr; + dlsp->dl_cap = DL_CAPAB_DLD; + dlsp->dl_length = sizeof (dl_capab_dld_t); + ptr += sizeof (dl_capability_sub_t); - rw_exit(&dsp->ds_lock); + bzero(&dld, sizeof (dl_capab_dld_t)); + dld.dld_version = DLD_CURRENT_VERSION; + dld.dld_capab = (uintptr_t)dld_capab; + dld.dld_capab_handle = (uintptr_t)dsp; + + dlcapabsetqid(&(dld.dld_mid), dsp->ds_rq); + bcopy(&dld, ptr, sizeof (dl_capab_dld_t)); + ptr += sizeof (dl_capab_dld_t); + } + + ASSERT(ptr == mp->b_rptr + sizeof (dl_capability_ack_t) + subsize); qreply(q, mp); - return (B_TRUE); } /* @@ -1819,8 +1687,5 @@ void dld_capabilities_disable(dld_str_t *dsp) { if (dsp->ds_polling) - proto_poll_disable(dsp); - - if (dsp->ds_soft_ring) - proto_soft_ring_disable(dsp); + (void) dld_capab_poll_disable(dsp, NULL); } diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c index 8694b9d6c4..cf7e7010dc 100644 --- a/usr/src/uts/common/io/dld/dld_str.c +++ b/usr/src/uts/common/io/dld/dld_str.c @@ -27,17 +27,17 @@ * Data-Link Driver */ +#include <inet/common.h> +#include <sys/strsubr.h> #include <sys/stropts.h> #include <sys/strsun.h> -#include <sys/strsubr.h> -#include <sys/atomic.h> -#include <sys/disp.h> -#include <sys/callb.h> #include <sys/vlan.h> -#include <sys/dld.h> #include <sys/dld_impl.h> -#include <sys/dls_impl.h> -#include <inet/common.h> +#include <sys/cpuvar.h> +#include <sys/callb.h> +#include <sys/list.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> static int str_constructor(void *, void *, int); static void str_destructor(void *, void *); @@ -49,111 +49,80 @@ static void str_notify_link_up(dld_str_t *); static void str_notify_link_down(dld_str_t *); static void str_notify_capab_reneg(dld_str_t *); static void str_notify_speed(dld_str_t *, uint32_t); -static void str_notify(void *, mac_notify_type_t); static void ioc_native(dld_str_t *, mblk_t *); static void ioc_margin(dld_str_t *, mblk_t *); static void ioc_raw(dld_str_t *, mblk_t *); static void ioc_fast(dld_str_t *, mblk_t *); static void ioc(dld_str_t *, mblk_t *); -static void dld_tx_enqueue(dld_str_t *, mblk_t *, mblk_t *, boolean_t, - uint_t, uint_t); +static void dld_ioc(dld_str_t *, mblk_t *); static void dld_wput_nondata(dld_str_t *, mblk_t *); -static void dld_wput_nondata_task(void *); -static void dld_flush_nondata(dld_str_t *); + +static void str_mdata_raw_put(dld_str_t *, mblk_t *); static mblk_t *i_dld_ether_header_update_tag(mblk_t *, uint_t, uint16_t); static mblk_t *i_dld_ether_header_strip_tag(mblk_t *); static uint32_t str_count; static kmem_cache_t *str_cachep; -static taskq_t *dld_disp_taskq = NULL; static mod_hash_t *str_hashp; #define STR_HASHSZ 64 #define STR_HASH_KEY(key) ((mod_hash_key_t)(uintptr_t)(key)) -static inline uint_t mp_getsize(mblk_t *); +#define dld_taskq system_taskq -/* - * Interval to count the TX queued depth. Default is 1s (1000000us). - * Count the queue depth immediately (not by timeout) if this is set to 0. - * See more details above dld_tx_enqueue(). - */ -uint_t tx_qdepth_interval = 1000000; +static kmutex_t dld_taskq_lock; +static kcondvar_t dld_taskq_cv; +static list_t dld_taskq_list; /* List of dld_str_t */ +boolean_t dld_taskq_quit; +boolean_t dld_taskq_done; + +static void dld_taskq_dispatch(void); /* - * Some notes on entry points, flow-control, queueing and locking: + * Some notes on entry points, flow-control, queueing. 
 *
 * This driver exports the traditional STREAMS put entry point as well as
 * the non-STREAMS fast-path transmit routine which is provided to IP via
 * the DL_CAPAB_POLL negotiation. The put procedure handles all control
 * and data operations, while the fast-path routine deals only with M_DATA
 * fast-path packets. Regardless of the entry point, all outbound packets
- * will end up in dld_tx_single(), where they will be delivered to the MAC
- * driver.
+ * will end up in DLD_TX(), where they will be delivered to the MAC layer.
 *
- * The transmit logic operates in two modes: a "not busy" mode where the
- * packets will be delivered to the MAC for a send attempt, or "busy" mode
- * where they will be enqueued in the internal queue because of flow-control.
- * Flow-control happens when the MAC driver indicates the packets couldn't
- * be transmitted due to lack of resources (e.g. running out of descriptors).
- * In such case, the driver will place a dummy message on its write-side
- * STREAMS queue so that the queue is marked as "full". Any subsequent
- * packets arriving at the driver will be enqueued in the internal queue,
- * which is drained in the context of the service thread that gets scheduled
- * whenever the driver is in the "busy" mode. When all packets have been
- * successfully delivered by MAC and the internal queue is empty, it will
- * transition to the "not busy" mode by removing the dummy message from the
- * write-side STREAMS queue; in effect this will trigger backenabling.
- * The sizes of q_hiwat and q_lowat are set to 1 and 0, respectively, due
- * to the above reasons.
+ * The transmit logic operates in the following way: All packets coming
+ * into DLD will be sent to the MAC layer through DLD_TX(). Flow-control
+ * happens when the MAC layer indicates the packets couldn't be
+ * transmitted due to 1) lack of resources (e.g. running out of
+ * descriptors), or 2) reaching the allowed bandwidth limit for this
+ * particular flow. The indication comes in the form of a Tx cookie that
+ * identifies the blocked ring. In such a case, DLD will place a
+ * dummy message on its write-side STREAMS queue so that the queue is
+ * marked as "full". Any subsequent packets arriving at the driver will
+ * still be sent to the MAC layer, where they either get queued in the Tx
+ * SRS or discarded if the queue limit is exceeded. The write-side STREAMS
+ * queue gets enabled when the MAC layer notifies DLD through MAC_NOTE_TX.
+ * When the write service procedure runs, it will remove the dummy
+ * message from the write-side STREAMS queue; in effect this will trigger
+ * backenabling. The sizes of q_hiwat and q_lowat are set to 1 and 0,
+ * respectively, due to the above reasons.
 *
- * The driver implements an internal transmit queue independent of STREAMS.
- * This allows for flexibility and provides a fast enqueue/dequeue mechanism
- * compared to the putq() and get() STREAMS interfaces. The only putq() and
- * getq() operations done by the driver are those related to placing and
- * removing the dummy message to/from the write-side STREAMS queue for flow-
- * control purposes.
+ * All non-data operations, both DLPI and ioctls, are single threaded on a per
+ * dld_str_t endpoint. This is done using a taskq so that the control operation
+ * has kernel context and can cv_wait for resources. In addition all set type
+ * operations that involve mac level state modification are serialized on a
+ * per mac end point using the perimeter mechanism provided by the mac layer. 
+ * This serializes all mac clients trying to modify a single mac end point over + * the entire sequence of mac calls made by that client as an atomic unit. The + * mac framework locking is described in mac.c. A critical element is that + * DLD/DLS does not hold any locks across the mac perimeter. * - * Locking is done independent of STREAMS due to the driver being fully MT. - * Threads entering the driver (either from put or service entry points) - * will most likely be readers, with the exception of a few writer cases - * such those handling DLPI attach/detach/bind/unbind/etc. or any of the - * DLD-related ioctl requests. The DLPI detach case is special, because - * it involves freeing resources and therefore must be single-threaded. - * Unfortunately the readers/writers lock can't be used to protect against - * it, because the lock is dropped prior to the driver calling places where - * putnext() may be invoked, and such places may depend on those resources - * to exist. Because of this, the driver always completes the DLPI detach - * process when there are no other threads running in the driver. This is - * done by keeping track of the number of threads, such that the the last - * thread leaving the driver will finish the pending DLPI detach operation. - */ - -/* - * dld_max_q_count is the queue depth threshold used to limit the number of - * outstanding packets or bytes allowed in the queue; once this limit is - * reached the driver will free any incoming ones until the queue depth - * drops below the threshold. - * - * This buffering is provided to accomodate clients which do not employ - * their own buffering scheme, and to handle occasional packet bursts. - * Clients which handle their own buffering will receive positive feedback - * from this driver as soon as it transitions into the "busy" state, i.e. - * when the queue is initially filled up; they will get backenabled once - * the queue is empty. - * - * The value chosen here is rather arbitrary; in future some intelligent - * heuristics may be involved which could take into account the hardware's - * transmit ring size, etc. - */ -uint_t dld_max_q_count = (16 * 1024 *1024); - -/* * dld_finddevinfo() returns the dev_info_t * corresponding to a particular * dev_t. It searches str_hashp (a table of dld_str_t's) for streams that * match dev_t. If a stream is found and it is attached, its dev_info_t * - * is returned. + * is returned. If the mac handle is non-null, it can be safely accessed + * below. The mac handle won't be freed until the mac_unregister which + * won't happen until the driver detaches. The DDI framework ensures that + * the detach won't happen while a getinfo is in progress. */ typedef struct i_dld_str_state_s { major_t ds_major; @@ -167,35 +136,31 @@ i_dld_str_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) { i_dld_str_state_t *statep = arg; dld_str_t *dsp = (dld_str_t *)val; + mac_handle_t mh; if (statep->ds_major != dsp->ds_major) return (MH_WALK_CONTINUE); ASSERT(statep->ds_minor != 0); + mh = dsp->ds_mh; - /* - * Access to ds_mh needs to be protected by ds_lock. - */ - rw_enter(&dsp->ds_lock, RW_READER); if (statep->ds_minor == dsp->ds_minor) { /* * Clone: a clone minor is unique. we can terminate the * walk if we find a matching stream -- even if we fail * to obtain the devinfo. 
*/ - if (dsp->ds_mh != NULL) - statep->ds_dip = mac_devinfo_get(dsp->ds_mh); - rw_exit(&dsp->ds_lock); + if (mh != NULL) + statep->ds_dip = mac_devinfo_get(mh); return (MH_WALK_TERMINATE); } - rw_exit(&dsp->ds_lock); return (MH_WALK_CONTINUE); } static dev_info_t * dld_finddevinfo(dev_t dev) { - dev_info_t *dip; + dev_info_t *dip; i_dld_str_state_t state; if (getminor(dev) == 0) @@ -204,7 +169,7 @@ dld_finddevinfo(dev_t dev) /* * See if it's a minor node of a link */ - if ((dip = dls_finddevinfo(dev)) != NULL) + if ((dip = dls_link_devinfo(dev)) != NULL) return (dip); state.ds_minor = getminor(dev); @@ -319,11 +284,24 @@ dld_close(queue_t *rq) dld_str_t *dsp = rq->q_ptr; /* + * All modules on top have been popped off. So there can't be any + * threads from the top. + */ + ASSERT(dsp->ds_datathr_cnt == 0); + + /* + * Wait until pending DLPI requests are processed. + */ + mutex_enter(&dsp->ds_lock); + while (dsp->ds_dlpi_pending) + cv_wait(&dsp->ds_dlpi_pending_cv, &dsp->ds_lock); + mutex_exit(&dsp->ds_lock); + + /* * Disable the queue srv(9e) routine. */ qprocsoff(rq); - dld_finish_pending_task(dsp); /* * This stream was open to a provider node. Check to see @@ -348,58 +326,55 @@ dld_close(queue_t *rq) void dld_wput(queue_t *wq, mblk_t *mp) { - dld_str_t *dsp = wq->q_ptr; + dld_str_t *dsp = (dld_str_t *)wq->q_ptr; + dld_str_mode_t mode; switch (DB_TYPE(mp)) { - case M_DATA: { - dld_tx_t tx; - - DLD_TX_ENTER(dsp); - if ((tx = dsp->ds_tx) != NULL) - tx(dsp, mp); - else - freemsg(mp); - DLD_TX_EXIT(dsp); + case M_DATA: + mutex_enter(&dsp->ds_lock); + if (dsp->ds_dlstate == DL_IDLE) { + mode = dsp->ds_mode; + if (mode == DLD_FASTPATH || mode == DLD_RAW) { + DLD_DATATHR_INC(dsp); + mutex_exit(&dsp->ds_lock); + if (mode == DLD_FASTPATH) { + (void) str_mdata_fastpath_put(dsp, mp, + 0, 0); + } else { + str_mdata_raw_put(dsp, mp); + } + DLD_DATATHR_DCR(dsp); + break; + } + } + mutex_exit(&dsp->ds_lock); + freemsg(mp); break; - } + case M_PROTO: case M_PCPROTO: { t_uscalar_t prim; - dld_tx_t tx; - if (MBLKL(mp) < sizeof (t_uscalar_t)) { - freemsg(mp); - return; - } + if (MBLKL(mp) < sizeof (t_uscalar_t)) + break; prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; - if (prim != DL_UNITDATA_REQ) { - /* Control path */ + + if (prim == DL_UNITDATA_REQ) { + proto_unitdata_req(dsp, mp); + } else { dld_wput_nondata(dsp, mp); - break; } - - /* Data path */ - DLD_TX_ENTER(dsp); - if ((tx = dsp->ds_unitdata_tx) != NULL) - tx(dsp, mp); - else - dlerrorack(wq, mp, DL_UNITDATA_REQ, DL_OUTSTATE, 0); - DLD_TX_EXIT(dsp); break; } + case M_IOCTL: - case M_IOCDATA: - /* Control path */ dld_wput_nondata(dsp, mp); break; + case M_FLUSH: - /* - * Flush both the data messages and the control messages. - */ if (*mp->b_rptr & FLUSHW) { - dld_flush_nondata(dsp); - dld_tx_flush(dsp); + DLD_CLRQFULL(dsp); *mp->b_rptr &= ~FLUSHW; } @@ -409,6 +384,7 @@ dld_wput(queue_t *wq, mblk_t *mp) freemsg(mp); } break; + default: freemsg(mp); break; @@ -416,122 +392,14 @@ dld_wput(queue_t *wq, mblk_t *mp) } /* - * Called by GLDv3 control node to process the ioctls. It will start - * a taskq to allow the ioctl processing to block. This is a temporary - * solution, and will be replaced by a more graceful approach afterwards. 
- */ -void -dld_ioctl(queue_t *wq, mblk_t *mp) -{ - dld_wput_nondata(wq->q_ptr, mp); -} - -/* * qi_srvp: srv(9e) */ void dld_wsrv(queue_t *wq) { - mblk_t *mp, *head, *tail; dld_str_t *dsp = wq->q_ptr; - uint_t cnt, msgcnt; - timeout_id_t tid = 0; - - rw_enter(&dsp->ds_lock, RW_READER); - /* - * Grab all packets (chained via b_next) off our transmit queue - * and try to send them all to the MAC layer. Since the queue - * is independent of streams, we are able to dequeue all messages - * at once without looping through getq() and manually chaining - * them. Note that the queue size parameters (byte and message - * counts) are cleared as well, but we postpone the backenabling - * until after the MAC transmit since some packets may end up - * back at our transmit queue. - */ - mutex_enter(&dsp->ds_tx_list_lock); - if ((mp = dsp->ds_tx_list_head) == NULL) { - ASSERT(!dsp->ds_tx_qbusy); - ASSERT(dsp->ds_tx_flow_mp != NULL); - ASSERT(dsp->ds_tx_list_head == NULL); - ASSERT(dsp->ds_tx_list_tail == NULL); - ASSERT(dsp->ds_tx_cnt == 0); - ASSERT(dsp->ds_tx_msgcnt == 0); - mutex_exit(&dsp->ds_tx_list_lock); - rw_exit(&dsp->ds_lock); - return; - } - head = mp; - tail = dsp->ds_tx_list_tail; - dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL; - cnt = dsp->ds_tx_cnt; - msgcnt = dsp->ds_tx_msgcnt; - dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0; - mutex_exit(&dsp->ds_tx_list_lock); - - /* - * Discard packets unless we are attached and bound; note that - * the driver mode (fastpath/raw/unitdata) is irrelevant here, - * because regardless of the mode all transmit will end up in - * dld_tx_single() where the packets may be queued. - */ - ASSERT((DB_TYPE(mp) == M_DATA) || (DB_TYPE(mp) == M_MULTIDATA)); - if (dsp->ds_dlstate != DL_IDLE) { - freemsgchain(mp); - goto done; - } - - /* - * Attempt to transmit one or more packets. If the MAC can't - * send them all, re-queue the packet(s) at the beginning of - * the transmit queue to avoid any re-ordering. - */ - mp = dls_tx(dsp->ds_dc, mp); - if (mp == head) { - /* - * No message was sent out. Take the saved the queue depth - * as the input, so that dld_tx_enqueue() need not to - * calculate it again. - */ - dld_tx_enqueue(dsp, mp, tail, B_TRUE, msgcnt, cnt); - } else if (mp != NULL) { - /* - * Some but not all messages were sent out. dld_tx_enqueue() - * needs to start the timer to calculate the queue depth if - * timer has not been started. - * - * Note that a timer is used to calculate the queue depth - * to improve network performance, especially for TCP, in - * which case packets are sent without canput() being checked, - * and mostly end up in dld_tx_enqueue() under heavy load. - */ - dld_tx_enqueue(dsp, mp, tail, B_TRUE, 0, 0); - } - -done: - /* - * Grab the list lock again and check if the transmit queue is - * really empty; if so, lift up flow-control and backenable any - * writer queues. If the queue is not empty, schedule service - * thread to drain it. - */ - mutex_enter(&dsp->ds_tx_list_lock); - if (dsp->ds_tx_list_head == NULL) { - dsp->ds_tx_flow_mp = getq(wq); - ASSERT(dsp->ds_tx_flow_mp != NULL); - dsp->ds_tx_qbusy = B_FALSE; - if ((tid = dsp->ds_tx_qdepth_tid) != 0) - dsp->ds_tx_qdepth_tid = 0; - } - mutex_exit(&dsp->ds_tx_list_lock); - - /* - * Note that ds_tx_list_lock (which is acquired by the timeout - * callback routine) cannot be held across the call to untimeout(). 
- */ - if (tid != 0) - (void) untimeout(tid); - rw_exit(&dsp->ds_lock); + DLD_CLRQFULL(dsp); } void @@ -602,12 +470,6 @@ dld_str_init(void) ASSERT(str_cachep != NULL); /* - * Create taskq to process DLPI requests. - */ - dld_disp_taskq = taskq_create("dld_disp_taskq", 1024, MINCLSYSPRI, 2, - INT_MAX, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); - - /* * Create a hash table for maintaining dld_str_t's. * The ds_minor field (the clone minor number) of a dld_str_t * is used as a key for this hash table because this number is @@ -615,6 +477,16 @@ dld_str_init(void) */ str_hashp = mod_hash_create_idhash("dld_str_hash", STR_HASHSZ, mod_hash_null_valdtor); + + mutex_init(&dld_taskq_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&dld_taskq_cv, NULL, CV_DRIVER, NULL); + + dld_taskq_quit = B_FALSE; + dld_taskq_done = B_FALSE; + list_create(&dld_taskq_list, sizeof (dld_str_t), + offsetof(dld_str_t, ds_tqlist)); + (void) thread_create(NULL, 0, dld_taskq_dispatch, NULL, 0, + &p0, TS_RUN, minclsyspri); } /* @@ -629,10 +501,16 @@ dld_str_fini(void) if (str_count != 0) return (EBUSY); - ASSERT(dld_disp_taskq != NULL); - taskq_destroy(dld_disp_taskq); - dld_disp_taskq = NULL; - + /* + * Ask the dld_taskq thread to quit and wait for it to be done + */ + mutex_enter(&dld_taskq_lock); + dld_taskq_quit = B_TRUE; + cv_signal(&dld_taskq_cv); + while (!dld_taskq_done) + cv_wait(&dld_taskq_cv, &dld_taskq_lock); + mutex_exit(&dld_taskq_lock); + list_destroy(&dld_taskq_list); /* * Destroy object cache. */ @@ -668,7 +546,6 @@ dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style) dsp->ds_type = type; dsp->ds_major = major; dsp->ds_style = style; - dsp->ds_tx = dsp->ds_unitdata_tx = NULL; /* * Initialize the queue pointers. @@ -690,20 +567,6 @@ dld_str_create(queue_t *rq, uint_t type, major_t major, t_uscalar_t style) return (dsp); } -void -dld_finish_pending_task(dld_str_t *dsp) -{ - /* - * Wait until the pending requests are processed by the worker thread. - */ - mutex_enter(&dsp->ds_disp_lock); - dsp->ds_closing = B_TRUE; - while (dsp->ds_tid != NULL) - cv_wait(&dsp->ds_disp_cv, &dsp->ds_disp_lock); - dsp->ds_closing = B_FALSE; - mutex_exit(&dsp->ds_disp_lock); -} - /* * Destroy a dld_str_t object. */ @@ -713,30 +576,29 @@ dld_str_destroy(dld_str_t *dsp) queue_t *rq; queue_t *wq; mod_hash_val_t val; + /* * Clear the queue pointers. */ rq = dsp->ds_rq; wq = dsp->ds_wq; ASSERT(wq == WR(rq)); - rq->q_ptr = wq->q_ptr = NULL; dsp->ds_rq = dsp->ds_wq = NULL; - ASSERT(!RW_LOCK_HELD(&dsp->ds_lock)); - ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock)); - ASSERT(dsp->ds_tx_list_head == NULL); - ASSERT(dsp->ds_tx_list_tail == NULL); - ASSERT(dsp->ds_tx_cnt == 0); - ASSERT(dsp->ds_tx_msgcnt == 0); - ASSERT(dsp->ds_tx_qdepth_tid == 0); - ASSERT(!dsp->ds_tx_qbusy); + ASSERT(dsp->ds_dlstate == DL_UNATTACHED); + ASSERT(dsp->ds_sap == 0); + ASSERT(dsp->ds_mh == NULL); + ASSERT(dsp->ds_mch == NULL); + ASSERT(dsp->ds_promisc == 0); + ASSERT(dsp->ds_mph == NULL); + ASSERT(dsp->ds_mip == NULL); + ASSERT(dsp->ds_mnh == NULL); - ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock)); - ASSERT(dsp->ds_pending_head == NULL); - ASSERT(dsp->ds_pending_tail == NULL); - ASSERT(dsp->ds_tx == NULL); - ASSERT(dsp->ds_unitdata_tx == NULL); + ASSERT(dsp->ds_polling == B_FALSE); + ASSERT(dsp->ds_direct == B_FALSE); + ASSERT(dsp->ds_lso == B_FALSE); + ASSERT(dsp->ds_lso_max == 0); /* * Reinitialize all the flags. 
@@ -746,6 +608,18 @@ dld_str_destroy(dld_str_t *dsp)
 dsp->ds_mode = DLD_UNITDATA;
 dsp->ds_native = B_FALSE;
+ ASSERT(dsp->ds_datathr_cnt == 0);
+ ASSERT(dsp->ds_pending_head == NULL);
+ ASSERT(dsp->ds_pending_tail == NULL);
+ ASSERT(!dsp->ds_dlpi_pending);
+
+ ASSERT(dsp->ds_dlp == NULL);
+ ASSERT(dsp->ds_dmap == NULL);
+ ASSERT(dsp->ds_rx == NULL);
+ ASSERT(dsp->ds_rx_arg == NULL);
+ ASSERT(dsp->ds_next == NULL);
+ ASSERT(dsp->ds_head == NULL);
+
 /*
 * Free the dummy mblk if exists.
 */
@@ -786,12 +660,9 @@ str_constructor(void *buf, void *cdrarg, int kmflags)
 */
 dsp->ds_dlstate = DL_UNATTACHED;
- rw_init(&dsp->ds_lock, NULL, RW_DRIVER, NULL);
- mutex_init(&dsp->ds_tx_list_lock, NULL, MUTEX_DRIVER, NULL);
- mutex_init(&dsp->ds_disp_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&dsp->ds_disp_cv, NULL, CV_DRIVER, NULL);
- mutex_init(&dsp->ds_tx_lock, NULL, MUTEX_DRIVER, NULL);
- cv_init(&dsp->ds_tx_cv, NULL, CV_DRIVER, NULL);
+ mutex_init(&dsp->ds_lock, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&dsp->ds_datathr_cv, NULL, CV_DRIVER, NULL);
+ cv_init(&dsp->ds_dlpi_pending_cv, NULL, CV_DRIVER, NULL);
 return (0);
}
@@ -806,78 +677,20 @@ str_destructor(void *buf, void *cdrarg)
 dld_str_t *dsp = buf;
 /*
- * Make sure the DLPI state machine was reset.
- */
- ASSERT(dsp->ds_dlstate == DL_UNATTACHED);
-
- /*
- * Make sure the data-link interface was closed.
- */
- ASSERT(dsp->ds_mh == NULL);
- ASSERT(dsp->ds_dc == NULL);
- ASSERT(dsp->ds_tx == NULL);
- ASSERT(dsp->ds_unitdata_tx == NULL);
- ASSERT(dsp->ds_intx_cnt == 0);
- ASSERT(dsp->ds_detaching == B_FALSE);
-
- /*
- * Make sure enabled notifications are cleared.
- */
- ASSERT(dsp->ds_notifications == 0);
-
- /*
- * Make sure polling is disabled.
- */
- ASSERT(!dsp->ds_polling);
-
- /*
 * Release the minor number.
 */
 mac_minor_rele(dsp->ds_minor);
- ASSERT(!RW_LOCK_HELD(&dsp->ds_lock));
- rw_destroy(&dsp->ds_lock);
-
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_list_lock));
- mutex_destroy(&dsp->ds_tx_list_lock);
 ASSERT(dsp->ds_tx_flow_mp == NULL);
- ASSERT(dsp->ds_pending_head == NULL);
- ASSERT(dsp->ds_pending_tail == NULL);
- ASSERT(!dsp->ds_closing);
-
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_disp_lock));
- mutex_destroy(&dsp->ds_disp_lock);
- cv_destroy(&dsp->ds_disp_cv);
-
- ASSERT(MUTEX_NOT_HELD(&dsp->ds_tx_lock));
- mutex_destroy(&dsp->ds_tx_lock);
- cv_destroy(&dsp->ds_tx_cv);
-}
-
-void
-dld_tx_single(dld_str_t *dsp, mblk_t *mp)
-{
- /*
- * If we are busy enqueue the packet and return.
- * Otherwise hand them over to the MAC driver for transmission.
- * If the message didn't get sent it will be queued.
- *
- * Note here that we don't grab the list lock prior to checking
- * the busy flag. This is okay, because a missed transition
- * will not cause any packet reordering for any particular TCP
- * connection (which is single-threaded). The enqueue routine
- * will atomically set the busy flag and schedule the service
- * thread to run; the flag is only cleared by the service thread
- * when there is no more packet to be transmitted.
- */
- if (dsp->ds_tx_qbusy || ((mp = dls_tx(dsp->ds_dc, mp)) != NULL))
- dld_tx_enqueue(dsp, mp, mp, B_FALSE, 1, mp_getsize(mp));
+ mutex_destroy(&dsp->ds_lock);
+ cv_destroy(&dsp->ds_datathr_cv);
+ cv_destroy(&dsp->ds_dlpi_pending_cv);
}
/*
 * Update the priority bits and VID (may need to insert tag if mp points
 * to an untagged packet).
 * If vid is VLAN_ID_NONE, use the VID encoded in the packet. 
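 *
 * For reference (background assumption, not text from this change): the
 * 802.1Q tag control information rewritten here packs into 16 bits as
 *
 *	tci = (pri << 13) | (cfi << 12) | vid
 *
 * so a VID of VLAN_ID_NONE denotes a priority-only tag.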
*/ static mblk_t * @@ -960,18 +773,16 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid) } /* - * M_DATA put - * - * The poll callback function for DLS clients which are not in the per-stream - * mode. This function is called from an upper layer protocol (currently only - * tcp and udp). + * M_DATA put (IP fast-path mode) */ -void -str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp) +mac_tx_cookie_t +str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint, + uint16_t flag) { boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); mblk_t *newmp; uint_t pri; + mac_tx_cookie_t cookie; if (is_ethernet) { /* @@ -988,25 +799,28 @@ str_mdata_fastpath_put(dld_str_t *dsp, mblk_t *mp) } } - dld_tx_single(dsp, mp); - return; + if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) { + DLD_SETQFULL(dsp); + } + return (cookie); discard: /* TODO: bump kstat? */ freemsg(mp); + return (NULL); } /* - * M_DATA put (DLIOCRAW mode). + * M_DATA put (DLIOCRAW mode) */ -void +static void str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) { boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); mblk_t *bp, *newmp; size_t size; mac_header_info_t mhi; - uint_t pri, vid; + uint_t pri, vid, dvid; uint_t max_sdu; /* @@ -1039,7 +853,7 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) size += MBLKL(bp); } - if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0) + if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0) goto discard; mac_sdu_get(dsp->ds_mh, NULL, &max_sdu); @@ -1052,12 +866,14 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) goto discard; if (is_ethernet) { + dvid = mac_client_vid(dsp->ds_mch); + /* * Discard the packet if this is a VLAN stream but the VID in * the packet is not correct. */ vid = VLAN_ID(mhi.mhi_tci); - if ((dsp->ds_vid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) + if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) goto discard; /* @@ -1074,16 +890,19 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) * packets on a VLAN stream. */ pri = (pri == 0) ? dsp->ds_pri : 0; - if ((pri != 0) || (dsp->ds_vid != VLAN_ID_NONE)) { + if ((pri != 0) || (dvid != VLAN_ID_NONE)) { if ((newmp = i_dld_ether_header_update_tag(mp, - pri, dsp->ds_vid)) == NULL) { + pri, dvid)) == NULL) { goto discard; } mp = newmp; } } - dld_tx_single(dsp, mp); + if (DLD_TX(dsp, mp, 0, 0) != NULL) { + /* Turn on flow-control for dld */ + DLD_SETQFULL(dsp); + } return; discard: @@ -1097,18 +916,21 @@ discard: int dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa) { - dev_t dev; - int err; - const char *drvname; - dls_channel_t dc; - uint_t addr_length; - boolean_t qassociated = B_FALSE; - - ASSERT(dsp->ds_dc == NULL); + dev_t dev; + int err; + const char *drvname; + mac_perim_handle_t mph; + boolean_t qassociated = B_FALSE; + dls_link_t *dlp = NULL; + dls_dl_handle_t ddp = NULL; + boolean_t entered_perim = B_FALSE; if ((drvname = ddi_major_to_name(dsp->ds_major)) == NULL) return (EINVAL); + if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA) + return (ENOTSUP); + /* * /dev node access. This will still be supported for backward * compatibility reason. @@ -1120,46 +942,22 @@ dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa) qassociated = B_TRUE; } - /* - * Open a channel. - */ - if (dsp->ds_style == DL_STYLE2 && ppa > DLS_MAX_PPA) { - /* - * style-2 VLAN open, this is a /dev VLAN ppa open - * which might result in a newly created dls_vlan_t. 
- */ - err = dls_open_style2_vlan(dsp->ds_major, ppa, &dc); - if (err != 0) { - if (qassociated) - (void) qassociate(dsp->ds_wq, -1); - return (err); - } - } else { - dev = makedevice(dsp->ds_major, (minor_t)ppa + 1); - if ((err = dls_open_by_dev(dev, &dc)) != 0) { - if (qassociated) - (void) qassociate(dsp->ds_wq, -1); - return (err); - } - } - - /* - * Cache the MAC interface handle, a pointer to the immutable MAC - * information and the current and 'factory' MAC address. - */ - dsp->ds_mh = dls_mac(dc); - dsp->ds_mip = mac_info(dsp->ds_mh); - - mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr); + dev = makedevice(dsp->ds_major, (minor_t)ppa + 1); + if ((err = dls_devnet_hold_by_dev(dev, &ddp)) != 0) + goto failed; - addr_length = dsp->ds_mip->mi_addr_length; - bcopy(dsp->ds_mip->mi_unicst_addr, dsp->ds_fact_addr, addr_length); + if ((err = mac_perim_enter_by_macname(dls_devnet_mac(ddp), &mph)) != 0) + goto failed; + entered_perim = B_TRUE; /* - * Cache the interface VLAN identifier. (This will be VLAN_ID_NONE for - * a non-VLAN interface). + * Open a channel. */ - dsp->ds_vid = dls_vid(dc); + if ((err = dls_link_hold(dls_devnet_mac(ddp), &dlp)) != 0) + goto failed; + + if ((err = dls_open(dlp, ddp, dsp)) != 0) + goto failed; /* * Set the default packet priority. @@ -1169,12 +967,22 @@ dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa) /* * Add a notify function so that the we get updates from the MAC. */ - dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, (void *)dsp); - - dsp->ds_dc = dc; + dsp->ds_mnh = mac_notify_add(dsp->ds_mh, str_notify, dsp); dsp->ds_dlstate = DL_UNBOUND; - + mac_perim_exit(mph); return (0); + +failed: + if (dlp != NULL) + dls_link_rele(dlp); + if (entered_perim) + mac_perim_exit(mph); + if (ddp != NULL) + dls_devnet_rele(ddp); + if (qassociated) + (void) qassociate(dsp->ds_wq, -1); + + return (err); } /* @@ -1184,35 +992,56 @@ dld_str_attach(dld_str_t *dsp, t_uscalar_t ppa) void dld_str_detach(dld_str_t *dsp) { + mac_perim_handle_t mph; + int err; + + ASSERT(dsp->ds_datathr_cnt == 0); + + mac_perim_enter_by_mh(dsp->ds_mh, &mph); /* * Remove the notify function. + * + * Note that we cannot wait for the notification callback to be removed + * since it could cause the deadlock with str_notify() since they both + * need the mac perimeter. Continue if we cannot remove the + * notification callback right now and wait after we leave the + * perimeter. */ - mac_notify_remove(dsp->ds_mh, dsp->ds_mnh); + err = mac_notify_remove(dsp->ds_mnh, B_FALSE); + dsp->ds_mnh = NULL; /* - * Disable the capabilities and clear the promisc flag. + * Disable the capabilities */ - ASSERT(!dsp->ds_polling); - ASSERT(!dsp->ds_soft_ring); dld_capabilities_disable(dsp); - dsp->ds_promisc = 0; - DLD_TX_QUIESCE(dsp); + /* + * Clear LSO flags. + */ + dsp->ds_lso = B_FALSE; + dsp->ds_lso_max = 0; + + dls_close(dsp); + mac_perim_exit(mph); /* - * Flush all pending packets which are sitting in the transmit queue. + * Now we leave the mac perimeter. If mac_notify_remove() failed + * because the notification callback was in progress, wait for + * it to finish before we proceed. */ - dld_tx_flush(dsp); + if (err != 0) + mac_notify_remove_wait(dsp->ds_mh); /* - * Clear LSO flags. + * An unreferenced tagged (non-persistent) vlan gets destroyed + * automatically in the call to dls_devnet_rele. 
*/ - dsp->ds_lso = B_FALSE; - dsp->ds_lso_max = 0; + dls_devnet_rele(dsp->ds_ddh); - dls_close(dsp->ds_dc); - dsp->ds_dc = NULL; + dsp->ds_sap = 0; dsp->ds_mh = NULL; + dsp->ds_mch = NULL; + dsp->ds_mip = NULL; if (dsp->ds_style == DL_STYLE2) (void) qassociate(dsp->ds_wq, -1); @@ -1221,7 +1050,6 @@ dld_str_detach(dld_str_t *dsp) * Re-initialize the DLPI state machine. */ dsp->ds_dlstate = DL_UNATTACHED; - } /* @@ -1314,7 +1142,8 @@ dld_str_rx_raw(void *arg, mac_resource_handle_t mrh, mblk_t *mp, /* * Strip the VLAN tag for VLAN streams. */ - if (is_ethernet && dsp->ds_vid != VLAN_ID_NONE) { + if (is_ethernet && + mac_client_vid(dsp->ds_mch) != VLAN_ID_NONE) { newmp = i_dld_ether_header_strip_tag(mp); if (newmp == NULL) { freemsg(mp); @@ -1366,7 +1195,8 @@ dld_str_rx_fastpath(void *arg, mac_resource_handle_t mrh, mblk_t *mp, * * Otherwise, strip the whole VLAN header. * - Untagged packets. Strip the whole MAC header. */ - if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) && + if (mhip->mhi_istagged && + (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) && ((dsp->ds_sap == ETHERTYPE_VLAN) || (dsp->ds_promisc & DLS_PROMISC_SAP))) { offset = VLAN_TAGSZ; @@ -1418,7 +1248,8 @@ dld_str_rx_unitdata(void *arg, mac_resource_handle_t mrh, mblk_t *mp, /* * See MAC header stripping rules in the dld_str_rx_fastpath() function. */ - if (mhip->mhi_istagged && (dsp->ds_vid == VLAN_ID_NONE) && + if (mhip->mhi_istagged && + (mac_client_vid(dsp->ds_mch) == VLAN_ID_NONE) && ((dsp->ds_sap == ETHERTYPE_VLAN) || (dsp->ds_promisc & DLS_PROMISC_SAP))) { offset = VLAN_TAGSZ; @@ -1534,7 +1365,7 @@ str_unitdata_ind(dld_str_t *dsp, mblk_t *mp, boolean_t strip_vlan) /* * Get the packet header information. */ - if (dls_header_info(dsp->ds_dc, mp, &mhi) != 0) + if (dls_link_header_info(dsp->ds_dlp, mp, &mhi) != 0) return (NULL); /* @@ -1805,11 +1636,14 @@ str_notify_fastpath_flush(dld_str_t *dsp) /* * MAC notification callback. */ -static void +void str_notify(void *arg, mac_notify_type_t type) { dld_str_t *dsp = (dld_str_t *)arg; queue_t *q = dsp->ds_wq; + mac_handle_t mh = dsp->ds_mh; + mac_client_handle_t mch = dsp->ds_mch; + uint8_t addr[MAXMACADDRLEN]; switch (type) { case MAC_NOTE_TX: @@ -1820,26 +1654,23 @@ str_notify(void *arg, mac_notify_type_t type) /* * Send the appropriate DL_NOTIFY_IND. */ - if (mac_promisc_get(dsp->ds_mh, MAC_DEVPROMISC)) + if (mac_promisc_get(mh, MAC_DEVPROMISC)) str_notify_promisc_on_phys(dsp); else str_notify_promisc_off_phys(dsp); break; - case MAC_NOTE_PROMISC: - break; - case MAC_NOTE_UNICST: /* - * This notification is sent whenever the MAC unicast address - * changes. We need to re-cache the address. + * This notification is sent whenever the MAC unicast + * address changes. */ - mac_unicst_get(dsp->ds_mh, dsp->ds_curr_addr); + mac_unicast_primary_get(mh, addr); /* * Send the appropriate DL_NOTIFY_IND. */ - str_notify_phys_addr(dsp, dsp->ds_curr_addr); + str_notify_phys_addr(dsp, addr); break; case MAC_NOTE_LINK: @@ -1847,7 +1678,7 @@ str_notify(void *arg, mac_notify_type_t type) * This notification is sent every time the MAC driver * updates the link state. 
 */
- switch (mac_link_get(dsp->ds_mh)) {
+ switch (mac_client_stat_get(mch, MAC_STAT_LINK_STATE)) {
 case LINK_STATE_UP: {
 uint64_t speed;
@@ -1856,7 +1687,7 @@ str_notify(void *arg, mac_notify_type_t type)
 */
 str_notify_link_up(dsp);
- speed = mac_stat_get(dsp->ds_mh, MAC_STAT_IFSPEED);
+ speed = mac_stat_get(mh, MAC_STAT_IFSPEED);
 str_notify_speed(dsp, (uint32_t)(speed / 1000ull));
 break;
 }
@@ -1874,7 +1705,7 @@ str_notify(void *arg, mac_notify_type_t type)
 break;
 case MAC_NOTE_RESOURCE:
- case MAC_NOTE_VNIC:
+ case MAC_NOTE_CAPAB_CHG:
 /*
 * This notification is sent whenever the MAC resources
 * change or capabilities change. We need to renegotiate
@@ -1897,334 +1728,177 @@ str_notify(void *arg, mac_notify_type_t type)
 case MAC_NOTE_MARGIN:
 break;
+ case MAC_NOTE_PROMISC:
+ break;
+
 default:
 ASSERT(B_FALSE);
 break;
 }
}
-static inline uint_t
-mp_getsize(mblk_t *mp)
-{
- ASSERT(DB_TYPE(mp) == M_DATA);
- return ((mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp));
-}
-
/*
- * Calculate the dld queue depth, free the messages that exceed the threshold.
+ * This function is called via a taskq mechanism to process all control
+ * messages on a per 'dsp' end point.
 */
static void
-dld_tx_qdepth_timer(void *arg)
+dld_wput_nondata_task(void *arg)
{
- dld_str_t *dsp = (dld_str_t *)arg;
- mblk_t *prev, *mp;
- uint_t cnt, msgcnt, size;
-
- mutex_enter(&dsp->ds_tx_list_lock);
-
- /* Calculate total size and count of the packet(s) */
- cnt = msgcnt = 0;
- for (prev = NULL, mp = dsp->ds_tx_list_head; mp != NULL;
- prev = mp, mp = mp->b_next) {
- size = mp_getsize(mp);
- cnt += size;
- msgcnt++;
- if (cnt >= dld_max_q_count || msgcnt >= dld_max_q_count) {
- ASSERT(dsp->ds_tx_qbusy);
- dsp->ds_tx_list_tail = prev;
- if (prev == NULL)
- dsp->ds_tx_list_head = NULL;
- else
- prev->b_next = NULL;
- freemsgchain(mp);
- cnt -= size;
- msgcnt--;
+ dld_str_t *dsp = arg;
+ mblk_t *mp;
+
+ mutex_enter(&dsp->ds_lock);
+ while (dsp->ds_pending_head != NULL) {
+ mp = dsp->ds_pending_head;
+ dsp->ds_pending_head = mp->b_next;
+ mp->b_next = NULL;
+ if (dsp->ds_pending_head == NULL)
+ dsp->ds_pending_tail = NULL;
+ mutex_exit(&dsp->ds_lock);
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ dld_proto(dsp, mp);
 break;
+ case M_IOCTL:
+ dld_ioc(dsp, mp);
+ break;
+ default:
+ ASSERT(0);
 }
+
+ mutex_enter(&dsp->ds_lock);
 }
- dsp->ds_tx_cnt = cnt;
- dsp->ds_tx_msgcnt = msgcnt;
- dsp->ds_tx_qdepth_tid = 0;
- mutex_exit(&dsp->ds_tx_list_lock);
+ ASSERT(dsp->ds_pending_tail == NULL);
+ dsp->ds_dlpi_pending = 0;
+ cv_broadcast(&dsp->ds_dlpi_pending_cv);
+ mutex_exit(&dsp->ds_lock);
}
/*
- * Enqueue one or more messages on the transmit queue. Caller specifies:
- * - the insertion position (head/tail).
- * - the message count and the total message size of messages to be queued
- * if they are known to the caller; or 0 if they are not known.
- *
- * If the caller does not know the message size information, this usually
- * means that dld_wsrv() managed to send some but not all of the queued
- * messages. For performance reasons, we do not calculate the queue depth
- * every time. Instead, a timer is started to calculate the queue depth
- * every 1 second (can be changed by tx_qdepth_interval).
+ * Kernel thread to handle taskq dispatch failures in dld_wput_nondata. This
+ * thread is started at boot time. 
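 *
 * (Annotation: the producer side of this hand-off is dld_wput_nondata()
 * below. When its taskq_dispatch(..., TQ_NOSLEEP) fails, the dsp is
 * queued on dld_taskq_list and dld_taskq_cv is signalled; this thread
 * then re-dispatches with TQ_SLEEP, which blocks instead of failing.)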
*/ static void -dld_tx_enqueue(dld_str_t *dsp, mblk_t *mp, mblk_t *tail, boolean_t head_insert, - uint_t msgcnt, uint_t cnt) +dld_taskq_dispatch(void) { - queue_t *q = dsp->ds_wq; - uint_t tot_cnt, tot_msgcnt; - mblk_t *next; - - mutex_enter(&dsp->ds_tx_list_lock); - - /* - * Simply enqueue the message and calculate the queue depth via - * timer if: - * - * - the current queue depth is incorrect, and the timer is already - * started; or - * - * - the given message size is unknown and it is allowed to start the - * timer; - */ - if ((dsp->ds_tx_qdepth_tid != 0) || - (msgcnt == 0 && tx_qdepth_interval != 0)) { - goto enqueue; - } + callb_cpr_t cprinfo; + dld_str_t *dsp; - /* - * The timer is not allowed, so calculate the message size now. - */ - if (msgcnt == 0) { - for (next = mp; next != NULL; next = next->b_next) { - cnt += mp_getsize(next); - msgcnt++; + CALLB_CPR_INIT(&cprinfo, &dld_taskq_lock, callb_generic_cpr, + "dld_taskq_dispatch"); + mutex_enter(&dld_taskq_lock); + + while (!dld_taskq_quit) { + dsp = list_head(&dld_taskq_list); + while (dsp != NULL) { + list_remove(&dld_taskq_list, dsp); + mutex_exit(&dld_taskq_lock); + VERIFY(taskq_dispatch(dld_taskq, dld_wput_nondata_task, + dsp, TQ_SLEEP) != 0); + mutex_enter(&dld_taskq_lock); + dsp = list_head(&dld_taskq_list); } - } - - /* - * Grow the queue depth using the input messesge size. - * - * If the queue depth would exceed the allowed threshold, drop - * new packet(s) and drain those already in the queue. - */ - tot_cnt = dsp->ds_tx_cnt + cnt; - tot_msgcnt = dsp->ds_tx_msgcnt + msgcnt; - - if (!head_insert && (tot_cnt >= dld_max_q_count || - tot_msgcnt >= dld_max_q_count)) { - ASSERT(dsp->ds_tx_qbusy); - mutex_exit(&dsp->ds_tx_list_lock); - freemsgchain(mp); - goto done; - } - /* Update the queue size parameters */ - dsp->ds_tx_cnt = tot_cnt; - dsp->ds_tx_msgcnt = tot_msgcnt; - -enqueue: - /* - * If the transmit queue is currently empty and we are - * about to deposit the packet(s) there, switch mode to - * "busy" and raise flow-control condition. - */ - if (!dsp->ds_tx_qbusy) { - dsp->ds_tx_qbusy = B_TRUE; - ASSERT(dsp->ds_tx_flow_mp != NULL); - (void) putq(q, dsp->ds_tx_flow_mp); - dsp->ds_tx_flow_mp = NULL; - } - - if (!head_insert) { - /* Tail insertion */ - if (dsp->ds_tx_list_head == NULL) - dsp->ds_tx_list_head = mp; - else - dsp->ds_tx_list_tail->b_next = mp; - dsp->ds_tx_list_tail = tail; - } else { - /* Head insertion */ - tail->b_next = dsp->ds_tx_list_head; - if (dsp->ds_tx_list_head == NULL) - dsp->ds_tx_list_tail = tail; - dsp->ds_tx_list_head = mp; - } - - if (msgcnt == 0 && dsp->ds_tx_qdepth_tid == 0 && - tx_qdepth_interval != 0) { - /* - * The message size is not given so that we need to start - * the timer to calculate the queue depth. 
- */ - dsp->ds_tx_qdepth_tid = timeout(dld_tx_qdepth_timer, dsp, - drv_usectohz(tx_qdepth_interval)); - ASSERT(dsp->ds_tx_qdepth_tid != NULL); - } - mutex_exit(&dsp->ds_tx_list_lock); -done: - /* Schedule service thread to drain the transmit queue */ - if (!head_insert) - qenable(q); -} -void -dld_tx_flush(dld_str_t *dsp) -{ - timeout_id_t tid = 0; - - mutex_enter(&dsp->ds_tx_list_lock); - if (dsp->ds_tx_list_head != NULL) { - freemsgchain(dsp->ds_tx_list_head); - dsp->ds_tx_list_head = dsp->ds_tx_list_tail = NULL; - dsp->ds_tx_cnt = dsp->ds_tx_msgcnt = 0; - if (dsp->ds_tx_qbusy) { - dsp->ds_tx_flow_mp = getq(dsp->ds_wq); - ASSERT(dsp->ds_tx_flow_mp != NULL); - dsp->ds_tx_qbusy = B_FALSE; - } - if ((tid = dsp->ds_tx_qdepth_tid) != 0) - dsp->ds_tx_qdepth_tid = 0; + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&dld_taskq_cv, &dld_taskq_lock); + CALLB_CPR_SAFE_END(&cprinfo, &dld_taskq_lock); } - mutex_exit(&dsp->ds_tx_list_lock); - /* - * Note that ds_tx_list_lock (which is acquired by the timeout - * callback routine) cannot be held across the call to untimeout(). - */ - if (tid != 0) - (void) untimeout(tid); + dld_taskq_done = B_TRUE; + cv_signal(&dld_taskq_cv); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); } /* - * Process a non-data message. + * All control operations are serialized on the 'dsp' and are also funneled + * through a taskq mechanism to ensure that subsequent processing has kernel + * context and can safely use cv_wait. + * + * Mechanisms to handle taskq dispatch failures + * + * The only way to be sure that taskq dispatch does not fail is to either + * specify TQ_SLEEP or to use a static taskq and prepopulate it with + * some number of entries and make sure that the number of outstanding requests + * are less than that number. We can't use TQ_SLEEP since we don't know the + * context. Nor can we bound the total number of 'dsp' end points. So we are + * unable to use either of the above schemes, and are forced to deal with + * taskq dispatch failures. Note that even dynamic taskq could fail in + * dispatch if TQ_NOSLEEP is specified, since this flag is translated + * eventually to KM_NOSLEEP and kmem allocations could fail in the taskq + * framework. + * + * We maintain a queue of 'dsp's that encountered taskq dispatch failure. + * We also have a single global thread to retry the taskq dispatch. This + * thread loops in 'dld_taskq_dispatch' and retries the taskq dispatch, but + * uses TQ_SLEEP to ensure eventual success of the dispatch operation. */ static void dld_wput_nondata(dld_str_t *dsp, mblk_t *mp) { - ASSERT((dsp->ds_type == DLD_DLPI && dsp->ds_ioctl == NULL) || - (dsp->ds_type == DLD_CONTROL && dsp->ds_ioctl != NULL)); - - mutex_enter(&dsp->ds_disp_lock); - - /* - * The processing of the message might block. Enqueue the - * message for later processing. - */ - if (dsp->ds_pending_head == NULL) { - dsp->ds_pending_head = dsp->ds_pending_tail = mp; - } else { + ASSERT(mp->b_next == NULL); + mutex_enter(&dsp->ds_lock); + if (dsp->ds_pending_head != NULL) { + ASSERT(dsp->ds_dlpi_pending); dsp->ds_pending_tail->b_next = mp; dsp->ds_pending_tail = mp; + mutex_exit(&dsp->ds_lock); + return; } - + ASSERT(dsp->ds_pending_tail == NULL); + dsp->ds_pending_head = dsp->ds_pending_tail = mp; /* - * If there is no task pending, kick off the task. + * At this point if ds_dlpi_pending is set, it implies that the taskq + * thread is still active and is processing the last message, though + * the pending queue has been emptied. 
*/ - if (dsp->ds_tid == NULL) { - dsp->ds_tid = taskq_dispatch(dld_disp_taskq, - dld_wput_nondata_task, dsp, TQ_SLEEP); - ASSERT(dsp->ds_tid != NULL); + if (dsp->ds_dlpi_pending) { + mutex_exit(&dsp->ds_lock); + return; } - mutex_exit(&dsp->ds_disp_lock); + + dsp->ds_dlpi_pending = 1; + mutex_exit(&dsp->ds_lock); + + if (taskq_dispatch(dld_taskq, dld_wput_nondata_task, dsp, + TQ_NOSLEEP) != 0) + return; + + mutex_enter(&dld_taskq_lock); + list_insert_tail(&dld_taskq_list, dsp); + cv_signal(&dld_taskq_cv); + mutex_exit(&dld_taskq_lock); } /* - * The worker thread which processes non-data messages. Note we only process - * one message at one time in order to be able to "flush" the queued message - * and serialize the processing. + * Process an M_IOCTL message. */ static void -dld_wput_nondata_task(void *arg) +dld_ioc(dld_str_t *dsp, mblk_t *mp) { - dld_str_t *dsp = (dld_str_t *)arg; - mblk_t *mp; - - mutex_enter(&dsp->ds_disp_lock); - ASSERT(dsp->ds_pending_head != NULL); - ASSERT(dsp->ds_tid != NULL); - - if (dsp->ds_closing) - goto closing; - - mp = dsp->ds_pending_head; - if ((dsp->ds_pending_head = mp->b_next) == NULL) - dsp->ds_pending_tail = NULL; - mp->b_next = NULL; + uint_t cmd; - mutex_exit(&dsp->ds_disp_lock); + cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; + ASSERT(dsp->ds_type == DLD_DLPI); - switch (DB_TYPE(mp)) { - case M_PROTO: - case M_PCPROTO: - ASSERT(dsp->ds_type == DLD_DLPI); - dld_wput_proto_nondata(dsp, mp); + switch (cmd) { + case DLIOCNATIVE: + ioc_native(dsp, mp); break; - case M_IOCTL: { - uint_t cmd; - - if (dsp->ds_type == DLD_CONTROL) { - ASSERT(dsp->ds_ioctl != NULL); - dsp->ds_ioctl(dsp->ds_wq, mp); - break; - } - - cmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; - - switch (cmd) { - case DLIOCNATIVE: - ioc_native(dsp, mp); - break; - case DLIOCMARGININFO: - ioc_margin(dsp, mp); - break; - case DLIOCRAW: - ioc_raw(dsp, mp); - break; - case DLIOCHDRINFO: - ioc_fast(dsp, mp); - break; - default: - ioc(dsp, mp); - break; - } + case DLIOCMARGININFO: + ioc_margin(dsp, mp); break; - } - case M_IOCDATA: - ASSERT(dsp->ds_type == DLD_DLPI); - ioc(dsp, mp); + case DLIOCRAW: + ioc_raw(dsp, mp); break; + case DLIOCHDRINFO: + ioc_fast(dsp, mp); + break; + default: + ioc(dsp, mp); } - - mutex_enter(&dsp->ds_disp_lock); - - if (dsp->ds_closing) - goto closing; - - if (dsp->ds_pending_head != NULL) { - dsp->ds_tid = taskq_dispatch(dld_disp_taskq, - dld_wput_nondata_task, dsp, TQ_SLEEP); - ASSERT(dsp->ds_tid != NULL); - } else { - dsp->ds_tid = NULL; - } - mutex_exit(&dsp->ds_disp_lock); - return; - - /* - * If the stream is closing, flush all queued messages and inform - * the stream once it is done. - */ -closing: - freemsgchain(dsp->ds_pending_head); - dsp->ds_pending_head = dsp->ds_pending_tail = NULL; - dsp->ds_tid = NULL; - cv_signal(&dsp->ds_disp_cv); - mutex_exit(&dsp->ds_disp_lock); -} - -/* - * Flush queued non-data messages. - */ -static void -dld_flush_nondata(dld_str_t *dsp) -{ - mutex_enter(&dsp->ds_disp_lock); - freemsgchain(dsp->ds_pending_head); - dsp->ds_pending_head = dsp->ds_pending_tail = NULL; - mutex_exit(&dsp->ds_disp_lock); } /* @@ -2236,8 +1910,6 @@ ioc_native(dld_str_t *dsp, mblk_t *mp) queue_t *q = dsp->ds_wq; const mac_info_t *mip = dsp->ds_mip; - rw_enter(&dsp->ds_lock, RW_WRITER); - /* * Native mode can be enabled if it's disabled and if the * native media type is different. 
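 *
 * (Hypothetical usage, not from this change: a consumer along the lines
 * of libdlpi would issue this as a plain ioctl,
 *
 *	int media = ioctl(fd, DLIOCNATIVE, 0);
 *
 * where a positive return value is the native media type supplied by
 * the miocack() below, and a failure with EINVAL means native mode is
 * unavailable.)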
@@ -2245,8 +1917,6 @@ ioc_native(dld_str_t *dsp, mblk_t *mp) if (!dsp->ds_native && mip->mi_media != mip->mi_nativemedia) dsp->ds_native = B_TRUE; - rw_exit(&dsp->ds_lock); - if (dsp->ds_native) miocack(q, mp, 0, mip->mi_nativemedia); else @@ -2286,22 +1956,34 @@ static void ioc_raw(dld_str_t *dsp, mblk_t *mp) { queue_t *q = dsp->ds_wq; + mac_perim_handle_t mph; + + if (dsp->ds_mh == NULL) { + dsp->ds_mode = DLD_RAW; + miocack(q, mp, 0, 0); + return; + } - if (dsp->ds_polling || dsp->ds_soft_ring) { + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + if (dsp->ds_polling || dsp->ds_direct) { + mac_perim_exit(mph); miocnak(q, mp, 0, EPROTO); return; } - rw_enter(&dsp->ds_lock, RW_WRITER); - if ((dsp->ds_mode != DLD_RAW) && (dsp->ds_dlstate == DL_IDLE)) { + if (dsp->ds_mode != DLD_RAW && dsp->ds_dlstate == DL_IDLE) { /* * Set the receive callback. */ - dls_rx_set(dsp->ds_dc, dld_str_rx_raw, dsp); - dsp->ds_tx = str_mdata_raw_put; + dls_rx_set(dsp, dld_str_rx_raw, dsp); } + + /* + * Note that raw mode is enabled. + */ dsp->ds_mode = DLD_RAW; - rw_exit(&dsp->ds_lock); + mac_perim_exit(mph); + miocack(q, mp, 0, 0); } @@ -2321,6 +2003,7 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp) uint_t addr_length; queue_t *q = dsp->ds_wq; int err; + mac_perim_handle_t mph; if (dld_opt & DLD_OPT_NO_FASTPATH) { err = ENOTSUP; @@ -2352,11 +2035,6 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp) goto failed; } - /* - * We don't need to hold any locks to access ds_dlstate, because - * control message prossessing (which updates this field) is - * serialized. - */ if (dsp->ds_dlstate != DL_IDLE) { err = ENOTSUP; goto failed; @@ -2371,24 +2049,31 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp) addr = nmp->b_rptr + off; sap = *(uint16_t *)(nmp->b_rptr + off + addr_length); - if ((hmp = dls_header(dsp->ds_dc, addr, sap, 0, NULL)) == NULL) { + if ((hmp = dls_header(dsp, addr, sap, 0, NULL)) == NULL) { err = ENOMEM; goto failed; } - rw_enter(&dsp->ds_lock, RW_WRITER); - ASSERT(dsp->ds_dlstate == DL_IDLE); + /* + * This ioctl might happen concurrently with a direct call to dld_capab + * that tries to enable direct and/or poll capabilities. Since the + * stack does not serialize them, we do so here to avoid mixing + * the callbacks. + */ + mac_perim_enter_by_mh(dsp->ds_mh, &mph); if (dsp->ds_mode != DLD_FASTPATH) { /* - * Set the receive callback (unless polling or - * soft-ring is enabled). + * Set the receive callback (unless polling is enabled). + */ + if (!dsp->ds_polling && !dsp->ds_direct) + dls_rx_set(dsp, dld_str_rx_fastpath, dsp); + + /* + * Note that fast-path mode is enabled. */ dsp->ds_mode = DLD_FASTPATH; - if (!dsp->ds_polling && !dsp->ds_soft_ring) - dls_rx_set(dsp->ds_dc, dld_str_rx_fastpath, dsp); - dsp->ds_tx = str_mdata_fastpath_put; } - rw_exit(&dsp->ds_lock); + mac_perim_exit(mph); freemsg(nmp->b_cont); nmp->b_cont = hmp; @@ -2399,17 +2084,17 @@ failed: miocnak(q, mp, 0, err); } +/* + * Catch-all handler. + */ static void ioc(dld_str_t *dsp, mblk_t *mp) { queue_t *q = dsp->ds_wq; - mac_handle_t mh; if (dsp->ds_dlstate == DL_UNATTACHED) { miocnak(q, mp, 0, EINVAL); return; } - mh = dsp->ds_mh; - ASSERT(mh != NULL); - mac_ioctl(mh, q, mp); + mac_ioctl(dsp->ds_mh, q, mp); } diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index 2002e994bf..064217c8f2 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -23,583 +23,285 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Services Module */ -#include <sys/types.h> -#include <sys/stream.h> #include <sys/strsun.h> -#include <sys/sysmacros.h> -#include <sys/atomic.h> -#include <sys/stat.h> -#include <sys/dlpi.h> #include <sys/vlan.h> -#include <sys/ethernet.h> -#include <sys/byteorder.h> -#include <sys/mac.h> - -#include <sys/dls.h> -#include <sys/dls_impl.h> -#include <sys/dls_soft_ring.h> - -static kmem_cache_t *i_dls_impl_cachep; -static uint32_t i_dls_impl_count; - -static kstat_t *dls_ksp = (kstat_t *)NULL; -struct dls_kstats dls_kstat = -{ - { "soft_ring_pkt_drop", KSTAT_DATA_UINT32 }, -}; - -static int dls_open(dls_vlan_t *, dls_dl_handle_t ddh, dls_channel_t *); - -/* - * Private functions. - */ - -/*ARGSUSED*/ -static int -i_dls_constructor(void *buf, void *arg, int kmflag) -{ - dls_impl_t *dip = buf; - - bzero(buf, sizeof (dls_impl_t)); - - rw_init(&(dip->di_lock), NULL, RW_DRIVER, NULL); - return (0); -} - -/*ARGSUSED*/ -static void -i_dls_destructor(void *buf, void *arg) -{ - dls_impl_t *dip = buf; - - ASSERT(dip->di_dvp == NULL); - ASSERT(dip->di_mnh == NULL); - ASSERT(dip->di_dmap == NULL); - ASSERT(!dip->di_local); - ASSERT(!dip->di_bound); - ASSERT(dip->di_rx == NULL); - ASSERT(dip->di_txinfo == NULL); - - rw_destroy(&(dip->di_lock)); -} - -static void -i_dls_notify(void *arg, mac_notify_type_t type) -{ - dls_impl_t *dip = arg; - - switch (type) { - case MAC_NOTE_UNICST: - mac_unicst_get(dip->di_mh, dip->di_unicst_addr); - break; - - case MAC_NOTE_PROMISC: - case MAC_NOTE_VNIC: - /* - * Every time the MAC interface changes promiscuity or - * the VNIC characteristics change we need to reset - * our transmit information. - */ - dip->di_txinfo = mac_tx_get(dip->di_mh); - break; - } -} - -static void -dls_stat_init() -{ - if ((dls_ksp = kstat_create("dls", 0, "dls_stat", - "net", KSTAT_TYPE_NAMED, - sizeof (dls_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL)) == NULL) { - cmn_err(CE_WARN, - "DLS: failed to create kstat structure for dls stats"); - return; - } - dls_ksp->ks_data = (void *)&dls_kstat; - kstat_install(dls_ksp); -} - -static void -dls_stat_destroy() -{ - kstat_delete(dls_ksp); -} - -/* - * Module initialization functions. - */ - -void -dls_init(void) -{ - /* - * Create a kmem_cache of dls_impl_t. - */ - i_dls_impl_cachep = kmem_cache_create("dls_cache", - sizeof (dls_impl_t), 0, i_dls_constructor, i_dls_destructor, NULL, - NULL, NULL, 0); - ASSERT(i_dls_impl_cachep != NULL); - soft_ring_init(); - dls_stat_init(); -} +#include <sys/dld_impl.h> int -dls_fini(void) +dls_open(dls_link_t *dlp, dls_dl_handle_t ddh, dld_str_t *dsp) { - /* - * If there are any dls_impl_t in use then return EBUSY. - */ - if (i_dls_impl_count != 0) - return (EBUSY); - - /* - * Destroy the kmem_cache. - */ - kmem_cache_destroy(i_dls_impl_cachep); - dls_stat_destroy(); - return (0); -} - -/* - * Client functions. - */ - -/* - * /dev node style-2 VLAN PPA access. This might result in a newly created - * dls_vlan_t. Note that this dls_vlan_t is different from others, in that - * this VLAN might not have a link name that is managed by dlmgmtd (we cannot - * use its VLAN ppa hack name as it might conflict with a vanity name). - */ -int -dls_open_style2_vlan(major_t major, uint_t ppa, dls_channel_t *dcp) -{ - dev_t dev = makedevice(major, DLS_PPA2INST(ppa) + 1); - uint_t vid = DLS_PPA2VID(ppa); - dls_vlan_t *lndvp, *dvp; - int err; - - /* - * First find the dls_vlan_t this VLAN is created on. This must be - * a GLDv3 driver based device. 
- */ - if ((err = dls_vlan_hold_by_dev(dev, &lndvp)) != 0) - return (err); - - if (vid > VLAN_ID_MAX) - return (ENOENT); - - err = dls_vlan_hold(lndvp->dv_dlp->dl_name, vid, &dvp, B_FALSE, B_TRUE); - if (err != 0) - goto done; - - if ((err = dls_open(dvp, NULL, dcp)) != 0) - dls_vlan_rele(dvp); - -done: - dls_vlan_rele(lndvp); - return (err); -} - -int -dls_open_by_dev(dev_t dev, dls_channel_t *dcp) -{ - dls_dl_handle_t ddh; - dls_vlan_t *dvp; - int err; - - /* - * Get a reference to the given dls_vlan_t. - */ - if ((err = dls_devnet_open_by_dev(dev, &dvp, &ddh)) != 0) - return (err); - - if ((err = dls_open(dvp, ddh, dcp)) != 0) { - if (ddh != NULL) - dls_devnet_close(ddh); - else - dls_vlan_rele(dvp); - } - - return (err); -} - -static int -dls_open(dls_vlan_t *dvp, dls_dl_handle_t ddh, dls_channel_t *dcp) -{ - dls_impl_t *dip; - dls_link_t *dlp; - int err; zoneid_t zid = getzoneid(); boolean_t local; /* - * Check whether this client belongs to the zone of this dvp. Note that - * a global zone client is allowed to open a local zone dvp. + * Check whether this client belongs to the zone of this dlp. Note that + * a global zone client is allowed to open a local zone dlp. */ - mutex_enter(&dvp->dv_lock); - if (zid != GLOBAL_ZONEID && dvp->dv_zid != zid) { - mutex_exit(&dvp->dv_lock); + if (zid != GLOBAL_ZONEID && dlp->dl_zid != zid) return (ENOENT); - } - local = (zid == dvp->dv_zid); - dvp->dv_zone_ref += (local ? 1 : 0); - mutex_exit(&dvp->dv_lock); - - dlp = dvp->dv_dlp; - if ((err = mac_start(dlp->dl_mh)) != 0) { - mutex_enter(&dvp->dv_lock); - dvp->dv_zone_ref -= (local ? 1 : 0); - mutex_exit(&dvp->dv_lock); - return (err); - } - /* - * Allocate a new dls_impl_t. - */ - dip = kmem_cache_alloc(i_dls_impl_cachep, KM_SLEEP); - dip->di_dvp = dvp; - dip->di_ddh = ddh; + local = (zid == dlp->dl_zid); + dlp->dl_zone_ref += (local ? 1 : 0); /* * Cache a copy of the MAC interface handle, a pointer to the - * immutable MAC info and a copy of the current MAC address. + * immutable MAC info. */ - dip->di_mh = dlp->dl_mh; - dip->di_mip = dlp->dl_mip; + dsp->ds_dlp = dlp; + dsp->ds_mh = dlp->dl_mh; + dsp->ds_mch = dlp->dl_mch; + dsp->ds_mip = dlp->dl_mip; + dsp->ds_ddh = ddh; + dsp->ds_local = local; - mac_unicst_get(dip->di_mh, dip->di_unicst_addr); - - /* - * Set the MAC transmit information. - */ - dip->di_txinfo = mac_tx_get(dip->di_mh); - - /* - * Add a notification function so that we get updates from - * the MAC. - */ - dip->di_mnh = mac_notify_add(dip->di_mh, i_dls_notify, - (void *)dip); - - /* - * Bump the kmem_cache count to make sure it is not prematurely - * destroyed. - */ - atomic_add_32(&i_dls_impl_count, 1); - - dip->di_local = local; - - /* - * Hand back a reference to the dls_impl_t. - */ - *dcp = (dls_channel_t)dip; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); return (0); } void -dls_close(dls_channel_t dc) +dls_close(dld_str_t *dsp) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_vlan_t *dvp = dip->di_dvp; - dls_link_t *dlp = dvp->dv_dlp; + dls_link_t *dlp = dsp->ds_dlp; dls_multicst_addr_t *p; dls_multicst_addr_t *nextp; - dls_dl_handle_t ddh = dip->di_ddh; + uint32_t old_flags; - if (dip->di_local) { - mutex_enter(&dvp->dv_lock); - dvp->dv_zone_ref--; - mutex_exit(&dvp->dv_lock); - } - dip->di_local = B_FALSE; + ASSERT(dsp->ds_datathr_cnt == 0); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); - dls_active_clear(dc); + if (dsp->ds_local) + dlp->dl_zone_ref--; + dsp->ds_local = B_FALSE; - rw_enter(&(dip->di_lock), RW_WRITER); /* - * Remove the notify function. 
+ * Walk the list of multicast addresses, disabling each at the MAC. + * Note that we must remove multicast address before + * mac_unicast_remove() (called by dls_active_clear()) because + * mac_multicast_remove() relies on the unicast flows on the mac + * client. */ - mac_notify_remove(dip->di_mh, dip->di_mnh); - dip->di_mnh = NULL; - - /* - * If the dls_impl_t is bound then unbind it. - */ - if (dip->di_bound) { - rw_exit(&(dip->di_lock)); - dls_link_remove(dlp, dip); - rw_enter(&(dip->di_lock), RW_WRITER); - dip->di_bound = B_FALSE; - } - - /* - * Walk the list of multicast addresses, disabling each at - * the MAC. - */ - for (p = dip->di_dmap; p != NULL; p = nextp) { - (void) mac_multicst_remove(dip->di_mh, p->dma_addr); + for (p = dsp->ds_dmap; p != NULL; p = nextp) { + (void) mac_multicast_remove(dsp->ds_mch, p->dma_addr); nextp = p->dma_nextp; kmem_free(p, sizeof (dls_multicst_addr_t)); } - dip->di_dmap = NULL; + dsp->ds_dmap = NULL; - dip->di_rx = NULL; - dip->di_rx_arg = NULL; - rw_exit(&(dip->di_lock)); + dls_active_clear(dsp); /* - * If the MAC has been set in promiscuous mode then disable it. + * If the dld_str_t is bound then unbind it. */ - (void) dls_promisc(dc, 0); - dip->di_txinfo = NULL; + if (dsp->ds_dlstate == DL_IDLE) { + (void) dls_unbind(dsp); + dsp->ds_dlstate = DL_UNBOUND; + } /* - * Free the dls_impl_t back to the cache. + * If the MAC has been set in promiscuous mode then disable it. + * This needs to be done before resetting ds_rx. */ - dip->di_txinfo = NULL; - - if (dip->di_soft_ring_list != NULL) { - soft_ring_set_destroy(dip->di_soft_ring_list, - dip->di_soft_ring_size); - dip->di_soft_ring_list = NULL; - } - dip->di_soft_ring_size = 0; + old_flags = dsp->ds_promisc; + dsp->ds_promisc = 0; + (void) dls_promisc(dsp, old_flags); /* - * Decrement the reference count to allow the cache to be destroyed - * if there are no more dls_impl_t. + * At this point we have cutoff inbound packet flow from the mac + * for this 'dsp'. The dls_link_remove above cut off packets meant + * for us and waited for upcalls to finish. Similarly the dls_promisc + * reset above waited for promisc callbacks to finish. Now we can + * safely reset ds_rx to NULL */ - atomic_add_32(&i_dls_impl_count, -1); - - dip->di_dvp = NULL; + dsp->ds_rx = NULL; + dsp->ds_rx_arg = NULL; - kmem_cache_free(i_dls_impl_cachep, dip); - - mac_stop(dvp->dv_dlp->dl_mh); + dsp->ds_dlp = NULL; /* - * Release our reference to the dls_vlan_t allowing that to be - * destroyed if there are no more dls_impl_t. An unreferenced tagged - * (non-persistent) vlan gets destroyed automatically. + * Release our reference to the dls_link_t allowing that to be + * destroyed if there are no more dls_impl_t. */ - if (ddh != NULL) - dls_devnet_close(ddh); - else - dls_vlan_rele(dvp); -} - -mac_handle_t -dls_mac(dls_channel_t dc) -{ - return (((dls_impl_t *)dc)->di_mh); -} - -uint16_t -dls_vid(dls_channel_t dc) -{ - return (((dls_impl_t *)dc)->di_dvp->dv_id); + dls_link_rele(dlp); } int -dls_bind(dls_channel_t dc, uint32_t sap) +dls_bind(dld_str_t *dsp, uint32_t sap) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp; uint32_t dls_sap; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + /* * Check to see the value is legal for the media type. */ - if (!mac_sap_verify(dip->di_mh, sap, &dls_sap)) + if (!mac_sap_verify(dsp->ds_mh, sap, &dls_sap)) return (EINVAL); - if (dip->di_promisc & DLS_PROMISC_SAP) + + if (dsp->ds_promisc & DLS_PROMISC_SAP) dls_sap = DLS_SAP_PROMISC; /* - * Set up the dls_impl_t to mark it as able to receive packets. 
+ * Set up the dld_str_t to mark it as able to receive packets. */ - rw_enter(&(dip->di_lock), RW_WRITER); - ASSERT(!dip->di_bound); - dip->di_sap = sap; - dip->di_bound = B_TRUE; - rw_exit(&(dip->di_lock)); + dsp->ds_sap = sap; /* - * Now bind the dls_impl_t by adding it into the hash table in the - * dls_link_t. + * The MAC layer does the VLAN demultiplexing and will only pass up + * untagged packets to non-promiscuous primary MAC clients. In order to + * support the binding to the VLAN SAP which is required by DLPI, dls + * needs to get a copy of all tagged packets when the client binds to + * the VLAN SAP. We do this by registering a separate promiscuous + * callback for each dls client binding to that SAP. * - * NOTE: This must be done without the dls_impl_t lock being held - * otherwise deadlock may ensue. - */ - dlp = dip->di_dvp->dv_dlp; - dls_link_add(dlp, dls_sap, dip); + * Note: even though there are two promiscuous handles in dld_str_t, + * ds_mph is for the regular promiscuous mode, ds_vlan_mph is the handle + * to receive VLAN pkt when promiscuous mode is not on. Only one of + * them can be non-NULL at the same time, to avoid receiving dup copies + * of pkts. + */ + if (sap == ETHERTYPE_VLAN && dsp->ds_promisc == 0) { + int err; + + if (dsp->ds_vlan_mph != NULL) + return (EINVAL); + err = mac_promisc_add(dsp->ds_mch, + MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp, + &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS); + return (err); + } + /* + * Now bind the dld_str_t by adding it into the hash table in the + * dls_link_t. + */ + dls_link_add(dsp->ds_dlp, dls_sap, dsp); return (0); } -void -dls_unbind(dls_channel_t dc) +int +dls_unbind(dld_str_t *dsp) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); /* - * Unbind the dls_impl_t by removing it from the hash table in the - * dls_link_t. - * - * NOTE: This must be done without the dls_impl_t lock being held - * otherise deadlock may enuse. + * For VLAN SAP, there was a promisc handle registered when dls_bind. + * When unbind this dls link, we need to remove the promisc handle. + * See comments in dls_bind(). */ - dlp = dip->di_dvp->dv_dlp; - dls_link_remove(dlp, dip); + if (dsp->ds_vlan_mph != NULL) { + int err; + + err = mac_promisc_remove(dsp->ds_vlan_mph); + ASSERT(err == 0); + dsp->ds_vlan_mph = NULL; + return (err); + } /* - * Mark the dls_impl_t as unable to receive packets This will make - * sure that 'receives in flight' will not come our way. + * Unbind the dld_str_t by removing it from the hash table in the + * dls_link_t. */ - dip->di_bound = B_FALSE; + dls_link_remove(dsp->ds_dlp, dsp); + dsp->ds_sap = 0; + return (0); } int -dls_promisc(dls_channel_t dc, uint32_t flags) +dls_promisc(dld_str_t *dsp, uint32_t old_flags) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp; int err = 0; - ASSERT(!(flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI | + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + ASSERT(!(dsp->ds_promisc & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI | DLS_PROMISC_PHYS))); - /* - * Check if we need to turn on 'all sap' mode. - */ - rw_enter(&(dip->di_lock), RW_WRITER); - dlp = dip->di_dvp->dv_dlp; - if ((flags & DLS_PROMISC_SAP) && - !(dip->di_promisc & DLS_PROMISC_SAP)) { - dip->di_promisc |= DLS_PROMISC_SAP; - if (!dip->di_bound) - goto multi; - - rw_exit(&(dip->di_lock)); - dls_link_remove(dlp, dip); - dls_link_add(dlp, DLS_SAP_PROMISC, dip); - rw_enter(&(dip->di_lock), RW_WRITER); - goto multi; - } - - /* - * Check if we need to turn off 'all sap' mode. 
- */ - if (!(flags & DLS_PROMISC_SAP) && - (dip->di_promisc & DLS_PROMISC_SAP)) { - uint32_t dls_sap; - - dip->di_promisc &= ~DLS_PROMISC_SAP; - if (!dip->di_bound) - goto multi; - - rw_exit(&(dip->di_lock)); - dls_link_remove(dlp, dip); - (void) mac_sap_verify(dip->di_mh, dip->di_sap, &dls_sap); - dls_link_add(dlp, dls_sap, dip); - rw_enter(&(dip->di_lock), RW_WRITER); - } - -multi: - /* - * It's easiest to add the txloop callback up-front; if promiscuous - * mode cannot be enabled, then we'll remove it before returning. - * Use dl_promisc_lock to prevent racing with another thread also - * manipulating the promiscuous state on another dls_impl_t associated - * with the same dls_link_t. - */ - mutex_enter(&dlp->dl_promisc_lock); - if ((dlp->dl_npromisc == 0) && (flags & DLS_PROMISC_PHYS)) { - ASSERT(dlp->dl_mth == NULL); - dlp->dl_mth = mac_txloop_add(dlp->dl_mh, dls_link_txloop, dlp); - } - - /* - * Turn on or off 'all multicast' mode, if necessary. - */ - if (flags & DLS_PROMISC_MULTI) { - if (!(dip->di_promisc & DLS_PROMISC_MULTI)) { - if ((err = mac_promisc_set(dip->di_mh, B_TRUE, - MAC_DEVPROMISC)) != 0) { - goto done; - } - dip->di_promisc |= DLS_PROMISC_MULTI; - } - } else { - if (dip->di_promisc & DLS_PROMISC_MULTI) { - if ((err = mac_promisc_set(dip->di_mh, B_FALSE, - MAC_DEVPROMISC)) != 0) { - goto done; - } - dip->di_promisc &= ~DLS_PROMISC_MULTI; - } - } - - /* - * Turn on or off 'all physical' mode, if necessary. - */ - if (flags & DLS_PROMISC_PHYS) { - if (!(dip->di_promisc & DLS_PROMISC_PHYS)) { - err = mac_promisc_set(dip->di_mh, B_TRUE, MAC_PROMISC); - if (err != 0) - goto done; - - dip->di_promisc |= DLS_PROMISC_PHYS; - dlp->dl_npromisc++; + if (old_flags == 0 && dsp->ds_promisc != 0) { + /* + * If only DLS_PROMISC_SAP, we don't turn on the + * physical promisc mode + */ + err = mac_promisc_add(dsp->ds_mch, MAC_CLIENT_PROMISC_ALL, + dls_rx_promisc, dsp, &dsp->ds_mph, + (dsp->ds_promisc != DLS_PROMISC_SAP) ? 0 : + MAC_PROMISC_FLAGS_NO_PHYS); + if (err != 0) + return (err); + + /* Remove vlan promisc handle to avoid sending dup copy up */ + if (dsp->ds_vlan_mph != NULL) { + err = mac_promisc_remove(dsp->ds_vlan_mph); + dsp->ds_vlan_mph = NULL; } - } else { - if (dip->di_promisc & DLS_PROMISC_PHYS) { - err = mac_promisc_set(dip->di_mh, B_FALSE, MAC_PROMISC); - if (err != 0) - goto done; - - dip->di_promisc &= ~DLS_PROMISC_PHYS; - dlp->dl_npromisc--; + } else if (old_flags != 0 && dsp->ds_promisc == 0) { + ASSERT(dsp->ds_mph != NULL); + err = mac_promisc_remove(dsp->ds_mph); + /* + * The failure only relates to resetting the device promiscuity + * The mac layer does not fail in freeing up the promiscuous + * data structures, and so we clear the ds_mph. The dld stream + * may be closing and we can't fail that. + */ + dsp->ds_mph = NULL; + if (err != 0) + return (err); + + if (dsp->ds_sap == ETHERTYPE_VLAN && + dsp->ds_dlstate != DL_UNBOUND) { + int err; + + if (dsp->ds_vlan_mph != NULL) + return (EINVAL); + err = mac_promisc_add(dsp->ds_mch, + MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp, + &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS); + return (err); } + } else if (old_flags == DLS_PROMISC_SAP && dsp->ds_promisc != 0 && + dsp->ds_promisc != old_flags) { + /* + * If the old flag is PROMISC_SAP, but the current flag has + * changed to some new non-zero value, we need to turn the + * physical promiscuous mode. 
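+		 * To summarize the three transitions this function handles
+		 * (derived from the branches above and below):
+		 *
+		 *	0       -> nonzero   mac_promisc_add(), passing
+		 *	                     MAC_PROMISC_FLAGS_NO_PHYS when
+		 *	                     only DLS_PROMISC_SAP is set
+		 *	nonzero -> 0         mac_promisc_remove(), re-adding
+		 *	                     the VLAN promisc handle if still
+		 *	                     bound to ETHERTYPE_VLAN
+		 *	SAP     -> other     remove, then re-add without
+		 *	                     MAC_PROMISC_FLAGS_NO_PHYS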
+ */ + ASSERT(dsp->ds_mph != NULL); + err = mac_promisc_remove(dsp->ds_mph); + if (err != 0) + return (err); + err = mac_promisc_add(dsp->ds_mch, MAC_CLIENT_PROMISC_ALL, + dls_rx_promisc, dsp, &dsp->ds_mph, 0); } -done: - if (dlp->dl_npromisc == 0 && dlp->dl_mth != NULL) { - mac_txloop_remove(dlp->dl_mh, dlp->dl_mth); - dlp->dl_mth = NULL; - } - - ASSERT(dlp->dl_npromisc == 0 || dlp->dl_mth != NULL); - mutex_exit(&dlp->dl_promisc_lock); - - rw_exit(&(dip->di_lock)); return (err); } int -dls_multicst_add(dls_channel_t dc, const uint8_t *addr) +dls_multicst_add(dld_str_t *dsp, const uint8_t *addr) { - dls_impl_t *dip = (dls_impl_t *)dc; int err; dls_multicst_addr_t **pp; dls_multicst_addr_t *p; uint_t addr_length; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + /* * Check whether the address is in the list of enabled addresses for - * this dls_impl_t. + * this dld_str_t. + */ + addr_length = dsp->ds_mip->mi_addr_length; + + /* + * Protect against concurrent access of ds_dmap by data threads using + * ds_rw_lock. The mac perimeter serializes the dls_multicst_add and + * remove operations. Dropping the ds_rw_lock across mac calls is thus + * ok and is also required by the locking protocol. */ - rw_enter(&(dip->di_lock), RW_WRITER); - addr_length = dip->di_mip->mi_addr_length; - for (pp = &(dip->di_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) { + rw_enter(&dsp->ds_rw_lock, RW_WRITER); + for (pp = &(dsp->ds_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) { if (bcmp(addr, p->dma_addr, addr_length) == 0) { /* * It is there so there's nothing to do. @@ -610,92 +312,92 @@ dls_multicst_add(dls_channel_t dc, const uint8_t *addr) } /* - * Allocate a new list item. + * Allocate a new list item and add it to the list. */ - if ((p = kmem_zalloc(sizeof (dls_multicst_addr_t), - KM_NOSLEEP)) == NULL) { - err = ENOMEM; - goto done; - } + p = kmem_zalloc(sizeof (dls_multicst_addr_t), KM_SLEEP); + bcopy(addr, p->dma_addr, addr_length); + *pp = p; + rw_exit(&dsp->ds_rw_lock); /* * Enable the address at the MAC. */ - if ((err = mac_multicst_add(dip->di_mh, addr)) != 0) { - kmem_free(p, sizeof (dls_multicst_addr_t)); - goto done; - } - - /* - * The address is now enabled at the MAC so add it to the list. - */ - bcopy(addr, p->dma_addr, addr_length); - *pp = p; + err = mac_multicast_add(dsp->ds_mch, addr); + if (err == 0) + return (0); + /* Undo the operation as it has failed */ + rw_enter(&dsp->ds_rw_lock, RW_WRITER); + ASSERT(*pp == p && p->dma_nextp == NULL); + *pp = NULL; + kmem_free(p, sizeof (dls_multicst_addr_t)); done: - rw_exit(&(dip->di_lock)); + rw_exit(&dsp->ds_rw_lock); return (err); } int -dls_multicst_remove(dls_channel_t dc, const uint8_t *addr) +dls_multicst_remove(dld_str_t *dsp, const uint8_t *addr) { - dls_impl_t *dip = (dls_impl_t *)dc; - int err; dls_multicst_addr_t **pp; dls_multicst_addr_t *p; uint_t addr_length; + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + /* * Find the address in the list of enabled addresses for this - * dls_impl_t. + * dld_str_t. */ - rw_enter(&(dip->di_lock), RW_WRITER); - addr_length = dip->di_mip->mi_addr_length; - for (pp = &(dip->di_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) { + addr_length = dsp->ds_mip->mi_addr_length; + + /* + * Protect against concurrent access to ds_dmap by data threads using + * ds_rw_lock. The mac perimeter serializes the dls_multicst_add and + * remove operations. Dropping the ds_rw_lock across mac calls is thus + * ok and is also required by the locking protocol. 
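+	 * The resulting order of operations: unlink the entry while holding
+	 * ds_rw_lock, drop the lock, then call into the mac layer, roughly:
+	 *
+	 *	rw_enter(&dsp->ds_rw_lock, RW_WRITER);
+	 *	*pp = p->dma_nextp;		(unlink)
+	 *	rw_exit(&dsp->ds_rw_lock);
+	 *	mac_multicast_remove(dsp->ds_mch, addr);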
+ */ + rw_enter(&dsp->ds_rw_lock, RW_WRITER); + for (pp = &(dsp->ds_dmap); (p = *pp) != NULL; pp = &(p->dma_nextp)) { if (bcmp(addr, p->dma_addr, addr_length) == 0) break; } /* * If we walked to the end of the list then the given address is - * not currently enabled for this dls_impl_t. + * not currently enabled for this dld_str_t. */ if (p == NULL) { - err = ENOENT; - goto done; + rw_exit(&dsp->ds_rw_lock); + return (ENOENT); } /* - * Disable the address at the MAC. + * Remove the address from the list. */ - if ((err = mac_multicst_remove(dip->di_mh, addr)) != 0) - goto done; + *pp = p->dma_nextp; + rw_exit(&dsp->ds_rw_lock); /* - * Remove the address from the list. + * Disable the address at the MAC. */ - *pp = p->dma_nextp; + mac_multicast_remove(dsp->ds_mch, addr); kmem_free(p, sizeof (dls_multicst_addr_t)); - -done: - rw_exit(&(dip->di_lock)); - return (err); + return (0); } mblk_t * -dls_header(dls_channel_t dc, const uint8_t *addr, uint16_t sap, uint_t pri, +dls_header(dld_str_t *dsp, const uint8_t *addr, uint16_t sap, uint_t pri, mblk_t **payloadp) { - dls_impl_t *dip = (dls_impl_t *)dc; uint16_t vid; size_t extra_len; uint16_t mac_sap; mblk_t *mp, *payload; - boolean_t is_ethernet = (dip->di_mip->mi_media == DL_ETHER); + boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); struct ether_vlan_header *evhp; - vid = dip->di_dvp->dv_id; + vid = mac_client_vid(dsp->ds_mch); payload = (payloadp == NULL) ? NULL : (*payloadp); /* @@ -719,7 +421,7 @@ dls_header(dls_channel_t dc, const uint8_t *addr, uint16_t sap, uint_t pri, mac_sap = sap; } - mp = mac_header(dip->di_mh, addr, mac_sap, payload, extra_len); + mp = mac_header(dsp->ds_mh, addr, mac_sap, payload, extra_len); if (mp == NULL) return (NULL); @@ -772,209 +474,207 @@ dls_header(dls_channel_t dc, const uint8_t *addr, uint16_t sap, uint_t pri, return (mp); } -int -dls_header_info(dls_channel_t dc, mblk_t *mp, mac_header_info_t *mhip) -{ - return (dls_link_header_info(((dls_impl_t *)dc)->di_dvp->dv_dlp, - mp, mhip)); -} - void -dls_rx_set(dls_channel_t dc, dls_rx_t rx, void *arg) -{ - dls_impl_t *dip = (dls_impl_t *)dc; - - rw_enter(&(dip->di_lock), RW_WRITER); - dip->di_rx = rx; - dip->di_rx_arg = arg; - rw_exit(&(dip->di_lock)); -} - -mblk_t * -dls_tx(dls_channel_t dc, mblk_t *mp) +dls_rx_set(dld_str_t *dsp, dls_rx_t rx, void *arg) { - const mac_txinfo_t *mtp = ((dls_impl_t *)dc)->di_txinfo; - - return (mtp->mt_fn(mtp->mt_arg, mp)); + mutex_enter(&dsp->ds_lock); + dsp->ds_rx = rx; + dsp->ds_rx_arg = arg; + mutex_exit(&dsp->ds_lock); } -boolean_t -dls_accept(dls_impl_t *dip, mac_header_info_t *mhip, dls_rx_t *di_rx, - void **di_rx_arg) +static boolean_t +dls_accept_common(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, + void **ds_rx_arg, boolean_t promisc, boolean_t promisc_loopback) { dls_multicst_addr_t *dmap; - size_t addr_length = dip->di_mip->mi_addr_length; + size_t addr_length = dsp->ds_mip->mi_addr_length; /* - * We must not accept packets if the dls_impl_t is not marked as bound + * We must not accept packets if the dld_str_t is not marked as bound * or is being removed. */ - rw_enter(&(dip->di_lock), RW_READER); - if (!dip->di_bound || dip->di_removing) + if (dsp->ds_dlstate != DL_IDLE) goto refuse; - /* - * If the dls_impl_t is in 'all physical' mode then always accept. - */ - if (dip->di_promisc & DLS_PROMISC_PHYS) - goto accept; + if (dsp->ds_promisc != 0) { + /* + * Filter out packets that arrived from the data path + * (i_dls_link_rx) when promisc mode is on. 
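+		 * The checks below amount to the following, for a stream
+		 * with ds_promisc set:
+		 *
+		 *	!promisc          refuse (arrived via the data path)
+		 *	DLS_PROMISC_PHYS  accept everything
+		 *	loopback          refuse (only 'all physical' mode
+		 *	                  takes back its own transmits)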
+ */ + if (!promisc) + goto refuse; + /* + * If the dls_impl_t is in 'all physical' mode then + * always accept. + */ + if (dsp->ds_promisc & DLS_PROMISC_PHYS) + goto accept; - /* - * For non-promiscs-phys streams, filter out the packets looped back - * from the underlying driver because of promiscuous setting. - */ - if (mhip->mhi_prom_looped) - goto refuse; + /* + * Loopback packets i.e. packets sent out by DLS on a given + * mac end point, will be accepted back by DLS on loopback + * from the mac, only in the 'all physical' mode which has been + * covered by the previous check above + */ + if (promisc_loopback) + goto refuse; + } switch (mhip->mhi_dsttype) { case MAC_ADDRTYPE_UNICAST: + case MAC_ADDRTYPE_BROADCAST: /* - * Check to see if the destination address matches the - * dls_impl_t unicast address. + * We can accept unicast and broadcast packets because + * filtering is already done by the mac layer. */ - if (memcmp(mhip->mhi_daddr, dip->di_unicst_addr, addr_length) == - 0) { - goto accept; - } - break; + goto accept; case MAC_ADDRTYPE_MULTICAST: /* - * Check the address against the list of addresses enabled - * for this dls_impl_t or accept it unconditionally if the - * dls_impl_t is in 'all multicast' mode. + * Additional filtering is needed for multicast addresses + * because different streams may be interested in different + * addresses. */ - if (dip->di_promisc & DLS_PROMISC_MULTI) + if (dsp->ds_promisc & DLS_PROMISC_MULTI) goto accept; - for (dmap = dip->di_dmap; dmap != NULL; + + rw_enter(&dsp->ds_rw_lock, RW_READER); + for (dmap = dsp->ds_dmap; dmap != NULL; dmap = dmap->dma_nextp) { if (memcmp(mhip->mhi_daddr, dmap->dma_addr, addr_length) == 0) { + rw_exit(&dsp->ds_rw_lock); goto accept; } } + rw_exit(&dsp->ds_rw_lock); break; - case MAC_ADDRTYPE_BROADCAST: - /* - * If the address is broadcast then the dls_impl_t will - * always accept it. - */ - goto accept; } refuse: - rw_exit(&(dip->di_lock)); return (B_FALSE); accept: /* - * Since we hold di_lock here, the returned di_rx and di_rx_arg will - * always be in sync. + * the returned ds_rx and ds_rx_arg will always be in sync. */ - *di_rx = dip->di_rx; - *di_rx_arg = dip->di_rx_arg; - rw_exit(&(dip->di_lock)); + mutex_enter(&dsp->ds_lock); + *ds_rx = dsp->ds_rx; + *ds_rx_arg = dsp->ds_rx_arg; + mutex_exit(&dsp->ds_lock); + return (B_TRUE); } /* ARGSUSED */ boolean_t -dls_accept_loopback(dls_impl_t *dip, mac_header_info_t *mhip, dls_rx_t *di_rx, - void **di_rx_arg) +dls_accept(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, + void **ds_rx_arg) { - /* - * We must not accept packets if the dls_impl_t is not marked as bound - * or is being removed. - */ - rw_enter(&(dip->di_lock), RW_READER); - if (!dip->di_bound || dip->di_removing) - goto refuse; - - /* - * A dls_impl_t should only accept loopback packets if it is in - * 'all physical' mode. - */ - if (dip->di_promisc & DLS_PROMISC_PHYS) - goto accept; - -refuse: - rw_exit(&(dip->di_lock)); - return (B_FALSE); - -accept: - /* - * Since we hold di_lock here, the returned di_rx and di_rx_arg will - * always be in sync. 
- */ - *di_rx = dip->di_rx; - *di_rx_arg = dip->di_rx_arg; - rw_exit(&(dip->di_lock)); - return (B_TRUE); + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_FALSE, + B_FALSE)); } boolean_t +dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, + void **ds_rx_arg, boolean_t loopback) +{ + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, + loopback)); +} + +int dls_mac_active_set(dls_link_t *dlp) { - mutex_enter(&dlp->dl_lock); + int err = 0; /* - * If this is the first active client on this link, notify - * the mac that we're becoming an active client. + * First client; add the primary unicast address. */ - if (dlp->dl_nactive == 0 && !mac_active_shareable_set(dlp->dl_mh)) { - mutex_exit(&dlp->dl_lock); - return (B_FALSE); + if (dlp->dl_nactive == 0) { + /* + * First client; add the primary unicast address. + */ + mac_diag_t diag; + + /* request the primary MAC address */ + if ((err = mac_unicast_primary_add(dlp->dl_mch, &dlp->dl_mah, + &diag)) != 0) { + return (err); + } + + /* + * Set the function to start receiving packets. + */ + mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp); + + /* + * We've got a MAC client for this link now. + * Push down the flows that were defined on this link + * hitherto. The flows are added to the active flow table + * and SRS, softrings etc. are created as needed. + */ + mac_link_init_flows(dlp->dl_mch); } dlp->dl_nactive++; - mutex_exit(&dlp->dl_lock); - return (B_TRUE); + return (0); } void dls_mac_active_clear(dls_link_t *dlp) { - mutex_enter(&dlp->dl_lock); - if (--dlp->dl_nactive == 0) - mac_active_clear(dlp->dl_mh); - mutex_exit(&dlp->dl_lock); + if (--dlp->dl_nactive == 0) { + ASSERT(dlp->dl_mah != NULL); + /* + * We would have initialized subflows etc. only if we + * brought up the primary client and set the unicast + * unicast address etc. Deactivate the flows. The flow + * entry will be removed from the active flow tables, + * and the associated SRS, softrings etc will be + * deleted. But the flow entry itself won't be + * destroyed, instead it will continue to be + * archived off the the global flow hash list, for a + * possible future activation when say + * IP is plumbed again + */ + + mac_link_release_flows(dlp->dl_mch); + (void) mac_unicast_remove(dlp->dl_mch, dlp->dl_mah); + dlp->dl_mah = NULL; + mac_rx_clear(dlp->dl_mch); + } } -boolean_t -dls_active_set(dls_channel_t dc) +int +dls_active_set(dld_str_t *dsp) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp = dip->di_dvp->dv_dlp; + int err = 0; - rw_enter(&dip->di_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); /* If we're already active, then there's nothing more to do. 
*/ - if (dip->di_active) { - rw_exit(&dip->di_lock); - return (B_TRUE); - } + if (dsp->ds_active) + return (0); - if (!dls_mac_active_set(dlp)) { - rw_exit(&dip->di_lock); - return (B_FALSE); + if ((err = dls_mac_active_set(dsp->ds_dlp)) != 0) { + /* except for ENXIO all other errors are mapped to EBUSY */ + if (err != ENXIO) + return (EBUSY); + return (err); } - dip->di_active = B_TRUE; - rw_exit(&dip->di_lock); - return (B_TRUE); + + dsp->ds_active = B_TRUE; + return (0); } void -dls_active_clear(dls_channel_t dc) +dls_active_clear(dld_str_t *dsp) { - dls_impl_t *dip = (dls_impl_t *)dc; - dls_link_t *dlp = dip->di_dvp->dv_dlp; - - rw_enter(&dip->di_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); - if (!dip->di_active) - goto out; - dip->di_active = B_FALSE; - - dls_mac_active_clear(dlp); + if (!dsp->ds_active) + return; -out: - rw_exit(&dip->di_lock); + dls_mac_active_clear(dsp->ds_dlp); + dsp->ds_active = B_FALSE; } diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index 759fb97f0a..852b87d24b 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -23,34 +23,21 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Services Module */ -#include <sys/types.h> -#include <sys/stream.h> -#include <sys/strsun.h> -#include <sys/strsubr.h> #include <sys/sysmacros.h> -#include <sys/atomic.h> -#include <sys/modhash.h> -#include <sys/dlpi.h> -#include <sys/ethernet.h> -#include <sys/byteorder.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> #include <sys/vlan.h> -#include <sys/mac.h> -#include <sys/sdt.h> - -#include <sys/dls.h> #include <sys/dld_impl.h> -#include <sys/dls_impl.h> +#include <sys/sdt.h> +#include <sys/atomic.h> static kmem_cache_t *i_dls_link_cachep; static mod_hash_t *i_dls_link_hash; static uint_t i_dls_link_count; -static krwlock_t i_dls_link_lock; #define LINK_HASHSZ 67 /* prime */ #define IMPL_HASHSZ 67 /* prime */ @@ -58,15 +45,8 @@ static krwlock_t i_dls_link_lock; /* * Construct a hash key encompassing both DLSAP value and VLAN idenitifier. */ -#define MAKE_KEY(_sap, _vid) \ - ((mod_hash_key_t)(uintptr_t) \ - (((_sap) << VLAN_ID_SIZE) | (_vid) & VLAN_ID_MASK)) - -/* - * Extract the DLSAP value from the hash key. 
- */ -#define KEY_SAP(_key) \ - (((uint32_t)(uintptr_t)(_key)) >> VLAN_ID_SIZE) +#define MAKE_KEY(_sap) \ + ((mod_hash_key_t)(uintptr_t)((_sap) << VLAN_ID_SIZE)) #define DLS_STRIP_PADDING(pktsize, p) { \ if (pktsize != 0) { \ @@ -91,12 +71,9 @@ i_dls_link_constructor(void *buf, void *arg, int kmflag) bzero(buf, sizeof (dls_link_t)); (void) snprintf(name, MAXNAMELEN, "dls_link_t_%p_hash", buf); - dlp->dl_impl_hash = mod_hash_create_idhash(name, IMPL_HASHSZ, + dlp->dl_str_hash = mod_hash_create_idhash(name, IMPL_HASHSZ, mod_hash_null_valdtor); - mutex_init(&dlp->dl_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&dlp->dl_promisc_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&dlp->dl_impl_lock, NULL, RW_DEFAULT, NULL); return (0); } @@ -108,14 +85,12 @@ i_dls_link_destructor(void *buf, void *arg) ASSERT(dlp->dl_ref == 0); ASSERT(dlp->dl_mh == NULL); + ASSERT(dlp->dl_mah == NULL); ASSERT(dlp->dl_unknowns == 0); - mod_hash_destroy_idhash(dlp->dl_impl_hash); - dlp->dl_impl_hash = NULL; + mod_hash_destroy_idhash(dlp->dl_str_hash); + dlp->dl_str_hash = NULL; - mutex_destroy(&dlp->dl_lock); - mutex_destroy(&dlp->dl_promisc_lock); - rw_destroy(&dlp->dl_impl_lock); } /* @@ -195,8 +170,7 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip, */ if (memcmp(mhip->mhi_daddr, cmhi.mhi_daddr, addr_size) != 0 || memcmp(mhip->mhi_saddr, cmhi.mhi_saddr, addr_size) != 0 || - mhip->mhi_bindsap != cmhi.mhi_bindsap || - mhip->mhi_prom_looped != cmhi.mhi_prom_looped) { + mhip->mhi_bindsap != cmhi.mhi_bindsap) { /* * Note that we don't need to restore the padding. */ @@ -239,16 +213,34 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip, return (mp); } -static void -i_dls_head_hold(dls_head_t *dhp) +/* ARGSUSED */ +static int +i_dls_head_hold(mod_hash_key_t key, mod_hash_val_t val) { - atomic_inc_32(&dhp->dh_ref); + dls_head_t *dhp = (dls_head_t *)val; + + /* + * The lock order is mod_hash's internal lock -> dh_lock as in the + * call to i_dls_link_rx -> mod_hash_find_cb_rval -> i_dls_head_hold + */ + mutex_enter(&dhp->dh_lock); + if (dhp->dh_removing) { + mutex_exit(&dhp->dh_lock); + return (-1); + } + dhp->dh_ref++; + mutex_exit(&dhp->dh_lock); + return (0); } -static void +void i_dls_head_rele(dls_head_t *dhp) { - atomic_dec_32(&dhp->dh_ref); + mutex_enter(&dhp->dh_lock); + dhp->dh_ref--; + if (dhp->dh_ref == 0 && dhp->dh_removing != 0) + cv_broadcast(&dhp->dh_cv); + mutex_exit(&dhp->dh_lock); } static dls_head_t * @@ -276,83 +268,86 @@ i_dls_head_free(dls_head_t *dhp) */ static uint_t i_dls_link_rx_func(dls_link_t *dlp, mac_resource_handle_t mrh, - mac_header_info_t *mhip, mblk_t *mp, uint32_t sap, uint16_t vid, + mac_header_info_t *mhip, mblk_t *mp, uint32_t sap, boolean_t (*acceptfunc)()) { - mod_hash_t *hash = dlp->dl_impl_hash; + mod_hash_t *hash = dlp->dl_str_hash; mod_hash_key_t key; dls_head_t *dhp; - dls_impl_t *dip; + dld_str_t *dsp; mblk_t *nmp; - dls_rx_t di_rx; - void *di_rx_arg; + dls_rx_t ds_rx; + void *ds_rx_arg; uint_t naccepted = 0; + int rval; /* * Construct a hash key from the VLAN identifier and the - * DLSAP that represents dls_impl_t in promiscuous mode. + * DLSAP that represents dld_str_t in promiscuous mode. */ - key = MAKE_KEY(sap, vid); + key = MAKE_KEY(sap); /* - * Search the hash table for dls_impl_t eligible to receive - * a packet chain for this DLSAP/VLAN combination. + * Search the hash table for dld_str_t eligible to receive + * a packet chain for this DLSAP/VLAN combination. 
The mod hash's + * internal lock serializes find/insert/remove from the mod hash list. + * Incrementing the dh_ref (while holding the mod hash lock) ensures + * dls_link_remove will wait for the upcall to finish. */ - rw_enter(&dlp->dl_impl_lock, RW_READER); - if (mod_hash_find(hash, key, (mod_hash_val_t *)&dhp) != 0) { - rw_exit(&dlp->dl_impl_lock); + if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp, + i_dls_head_hold, &rval) != 0 || (rval != 0)) { return (B_FALSE); } - i_dls_head_hold(dhp); - rw_exit(&dlp->dl_impl_lock); /* - * Find dls_impl_t that will accept the sub-chain. + * Find dld_str_t that will accept the sub-chain. */ - for (dip = dhp->dh_list; dip != NULL; dip = dip->di_nextp) { - if (!acceptfunc(dip, mhip, &di_rx, &di_rx_arg)) + for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next) { + if (!acceptfunc(dsp, mhip, &ds_rx, &ds_rx_arg)) continue; /* * We have at least one acceptor. */ - naccepted ++; + naccepted++; /* - * There will normally be at least more dls_impl_t + * There will normally be at least more dld_str_t * (since we've yet to check for non-promiscuous - * dls_impl_t) so dup the sub-chain. + * dld_str_t) so dup the sub-chain. */ if ((nmp = copymsgchain(mp)) != NULL) - di_rx(di_rx_arg, mrh, nmp, mhip); + ds_rx(ds_rx_arg, mrh, nmp, mhip); } /* - * Release the hold on the dls_impl_t chain now that we have + * Release the hold on the dld_str_t chain now that we have * finished walking it. */ i_dls_head_rele(dhp); return (naccepted); } -static void -i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +/* ARGSUSED */ +void +i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { dls_link_t *dlp = arg; - mod_hash_t *hash = dlp->dl_impl_hash; + mod_hash_t *hash = dlp->dl_str_hash; mblk_t *nextp; mac_header_info_t mhi; dls_head_t *dhp; - dls_impl_t *dip; - dls_impl_t *ndip; + dld_str_t *dsp; + dld_str_t *ndsp; mblk_t *nmp; mod_hash_key_t key; uint_t npacket; boolean_t accepted; - dls_rx_t di_rx, ndi_rx; - void *di_rx_arg, *ndi_rx_arg; + dls_rx_t ds_rx, nds_rx; + void *ds_rx_arg, *nds_rx_arg; uint16_t vid; - int err; + int err, rval; /* * Walk the packet chain. @@ -384,11 +379,11 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) if (mhi.mhi_istagged) { /* * If it is tagged traffic, send it upstream to - * all dls_impl_t which are attached to the physical + * all dld_str_t which are attached to the physical * link and bound to SAP 0x8100. */ if (i_dls_link_rx_func(dlp, mrh, &mhi, mp, - ETHERTYPE_VLAN, VLAN_ID_NONE, dls_accept) > 0) { + ETHERTYPE_VLAN, dls_accept) > 0) { accepted = B_TRUE; } @@ -413,33 +408,30 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) * Construct a hash key from the VLAN identifier and the * DLSAP. */ - key = MAKE_KEY(mhi.mhi_bindsap, vid); + key = MAKE_KEY(mhi.mhi_bindsap); /* - * Search the has table for dls_impl_t eligible to receive + * Search the has table for dld_str_t eligible to receive * a packet chain for this DLSAP/VLAN combination. */ - rw_enter(&dlp->dl_impl_lock, RW_READER); - if (mod_hash_find(hash, key, (mod_hash_val_t *)&dhp) != 0) { - rw_exit(&dlp->dl_impl_lock); + if (mod_hash_find_cb_rval(hash, key, (mod_hash_val_t *)&dhp, + i_dls_head_hold, &rval) != 0 || (rval != 0)) { freemsgchain(mp); goto loop; } - i_dls_head_hold(dhp); - rw_exit(&dlp->dl_impl_lock); /* - * Find the first dls_impl_t that will accept the sub-chain. + * Find the first dld_str_t that will accept the sub-chain. 
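+	 * The dispatch below hands the chain to an acceptor without copying
+	 * only when no further acceptor exists, so the common
+	 * single-listener case avoids copymsgchain() entirely. In outline:
+	 *
+	 *	for each accepting dld_str_t:
+	 *		if (no further acceptor)
+	 *			ds_rx(mp);		(hand off, no copy)
+	 *		else
+	 *			ds_rx(copymsgchain(mp));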
*/ - for (dip = dhp->dh_list; dip != NULL; dip = dip->di_nextp) - if (dls_accept(dip, &mhi, &di_rx, &di_rx_arg)) + for (dsp = dhp->dh_list; dsp != NULL; dsp = dsp->ds_next) + if (dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg)) break; /* - * If we did not find any dls_impl_t willing to accept the + * If we did not find any dld_str_t willing to accept the * sub-chain then throw it away. */ - if (dip == NULL) { + if (dsp == NULL) { i_dls_head_rele(dhp); freemsgchain(mp); goto loop; @@ -451,43 +443,43 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) accepted = B_TRUE; for (;;) { /* - * Find the next dls_impl_t that will accept the + * Find the next dld_str_t that will accept the * sub-chain. */ - for (ndip = dip->di_nextp; ndip != NULL; - ndip = ndip->di_nextp) - if (dls_accept(ndip, &mhi, &ndi_rx, - &ndi_rx_arg)) + for (ndsp = dsp->ds_next; ndsp != NULL; + ndsp = ndsp->ds_next) + if (dls_accept(ndsp, &mhi, &nds_rx, + &nds_rx_arg)) break; /* - * If there are no more dls_impl_t that are willing + * If there are no more dld_str_t that are willing * to accept the sub-chain then we don't need to dup * it before handing it to the current one. */ - if (ndip == NULL) { - di_rx(di_rx_arg, mrh, mp, &mhi); + if (ndsp == NULL) { + ds_rx(ds_rx_arg, mrh, mp, &mhi); /* - * Since there are no more dls_impl_t, we're + * Since there are no more dld_str_t, we're * done. */ break; } /* - * There are more dls_impl_t so dup the sub-chain. + * There are more dld_str_t so dup the sub-chain. */ if ((nmp = copymsgchain(mp)) != NULL) - di_rx(di_rx_arg, mrh, nmp, &mhi); + ds_rx(ds_rx_arg, mrh, nmp, &mhi); - dip = ndip; - di_rx = ndi_rx; - di_rx_arg = ndi_rx_arg; + dsp = ndsp; + ds_rx = nds_rx; + ds_rx_arg = nds_rx_arg; } /* - * Release the hold on the dls_impl_t chain now that we have + * Release the hold on the dld_str_t chain now that we have * finished walking it. */ i_dls_head_rele(dhp); @@ -502,220 +494,119 @@ loop: } } -/* - * Try to send mp up to the DLS_SAP_PROMISC listeners. Return B_TRUE if this - * message is sent to any streams. - */ -static uint_t -i_dls_link_rx_common_promisc(dls_link_t *dlp, mac_resource_handle_t mrh, - mac_header_info_t *mhip, mblk_t *mp, uint16_t vid, - boolean_t (*acceptfunc)()) +/* ARGSUSED */ +void +dls_rx_vlan_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { - uint_t naccepted; + dld_str_t *dsp = arg; + dls_link_t *dlp = dsp->ds_dlp; + mac_header_info_t mhi; + dls_rx_t ds_rx; + void *ds_rx_arg; + int err; - naccepted = i_dls_link_rx_func(dlp, mrh, mhip, mp, DLS_SAP_PROMISC, - vid, acceptfunc); + DLS_PREPARE_PKT(dlp, mp, &mhi, err); + if (err != 0) + goto drop; - if (vid != VLAN_ID_NONE) { - naccepted += i_dls_link_rx_func(dlp, mrh, mhip, mp, - DLS_SAP_PROMISC, VLAN_ID_NONE, acceptfunc); + /* + * If there is promiscuous handle for vlan, we filter out the untagged + * pkts and pkts that are not for the primary unicast address. 
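+	 * That is, only tagged packets whose destination matches the
+	 * primary unicast address are passed up; in outline:
+	 *
+	 *	if (!mhi.mhi_istagged || memcmp(mhi.mhi_daddr, prim_addr,
+	 *	    addr_length) != 0)
+	 *		drop;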
+ */ + if (dsp->ds_vlan_mph != NULL) { + uint8_t prim_addr[MAXMACADDRLEN]; + size_t addr_length = dsp->ds_mip->mi_addr_length; + + if (!(mhi.mhi_istagged)) + goto drop; + ASSERT(dsp->ds_mh != NULL); + mac_unicast_primary_get(dsp->ds_mh, (uint8_t *)prim_addr); + if (memcmp(mhi.mhi_daddr, prim_addr, addr_length) != 0) + goto drop; + + if (!dls_accept(dsp, &mhi, &ds_rx, &ds_rx_arg)) + goto drop; + + ds_rx(ds_rx_arg, NULL, mp, &mhi); + return; } - return (naccepted); + +drop: + atomic_add_32(&dlp->dl_unknowns, 1); + freemsg(mp); } -static void -i_dls_link_rx_common(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t (*acceptfunc)()) +/* ARGSUSED */ +void +dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { - dls_link_t *dlp = arg; - mod_hash_t *hash = dlp->dl_impl_hash; - mblk_t *nextp; + dld_str_t *dsp = arg; + dls_link_t *dlp = dsp->ds_dlp; mac_header_info_t mhi; - uint16_t vid, vidkey, pri; + dls_rx_t ds_rx; + void *ds_rx_arg; + int err; dls_head_t *dhp; - dls_impl_t *dip; - mblk_t *nmp; mod_hash_key_t key; - uint_t npacket; - uint32_t sap; - boolean_t accepted; - dls_rx_t di_rx, fdi_rx; - void *di_rx_arg, *fdi_rx_arg; - boolean_t pass2; - int err; + + DLS_PREPARE_PKT(dlp, mp, &mhi, err); + if (err != 0) + goto drop; /* - * Walk the packet chain. + * In order to filter out sap pkt that no dls channel listens, search + * the hash table trying to find a dld_str_t eligible to receive the pkt */ - for (; mp != NULL; mp = nextp) { - /* - * Wipe the accepted state and the receive information of - * the first eligible dls_impl_t. - */ - accepted = B_FALSE; - pass2 = B_FALSE; - fdi_rx = NULL; - fdi_rx_arg = NULL; - - DLS_PREPARE_PKT(dlp, mp, &mhi, err); - if (err != 0) { - if (acceptfunc == dls_accept) - atomic_add_32(&(dlp->dl_unknowns), 1); - nextp = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - continue; - } - - /* - * Grab the longest sub-chain we can process as a single - * unit. - */ - nextp = i_dls_link_subchain(dlp, mp, &mhi, &npacket); - ASSERT(npacket != 0); - - vid = VLAN_ID(mhi.mhi_tci); - pri = VLAN_PRI(mhi.mhi_tci); - - vidkey = vid; - - /* - * Note that we need to first send to the dls_impl_t - * in promiscuous mode in order to avoid the packet reordering - * when snooping. - */ - if (i_dls_link_rx_common_promisc(dlp, mrh, &mhi, mp, vidkey, - acceptfunc) > 0) { - accepted = B_TRUE; - } - - /* - * Non promisc case. Two passes: - * 1. send tagged packets to ETHERTYPE_VLAN listeners - * 2. send packets to listeners bound to the specific SAP. - */ - if (mhi.mhi_istagged) { - vidkey = VLAN_ID_NONE; - sap = ETHERTYPE_VLAN; - } else { - goto non_promisc_loop; - } -non_promisc: - /* - * Construct a hash key from the VLAN identifier and the - * DLSAP. - */ - key = MAKE_KEY(sap, vidkey); - - /* - * Search the has table for dls_impl_t eligible to receive - * a packet chain for this DLSAP/VLAN combination. - */ - rw_enter(&dlp->dl_impl_lock, RW_READER); - if (mod_hash_find(hash, key, (mod_hash_val_t *)&dhp) != 0) { - rw_exit(&dlp->dl_impl_lock); - goto non_promisc_loop; - } - i_dls_head_hold(dhp); - rw_exit(&dlp->dl_impl_lock); - - /* - * Find the first dls_impl_t that will accept the sub-chain. - */ - for (dip = dhp->dh_list; dip != NULL; dip = dip->di_nextp) { - if (!acceptfunc(dip, &mhi, &di_rx, &di_rx_arg)) - continue; - - accepted = B_TRUE; - - /* - * To avoid the extra copymsgchain(), if this - * is the first eligible dls_impl_t, remember required - * information and send up the message afterwards. 
- */ - if (fdi_rx == NULL) { - fdi_rx = di_rx; - fdi_rx_arg = di_rx_arg; - continue; - } + if ((dsp->ds_promisc & DLS_PROMISC_SAP) == 0) { + key = MAKE_KEY(mhi.mhi_bindsap); + if (mod_hash_find(dsp->ds_dlp->dl_str_hash, key, + (mod_hash_val_t *)&dhp) != 0) + goto drop; + } - if ((nmp = copymsgchain(mp)) != NULL) - di_rx(di_rx_arg, mrh, nmp, &mhi); - } + if (!dls_accept_promisc(dsp, &mhi, &ds_rx, &ds_rx_arg, loopback)) + goto drop; - /* - * Release the hold on the dls_impl_t chain now that we have - * finished walking it. - */ - i_dls_head_rele(dhp); + ds_rx(ds_rx_arg, NULL, mp, &mhi); + return; -non_promisc_loop: - /* - * Don't pass the packets up again if: - * - First pass is done and the packets are tagged and their: - * - VID and priority are both zero (invalid packets). - * - their sap is ETHERTYPE_VLAN and their VID is zero - * (they have already been sent upstreams). - * - Second pass is done: - */ - if (pass2 || (mhi.mhi_istagged && - ((vid == VLAN_ID_NONE && pri == 0) || - (mhi.mhi_bindsap == ETHERTYPE_VLAN && - vid == VLAN_ID_NONE)))) { - /* - * Send the message up to the first eligible dls_impl_t. - */ - if (fdi_rx != NULL) - fdi_rx(fdi_rx_arg, mrh, mp, &mhi); - else - freemsgchain(mp); - } else { - vidkey = vid; - sap = mhi.mhi_bindsap; - pass2 = B_TRUE; - goto non_promisc; - } - - /* - * If there were no acceptors then add the packet count to the - * 'unknown' count. - */ - if (!accepted && (acceptfunc == dls_accept)) - atomic_add_32(&(dlp->dl_unknowns), npacket); - } +drop: + atomic_add_32(&dlp->dl_unknowns, 1); + freemsg(mp); } static void -i_dls_link_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp) -{ - i_dls_link_rx_common(arg, mrh, mp, dls_accept); -} - -void -dls_link_txloop(void *arg, mblk_t *mp) +i_dls_link_destroy(dls_link_t *dlp) { - i_dls_link_rx_common(arg, NULL, mp, dls_accept_loopback); -} + ASSERT(dlp->dl_nactive == 0); + ASSERT(dlp->dl_impl_count == 0); + ASSERT(dlp->dl_zone_ref == 0); -/*ARGSUSED*/ -static uint_t -i_dls_link_walk(mod_hash_key_t key, mod_hash_val_t *val, void *arg) -{ - boolean_t *promiscp = arg; - uint32_t sap = KEY_SAP(key); + /* + * Free the structure back to the cache. + */ + if (dlp->dl_mch != NULL) + mac_client_close(dlp->dl_mch, 0); - if (sap == DLS_SAP_PROMISC) { - *promiscp = B_TRUE; - return (MH_WALK_TERMINATE); + if (dlp->dl_mh != NULL) { + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); + mac_close(dlp->dl_mh); } - return (MH_WALK_CONTINUE); + dlp->dl_mh = NULL; + dlp->dl_mch = NULL; + dlp->dl_mip = NULL; + dlp->dl_unknowns = 0; + kmem_cache_free(i_dls_link_cachep, dlp); } static int i_dls_link_create(const char *name, dls_link_t **dlpp) { dls_link_t *dlp; + int err; /* * Allocate a new dls_link_t structure. @@ -728,32 +619,34 @@ i_dls_link_create(const char *name, dls_link_t **dlpp) (void) strlcpy(dlp->dl_name, name, sizeof (dlp->dl_name)); /* - * Initialize promiscuous bookkeeping fields. + * First reference; hold open the MAC interface. 
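+	 * Setup then proceeds mac_open() -> mac_client_open() with the
+	 * DLS-specific open flags; on any failure we bail to
+	 * i_dls_link_destroy(), which copes with partially initialized
+	 * links (NULL dl_mh/dl_mch).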
*/ - dlp->dl_npromisc = 0; - dlp->dl_mth = NULL; + ASSERT(dlp->dl_mh == NULL); + err = mac_open(dlp->dl_name, &dlp->dl_mh); + if (err != 0) + goto bail; + + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); + dlp->dl_mip = mac_info(dlp->dl_mh); + + /* DLS is the "primary" MAC client */ + ASSERT(dlp->dl_mch == NULL); + + err = mac_client_open(dlp->dl_mh, &dlp->dl_mch, NULL, + MAC_OPEN_FLAGS_TAG_DISABLE | MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK | + MAC_OPEN_FLAGS_USE_DATALINK_NAME); + if (err != 0) + goto bail; + + DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *, + dlp->dl_mch); *dlpp = dlp; return (0); -} -static void -i_dls_link_destroy(dls_link_t *dlp) -{ - ASSERT(dlp->dl_npromisc == 0); - ASSERT(dlp->dl_nactive == 0); - ASSERT(dlp->dl_mth == NULL); - ASSERT(dlp->dl_macref == 0); - ASSERT(dlp->dl_mh == NULL); - ASSERT(dlp->dl_mip == NULL); - ASSERT(dlp->dl_impl_count == 0); - ASSERT(dlp->dl_mrh == NULL); - - /* - * Free the structure back to the cache. - */ - dlp->dl_unknowns = 0; - kmem_cache_free(i_dls_link_cachep, dlp); +bail: + i_dls_link_destroy(dlp); + return (err); } /* @@ -777,7 +670,6 @@ dls_link_init(void) i_dls_link_hash = mod_hash_create_extended("dls_link_hash", IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); - rw_init(&i_dls_link_lock, NULL, RW_DEFAULT, NULL); i_dls_link_count = 0; } @@ -796,7 +688,6 @@ dls_link_fini(void) * Destroy the hash table and associated lock. */ mod_hash_destroy_hash(i_dls_link_hash); - rw_destroy(&i_dls_link_lock); return (0); } @@ -804,32 +695,33 @@ dls_link_fini(void) * Exported functions. */ -int -dls_link_hold(const char *name, dls_link_t **dlpp) +static int +dls_link_hold_common(const char *name, dls_link_t **dlpp, boolean_t create) { dls_link_t *dlp; int err; /* - * Look up a dls_link_t corresponding to the given mac_handle_t - * in the global hash table. We need to hold i_dls_link_lock in - * order to atomically find and insert a dls_link_t into the - * hash table. + * Look up a dls_link_t corresponding to the given macname in the + * global hash table. The i_dls_link_hash itself is protected by the + * mod_hash package's internal lock which synchronizes + * find/insert/remove into the global mod_hash list. Assumes that + * inserts and removes are single threaded on a per mac end point + * by the mac perimeter. */ - rw_enter(&i_dls_link_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name, (mod_hash_val_t *)&dlp)) == 0) goto done; ASSERT(err == MH_ERR_NOTFOUND); + if (!create) + return (ENOENT); /* * We didn't find anything so we need to create one. */ - if ((err = i_dls_link_create(name, &dlp)) != 0) { - rw_exit(&i_dls_link_lock); + if ((err = i_dls_link_create(name, &dlp)) != 0) return (err); - } /* * Insert the dls_link_t. @@ -838,124 +730,200 @@ dls_link_hold(const char *name, dls_link_t **dlpp) (mod_hash_val_t)dlp); ASSERT(err == 0); - i_dls_link_count++; + atomic_add_32(&i_dls_link_count, 1); ASSERT(i_dls_link_count != 0); done: - + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* * Bump the reference count and hand back the reference. 
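 	 * dl_ref is protected by the mac perimeter rather than by a lock;
 	 * callers hold and release under that perimeter, as in this
 	 * illustrative sequence (cf. dls_link_devinfo() below):
 	 *
 	 *	mac_perim_enter_by_macname(macname, &mph);
 	 *	(void) dls_link_hold(macname, &dlp);
 	 *	... use dlp ...
 	 *	dls_link_rele(dlp);
 	 *	mac_perim_exit(mph);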
*/ dlp->dl_ref++; *dlpp = dlp; - rw_exit(&i_dls_link_lock); return (0); } +int +dls_link_hold_create(const char *name, dls_link_t **dlpp) +{ + return (dls_link_hold_common(name, dlpp, B_TRUE)); +} + +int +dls_link_hold(const char *name, dls_link_t **dlpp) +{ + return (dls_link_hold_common(name, dlpp, B_FALSE)); +} + +dev_info_t * +dls_link_devinfo(dev_t dev) +{ + dls_link_t *dlp; + dev_info_t *dip; + char macname[MAXNAMELEN]; + char *drv; + mac_perim_handle_t mph; + + if ((drv = ddi_major_to_name(getmajor(dev))) == NULL) + return (NULL); + (void) snprintf(macname, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1); + + /* + * The code below assumes that the name constructed above is the + * macname. This is not the case for legacy devices. Currently this + * is ok because this function is only called in the getinfo(9e) path, + * which for a legacy device would directly end up in the driver's + * getinfo, rather than here + */ + if (mac_perim_enter_by_macname(macname, &mph) != 0) + return (NULL); + + if (dls_link_hold(macname, &dlp) != 0) { + mac_perim_exit(mph); + return (NULL); + } + + dip = mac_devinfo_get(dlp->dl_mh); + dls_link_rele(dlp); + mac_perim_exit(mph); + + return (dip); +} + +dev_t +dls_link_dev(dls_link_t *dlp) +{ + return (makedevice(ddi_driver_major(mac_devinfo_get(dlp->dl_mh)), + mac_minor(dlp->dl_mh))); +} + void dls_link_rele(dls_link_t *dlp) { mod_hash_val_t val; - rw_enter(&i_dls_link_lock, RW_WRITER); - + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* * Check if there are any more references. */ - if (--dlp->dl_ref != 0) { + if (--dlp->dl_ref == 0) { + (void) mod_hash_remove(i_dls_link_hash, + (mod_hash_key_t)dlp->dl_name, &val); + ASSERT(dlp == (dls_link_t *)val); + /* - * There are more references so there's nothing more to do. + * Destroy the dls_link_t. */ - goto done; + i_dls_link_destroy(dlp); + ASSERT(i_dls_link_count > 0); + atomic_add_32(&i_dls_link_count, -1); } +} + +int +dls_link_rele_by_name(const char *name) +{ + dls_link_t *dlp; + + if (mod_hash_find(i_dls_link_hash, (mod_hash_key_t)name, + (mod_hash_val_t *)&dlp) != 0) + return (ENOENT); - (void) mod_hash_remove(i_dls_link_hash, - (mod_hash_key_t)dlp->dl_name, &val); - ASSERT(dlp == (dls_link_t *)val); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* - * Destroy the dls_link_t. + * Must fail detach if mac client is busy. */ - i_dls_link_destroy(dlp); - ASSERT(i_dls_link_count > 0); - i_dls_link_count--; -done: - rw_exit(&i_dls_link_lock); + ASSERT(dlp->dl_ref > 0 && dlp->dl_mch != NULL); + if (mac_link_has_flows(dlp->dl_mch)) + return (ENOTEMPTY); + + dls_link_rele(dlp); + return (0); } int -dls_mac_hold(dls_link_t *dlp) +dls_link_setzid(const char *name, zoneid_t zid) { - mac_handle_t mh; - int err = 0; + dls_link_t *dlp; + int err = 0; + zoneid_t old_zid; + + if ((err = dls_link_hold_create(name, &dlp)) != 0) + return (err); - err = mac_open(dlp->dl_name, &mh); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); - mutex_enter(&dlp->dl_lock); + if ((old_zid = dlp->dl_zid) == zid) + goto done; - ASSERT(IMPLY(dlp->dl_macref != 0, dlp->dl_mh != NULL)); - ASSERT(IMPLY(dlp->dl_macref == 0, dlp->dl_mh == NULL)); - if (err == 0) { - ASSERT(dlp->dl_mh == NULL || dlp->dl_mh == mh); - if (dlp->dl_mh == NULL) { - dlp->dl_mh = mh; - dlp->dl_mip = mac_info(mh); + /* + * Check whether this dlp is used by its own zones, if yes, + * we cannot change its zoneid. 
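+	 * Otherwise the move is one of three cases:
+	 *
+	 *	local  -> global   clear the link's active state and drop
+	 *	                   the hold taken above
+	 *	global -> local    mark the link active (blocking aggr
+	 *	                   creation over it) and keep the hold
+	 *	local  -> local    simply update dl_zid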
+ */ + if (dlp->dl_zone_ref != 0) { + err = EBUSY; + goto done; + } + + if (zid == GLOBAL_ZONEID) { + /* + * Move the link from the local zone to the global zone, + * and release the reference to this link. At the same time + * reset the link's active state so that an aggregation is + * allowed to be created over it. + */ + dlp->dl_zid = zid; + dls_mac_active_clear(dlp); + dls_link_rele(dlp); + goto done; + } else if (old_zid == GLOBAL_ZONEID) { + /* + * Move the link from the global zone to the local zone, + * and hold a reference to this link. Also, set the link + * to the "active" state so that the global zone is + * not able to create an aggregation over this link. + * TODO: revisit once we allow creating aggregations + * within a local zone. + */ + if ((err = dls_mac_active_set(dlp)) != 0) { + if (err != ENXIO) + err = EBUSY; + goto done; } - dlp->dl_macref++; + dlp->dl_zid = zid; + return (0); + } else { + /* + * Move the link from a local zone to another local zone. + */ + dlp->dl_zid = zid; } - mutex_exit(&dlp->dl_lock); +done: + dls_link_rele(dlp); return (err); } void -dls_mac_rele(dls_link_t *dlp) -{ - mutex_enter(&dlp->dl_lock); - ASSERT(dlp->dl_mh != NULL); - - mac_close(dlp->dl_mh); - - if (--dlp->dl_macref == 0) { - dlp->dl_mh = NULL; - dlp->dl_mip = NULL; - } - mutex_exit(&dlp->dl_lock); -} - -void -dls_link_add(dls_link_t *dlp, uint32_t sap, dls_impl_t *dip) +dls_link_add(dls_link_t *dlp, uint32_t sap, dld_str_t *dsp) { - dls_vlan_t *dvp = dip->di_dvp; - mod_hash_t *hash = dlp->dl_impl_hash; + mod_hash_t *hash = dlp->dl_str_hash; mod_hash_key_t key; dls_head_t *dhp; - dls_impl_t *p; - mac_rx_t rx; + dld_str_t *p; int err; - boolean_t promisc = B_FALSE; - /* - * Generate a hash key based on the sap and the VLAN id. - */ - key = MAKE_KEY(sap, dvp->dv_id); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* - * We need dl_lock here because we want to be able to walk - * the hash table *and* set the mac rx func atomically. if - * these two operations are separate, someone else could - * insert/remove dls_impl_t from the hash table after we - * drop the hash lock and this could cause our chosen rx - * func to be incorrect. note that we cannot call mac_rx_add - * when holding the hash lock because this can cause deadlock. + * Generate a hash key based on the sap. */ - mutex_enter(&dlp->dl_lock); + key = MAKE_KEY(sap); /* * Search the table for a list head with this key. */ - rw_enter(&dlp->dl_impl_lock, RW_WRITER); - if ((err = mod_hash_find(hash, key, (mod_hash_val_t *)&dhp)) != 0) { ASSERT(err == MH_ERR_NOTFOUND); @@ -965,94 +933,68 @@ dls_link_add(dls_link_t *dlp, uint32_t sap, dls_impl_t *dip) } /* - * Add the dls_impl_t to the head of the list. + * Add the dld_str_t to the head of the list. List walkers in + * i_dls_link_rx_* bump up dh_ref to ensure the list does not change + * while they walk the list. The membar below ensures that list walkers + * see exactly the old list or the new list. */ - ASSERT(dip->di_nextp == NULL); + ASSERT(dsp->ds_next == NULL); p = dhp->dh_list; - dip->di_nextp = p; - dhp->dh_list = dip; + dsp->ds_next = p; - /* - * Save a pointer to the list head. - */ - dip->di_headp = dhp; - dlp->dl_impl_count++; + membar_producer(); - /* - * Walk the bound dls_impl_t to see if there are any - * in promiscuous 'all sap' mode. 
- */ - mod_hash_walk(hash, i_dls_link_walk, (void *)&promisc); - rw_exit(&dlp->dl_impl_lock); + dhp->dh_list = dsp; /* - * If there are then we need to use a receive routine - * which will route packets to those dls_impl_t as well - * as ones bound to the DLSAP of the packet. + * Save a pointer to the list head. */ - if (promisc) - rx = i_dls_link_rx_promisc; - else - rx = i_dls_link_rx; - - /* Replace the existing receive function if there is one. */ - if (dlp->dl_mrh != NULL) - mac_rx_remove(dlp->dl_mh, dlp->dl_mrh, B_TRUE); - dlp->dl_mrh = mac_active_rx_add(dlp->dl_mh, rx, (void *)dlp); - mutex_exit(&dlp->dl_lock); + dsp->ds_head = dhp; + dlp->dl_impl_count++; } void -dls_link_remove(dls_link_t *dlp, dls_impl_t *dip) +dls_link_remove(dls_link_t *dlp, dld_str_t *dsp) { - mod_hash_t *hash = dlp->dl_impl_hash; - dls_impl_t **pp; - dls_impl_t *p; + mod_hash_t *hash = dlp->dl_str_hash; + dld_str_t **pp; + dld_str_t *p; dls_head_t *dhp; - mac_rx_t rx; - /* - * We need dl_lock here because we want to be able to walk - * the hash table *and* set the mac rx func atomically. if - * these two operations are separate, someone else could - * insert/remove dls_impl_t from the hash table after we - * drop the hash lock and this could cause our chosen rx - * func to be incorrect. note that we cannot call mac_rx_add - * when holding the hash lock because this can cause deadlock. - */ - mutex_enter(&dlp->dl_lock); - rw_enter(&dlp->dl_impl_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); /* - * Poll the hash table entry until all references have been dropped. - * We need to drop all locks before sleeping because we don't want - * the interrupt handler to block. We set di_removing here to - * tell the receive callbacks not to pass up packets anymore. - * This is only a hint to quicken the decrease of the refcnt so - * the assignment need not be protected by any lock. + * We set dh_removing here to tell the receive callbacks not to pass + * up packets anymore. Then wait till the current callbacks are done. + * This happens either in the close path or in processing the + * DL_UNBIND_REQ via a taskq thread, and it is ok to cv_wait in either. + * The dh_ref ensures there aren't and there won't be any upcalls + * walking or using the dh_list. The mod hash internal lock ensures + * that the insert/remove of the dls_head_t itself synchronizes with + * any i_dls_link_rx trying to locate it. The perimeter ensures that + * there isn't another simultaneous dls_link_add/remove. */ - dhp = dip->di_headp; - dip->di_removing = B_TRUE; - while (dhp->dh_ref != 0) { - rw_exit(&dlp->dl_impl_lock); - mutex_exit(&dlp->dl_lock); - delay(drv_usectohz(1000)); /* 1ms delay */ - mutex_enter(&dlp->dl_lock); - rw_enter(&dlp->dl_impl_lock, RW_WRITER); - } + dhp = dsp->ds_head; + + mutex_enter(&dhp->dh_lock); + dhp->dh_removing = B_TRUE; + while (dhp->dh_ref != 0) + cv_wait(&dhp->dh_cv, &dhp->dh_lock); + mutex_exit(&dhp->dh_lock); /* - * Walk the list and remove the dls_impl_t. + * Walk the list and remove the dld_str_t. 
*/ - for (pp = &dhp->dh_list; (p = *pp) != NULL; pp = &(p->di_nextp)) { - if (p == dip) + for (pp = &dhp->dh_list; (p = *pp) != NULL; pp = &(p->ds_next)) { + if (p == dsp) break; } ASSERT(p != NULL); - *pp = p->di_nextp; - p->di_nextp = NULL; + *pp = p->ds_next; + p->ds_next = NULL; + p->ds_head = NULL; - ASSERT(dlp->dl_impl_count > 0); + ASSERT(dlp->dl_impl_count != 0); dlp->dl_impl_count--; if (dhp->dh_list == NULL) { @@ -1064,41 +1006,11 @@ dls_link_remove(dls_link_t *dlp, dls_impl_t *dip) (void) mod_hash_remove(hash, dhp->dh_key, &val); ASSERT(dhp == (dls_head_t *)val); i_dls_head_free(dhp); - } - dip->di_removing = B_FALSE; - - /* - * If there are no dls_impl_t then there's no need to register a - * receive function with the mac. - */ - if (dlp->dl_impl_count == 0) { - rw_exit(&dlp->dl_impl_lock); - mac_rx_remove(dlp->dl_mh, dlp->dl_mrh, B_TRUE); - dlp->dl_mrh = NULL; } else { - boolean_t promisc = B_FALSE; - - /* - * Walk the bound dls_impl_t to see if there are any - * in promiscuous 'all sap' mode. - */ - mod_hash_walk(hash, i_dls_link_walk, (void *)&promisc); - rw_exit(&dlp->dl_impl_lock); - - /* - * If there are then we need to use a receive routine - * which will route packets to those dls_impl_t as well - * as ones bound to the DLSAP of the packet. - */ - if (promisc) - rx = i_dls_link_rx_promisc; - else - rx = i_dls_link_rx; - - mac_rx_remove(dlp->dl_mh, dlp->dl_mrh, B_TRUE); - dlp->dl_mrh = mac_active_rx_add(dlp->dl_mh, rx, (void *)dlp); + mutex_enter(&dhp->dh_lock); + dhp->dh_removing = B_FALSE; + mutex_exit(&dhp->dh_lock); } - mutex_exit(&dlp->dl_lock); } int @@ -1153,10 +1065,5 @@ dls_link_header_info(dls_link_t *dlp, mblk_t *mp, mac_header_info_t *mhip) mhip->mhi_tci = 0; } - /* - * The messsage is looped back from the underlying driver. - */ - mhip->mhi_prom_looped = (mp->b_flag & MSGNOLOOP); - return (0); } diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index bf5fc0a814..bb922423b3 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Datalink management routines. */ @@ -38,11 +36,17 @@ #include <sys/kstat.h> #include <sys/vnode.h> #include <sys/cmn_err.h> -#include <sys/vlan.h> #include <sys/softmac.h> #include <sys/dls.h> #include <sys/dls_impl.h> +/* + * This vanity name management module is treated as part of the GLD framework + * and we don't hold any GLD framework lock across a call to any mac + * function that needs to acquire the mac perimeter. The hierarchy is + * mac perimeter -> framework locks + */ + static kmem_cache_t *i_dls_devnet_cachep; static kmutex_t i_dls_mgmt_lock; static krwlock_t i_dls_devnet_lock; @@ -56,25 +60,22 @@ boolean_t devnet_need_rebuild; /* Upcall door handle */ static door_handle_t dls_mgmt_dh = NULL; +#define DD_CONDEMNED 0x1 + /* - * This structure is used to keep the <linkid, macname, vid> mapping. + * This structure is used to keep the <linkid, macname> mapping. 
*/ typedef struct dls_devnet_s { - datalink_id_t dd_vlanid; datalink_id_t dd_linkid; char dd_mac[MAXNAMELEN]; - uint16_t dd_vid; - char dd_spa[MAXSPALEN]; - boolean_t dd_explicit; kstat_t *dd_ksp; - uint32_t dd_ref; kmutex_t dd_mutex; kcondvar_t dd_cv; uint32_t dd_tref; + uint_t dd_flags; - kmutex_t dd_zid_mutex; zoneid_t dd_zid; boolean_t dd_prop_loaded; @@ -90,7 +91,6 @@ i_dls_devnet_constructor(void *buf, void *arg, int kmflag) bzero(buf, sizeof (dls_devnet_t)); mutex_init(&ddp->dd_mutex, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ddp->dd_zid_mutex, NULL, MUTEX_DEFAULT, NULL); cv_init(&ddp->dd_cv, NULL, CV_DEFAULT, NULL); return (0); } @@ -104,9 +104,7 @@ i_dls_devnet_destructor(void *buf, void *arg) ASSERT(ddp->dd_ksp == NULL); ASSERT(ddp->dd_ref == 0); ASSERT(ddp->dd_tref == 0); - ASSERT(!ddp->dd_explicit); mutex_destroy(&ddp->dd_mutex); - mutex_destroy(&ddp->dd_zid_mutex); cv_destroy(&ddp->dd_cv); } @@ -128,13 +126,13 @@ dls_mgmt_init(void) ASSERT(i_dls_devnet_cachep != NULL); /* - * Create a hash table, keyed by dd_vlanid, of dls_devnet_t. + * Create a hash table, keyed by dd_linkid, of dls_devnet_t. */ i_dls_devnet_id_hash = mod_hash_create_idhash("dls_devnet_id_hash", VLAN_HASHSZ, mod_hash_null_valdtor); /* - * Create a hash table, keyed by dd_spa. + * Create a hash table, keyed by dd_mac */ i_dls_devnet_hash = mod_hash_create_extended("dls_devnet_hash", VLAN_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor, @@ -310,7 +308,6 @@ done: * registration of its mac * - class datalink class * - media type media type; DL_OTHER means unknown - * - vid VLAN ID (for VLANs) * - persist whether to persist the datalink */ int @@ -546,7 +543,7 @@ dls_devnet_prop_task(void *arg) { dls_devnet_t *ddp = arg; - (void) dls_mgmt_linkprop_init(ddp->dd_vlanid); + (void) dls_mgmt_linkprop_init(ddp->dd_linkid); mutex_enter(&ddp->dd_mutex); ddp->dd_prop_loaded = B_TRUE; @@ -567,58 +564,48 @@ dls_devnet_prop_task_wait(dls_dl_handle_t ddp) mutex_exit(&ddp->dd_mutex); } -/* - * Hold the vanity naming structure (dls_devnet_t) temporarily. The request to - * delete the dls_devnet_t will wait until the temporary reference is released. - */ +void +dls_devnet_rele_tmp(dls_dl_handle_t dlh) +{ + dls_devnet_t *ddp = dlh; + + mutex_enter(&ddp->dd_mutex); + ASSERT(ddp->dd_tref != 0); + if (--ddp->dd_tref == 0) + cv_signal(&ddp->dd_cv); + mutex_exit(&ddp->dd_mutex); +} + int -dls_devnet_hold_tmp(datalink_id_t linkid, dls_dl_handle_t *ddhp) +dls_devnet_hold_link(datalink_id_t linkid, dls_dl_handle_t *ddhp, + dls_link_t **dlpp) { - dls_devnet_t *ddp; - dls_dev_handle_t ddh = NULL; - dev_t phydev = 0; - int err; + dls_dl_handle_t dlh; + dls_link_t *dlp; + int err; - /* - * Hold this link to prevent it being detached (if physical link). - */ - if (dls_mgmt_get_phydev(linkid, &phydev) == 0) - (void) softmac_hold_device(phydev, &ddh); + if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0) + return (err); - rw_enter(&i_dls_devnet_lock, RW_READER); - if ((err = mod_hash_find(i_dls_devnet_id_hash, - (mod_hash_key_t)(uintptr_t)linkid, (mod_hash_val_t *)&ddp)) != 0) { - ASSERT(err == MH_ERR_NOTFOUND); - rw_exit(&i_dls_devnet_lock); - softmac_rele_device(ddh); - return (ENOENT); + if ((err = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) { + dls_devnet_rele_tmp(dlh); + return (err); } - /* - * At least one reference was held when this datalink was created. 
- */ - ASSERT(ddp->dd_ref > 0); - mutex_enter(&ddp->dd_mutex); - ddp->dd_tref++; - mutex_exit(&ddp->dd_mutex); - rw_exit(&i_dls_devnet_lock); - softmac_rele_device(ddh); + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); -done: - *ddhp = ddp; + *ddhp = dlh; + *dlpp = dlp; return (0); } void -dls_devnet_rele_tmp(dls_dl_handle_t dlh) +dls_devnet_rele_link(dls_dl_handle_t dlh, dls_link_t *dlp) { - dls_devnet_t *ddp = dlh; + ASSERT(MAC_PERIM_HELD(dlp->dl_mh)); - mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_tref != 0); - if (--ddp->dd_tref == 0) - cv_signal(&ddp->dd_cv); - mutex_exit(&ddp->dd_mutex); + dls_link_rele(dlp); + dls_devnet_rele_tmp(dlh); } /* @@ -632,15 +619,23 @@ static int dls_devnet_stat_update(kstat_t *ksp, int rw) { dls_devnet_t *ddp = ksp->ks_private; - dls_vlan_t *dvp; + dls_link_t *dlp; int err; + mac_perim_handle_t mph; - err = dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, B_FALSE, B_FALSE); + err = mac_perim_enter_by_macname(ddp->dd_mac, &mph); if (err != 0) return (err); - err = dls_stat_update(ksp, dvp, rw); - dls_vlan_rele(dvp); + err = dls_link_hold(ddp->dd_mac, &dlp); + if (err != 0) { + mac_perim_exit(mph); + return (err); + } + + err = dls_stat_update(ksp, dlp, rw); + dls_link_rele(dlp); + mac_perim_exit(mph); return (err); } @@ -653,7 +648,7 @@ dls_devnet_stat_create(dls_devnet_t *ddp) char link[MAXLINKNAMELEN]; kstat_t *ksp; - if ((dls_mgmt_get_linkinfo(ddp->dd_vlanid, link, + if ((dls_mgmt_get_linkinfo(ddp->dd_linkid, link, NULL, NULL, NULL)) != 0) { return; } @@ -704,114 +699,53 @@ dls_devnet_stat_rename(dls_devnet_t *ddp, const char *link) } /* - * Associate a linkid with a given link (identified by <macname/vid>) - * - * Several cases: - * a. implicit VLAN creation: (non-NULL "vlan") - * b. explicit VLAN creation: (NULL "vlan") - * c. explicit non-VLAN creation: - * (NULL "vlan" and linkid could be INVALID_LINKID if the physical device - * was created before the daemon was started) + * Associate a linkid with a given link (identified by macname) */ static int -dls_devnet_set(const char *macname, uint16_t vid, - datalink_id_t vlan_linkid, datalink_id_t linkid, const char *vlan, - dls_devnet_t **ddpp) +dls_devnet_set(const char *macname, datalink_id_t linkid, dls_devnet_t **ddpp) { dls_devnet_t *ddp = NULL; - char spa[MAXSPALEN]; - boolean_t explicit = (vlan == NULL); datalink_class_t class; int err; - ASSERT(vid != VLAN_ID_NONE || explicit); - ASSERT(vlan_linkid != DATALINK_INVALID_LINKID || !explicit || - vid == VLAN_ID_NONE); - - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); rw_enter(&i_dls_devnet_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_devnet_hash, - (mod_hash_key_t)spa, (mod_hash_val_t *)&ddp)) == 0) { - char link[MAXLINKNAMELEN]; - - if (explicit) { - if ((vid != VLAN_ID_NONE) || - (ddp->dd_vlanid != DATALINK_INVALID_LINKID)) { - err = EEXIST; - goto done; - } - - /* - * This might be a physical link that has already - * been created, but which does not have a vlan_linkid - * because dlmgmtd was not running when it was created. - */ - if ((err = dls_mgmt_get_linkinfo(vlan_linkid, NULL, - &class, NULL, NULL)) != 0) { - goto done; - } - - if (class != DATALINK_CLASS_PHYS) { - err = EINVAL; - goto done; - } - - goto newphys; + (mod_hash_key_t)macname, (mod_hash_val_t *)&ddp)) == 0) { + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) { + err = EEXIST; + goto done; } /* - * Implicit VLAN, but the same name has already - * been associated with another linkid. Check if the name - * of that link matches the given VLAN name. 
+ * This might be a physical link that has already + * been created, but which does not have a linkid + * because dlmgmtd was not running when it was created. */ - ASSERT(vid != VLAN_ID_NONE); - if ((err = dls_mgmt_get_linkinfo(ddp->dd_vlanid, link, - NULL, NULL, NULL)) != 0) { + if ((err = dls_mgmt_get_linkinfo(linkid, NULL, + &class, NULL, NULL)) != 0) { goto done; } - if (strcmp(link, vlan) != 0) { - err = EEXIST; + if (class != DATALINK_CLASS_PHYS) { + err = EINVAL; goto done; } - /* - * This is not an implicit created VLAN any more, return - * this existing datalink. - */ - ASSERT(ddp->dd_ref > 0); - ddp->dd_ref++; - goto done; - } - - /* - * Request the daemon to create a new vlan_linkid for this implicitly - * created vlan. - */ - if (!explicit && ((err = dls_mgmt_create(vlan, 0, - DATALINK_CLASS_VLAN, DL_ETHER, B_FALSE, &vlan_linkid)) != 0)) { - goto done; + goto newphys; } - ddp = kmem_cache_alloc(i_dls_devnet_cachep, KM_SLEEP); - ddp->dd_vid = vid; - ddp->dd_explicit = explicit; ddp->dd_tref = 0; ddp->dd_ref++; ddp->dd_zid = GLOBAL_ZONEID; (void) strncpy(ddp->dd_mac, macname, MAXNAMELEN); - (void) snprintf(ddp->dd_spa, MAXSPALEN, "%s/%d", macname, vid); VERIFY(mod_hash_insert(i_dls_devnet_hash, - (mod_hash_key_t)ddp->dd_spa, (mod_hash_val_t)ddp) == 0); + (mod_hash_key_t)ddp->dd_mac, (mod_hash_val_t)ddp) == 0); newphys: - - ddp->dd_vlanid = vlan_linkid; - if (ddp->dd_vlanid != DATALINK_INVALID_LINKID) { + if (linkid != DATALINK_INVALID_LINKID) { ddp->dd_linkid = linkid; - VERIFY(mod_hash_insert(i_dls_devnet_id_hash, - (mod_hash_key_t)(uintptr_t)vlan_linkid, + (mod_hash_key_t)(uintptr_t)linkid, (mod_hash_val_t)ddp) == 0); devnet_need_rebuild = B_TRUE; dls_devnet_stat_create(ddp); @@ -832,90 +766,83 @@ done: return (err); } -static void -dls_devnet_unset_common(dls_devnet_t *ddp) -{ - mod_hash_val_t val; - - ASSERT(RW_WRITE_HELD(&i_dls_devnet_lock)); - - ASSERT(ddp->dd_ref == 0); - - /* - * Remove this dls_devnet_t from the hash table. - */ - VERIFY(mod_hash_remove(i_dls_devnet_hash, - (mod_hash_key_t)ddp->dd_spa, &val) == 0); - - if (ddp->dd_vlanid != DATALINK_INVALID_LINKID) { - VERIFY(mod_hash_remove(i_dls_devnet_id_hash, - (mod_hash_key_t)(uintptr_t)ddp->dd_vlanid, &val) == 0); - - dls_devnet_stat_destroy(ddp); - devnet_need_rebuild = B_TRUE; - } - - /* - * Wait until all temporary references are released. - */ - mutex_enter(&ddp->dd_mutex); - while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != NULL)) - cv_wait(&ddp->dd_cv, &ddp->dd_mutex); - - ddp->dd_prop_loaded = B_FALSE; - mutex_exit(&ddp->dd_mutex); - - if (!ddp->dd_explicit) { - ASSERT(ddp->dd_vid != VLAN_ID_NONE); - ASSERT(ddp->dd_vlanid != DATALINK_INVALID_LINKID); - (void) dls_mgmt_destroy(ddp->dd_vlanid, B_FALSE); - } - - ddp->dd_vlanid = DATALINK_INVALID_LINKID; - ddp->dd_zid = GLOBAL_ZONEID; - ddp->dd_explicit = B_FALSE; - kmem_cache_free(i_dls_devnet_cachep, ddp); -} - /* - * Disassociate a linkid with a given link (identified by <macname/vid>) + * Disassociate a linkid with a given link (identified by macname) + * This waits until temporary references to the dls_devnet_t are gone. 
*/ static int -dls_devnet_unset(const char *macname, uint16_t vid, datalink_id_t *id) +dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) { dls_devnet_t *ddp; - char spa[MAXSPALEN]; int err; - - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); + mod_hash_val_t val; rw_enter(&i_dls_devnet_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_devnet_hash, - (mod_hash_key_t)spa, (mod_hash_val_t *)&ddp)) != 0) { + (mod_hash_key_t)macname, (mod_hash_val_t *)&ddp)) != 0) { ASSERT(err == MH_ERR_NOTFOUND); rw_exit(&i_dls_devnet_lock); return (ENOENT); } - ASSERT(ddp->dd_ref != 0); + mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref != 1) { + /* + * Make sure downcalls into softmac_create or softmac_destroy from + * devfs don't cv_wait on any devfs related condition for fear of + * deadlock. Return EBUSY if the asynchronous thread started for + * property loading as part of the post attach hasn't yet completed. + */ + ASSERT(ddp->dd_ref != 0); + if ((ddp->dd_ref != 1) || (!wait && + (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) { + mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (EBUSY); } + ddp->dd_flags |= DD_CONDEMNED; ddp->dd_ref--; + *id = ddp->dd_linkid; - if (id != NULL) - *id = ddp->dd_vlanid; + /* + * Remove this dls_devnet_t from the hash table. + */ + VERIFY(mod_hash_remove(i_dls_devnet_hash, + (mod_hash_key_t)ddp->dd_mac, &val) == 0); - dls_devnet_unset_common(ddp); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) { + VERIFY(mod_hash_remove(i_dls_devnet_id_hash, + (mod_hash_key_t)(uintptr_t)ddp->dd_linkid, &val) == 0); + + dls_devnet_stat_destroy(ddp); + devnet_need_rebuild = B_TRUE; + } rw_exit(&i_dls_devnet_lock); + + if (wait) { + /* + * Wait until all temporary references are released. + */ + while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != NULL)) + cv_wait(&ddp->dd_cv, &ddp->dd_mutex); + } else { + ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL); + } + + ddp->dd_prop_loaded = B_FALSE; + ddp->dd_linkid = DATALINK_INVALID_LINKID; + ddp->dd_zid = GLOBAL_ZONEID; + ddp->dd_flags = 0; + mutex_exit(&ddp->dd_mutex); + kmem_cache_free(i_dls_devnet_cachep, ddp); + return (0); } static int -dls_devnet_hold(datalink_id_t linkid, dls_devnet_t **ddpp) +dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, + boolean_t tmp_hold) { dls_devnet_t *ddp; dev_t phydev = 0; @@ -938,39 +865,70 @@ dls_devnet_hold(datalink_id_t linkid, dls_devnet_t **ddpp) return (ENOENT); } + mutex_enter(&ddp->dd_mutex); ASSERT(ddp->dd_ref > 0); - ddp->dd_ref++; + if (ddp->dd_flags & DD_CONDEMNED) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + softmac_rele_device(ddh); + return (ENOENT); + } + if (tmp_hold) + ddp->dd_tref++; + else + ddp->dd_ref++; + mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); + softmac_rele_device(ddh); -done: *ddpp = ddp; return (0); } +int +dls_devnet_hold(datalink_id_t linkid, dls_devnet_t **ddpp) +{ + return (dls_devnet_hold_common(linkid, ddpp, B_FALSE)); +} + +/* + * Hold the vanity naming structure (dls_devnet_t) temporarily. The request to + * delete the dls_devnet_t will wait until the temporary reference is released. + */ +int +dls_devnet_hold_tmp(datalink_id_t linkid, dls_devnet_t **ddpp) +{ + return (dls_devnet_hold_common(linkid, ddpp, B_TRUE)); +} + /* * This funtion is called when a DLS client tries to open a device node. * This dev_t could a result of a /dev/net node access (returned by * devnet_create_rvp->dls_devnet_open()) or a direct /dev node access. 
- * In both cases, this function returns 0. In the first case, bump the - * reference count of the dls_devnet_t structure, so that it will not be - * freed when devnet_inactive_callback->dls_devnet_close() is called - * (Note that devnet_inactive_callback() is called right after dld_open, - * not when the /dev/net access is done). In the second case, ddhp would - * be NULL. - * - * To undo this function, call dls_devnet_close() in the first case, and call - * dls_vlan_rele() in the second case. + * In both cases, this function bumps up the reference count of the + * dls_devnet_t structure. The reference is held as long as the device node + * is open. In the case of /dev/net while it is true that the initial reference + * is held when the devnet_create_rvp->dls_devnet_open call happens, this + * initial reference is released immediately in devnet_inactive_callback -> + * dls_devnet_close(). (Note that devnet_inactive_callback() is called right + * after dld_open completes, not when the /dev/net node is being closed). + * To undo this function, call dls_devnet_rele() */ int -dls_devnet_open_by_dev(dev_t dev, dls_vlan_t **dvpp, dls_dl_handle_t *ddhp) +dls_devnet_hold_by_dev(dev_t dev, dls_dl_handle_t *ddhp) { + char name[MAXNAMELEN]; + char *drv; dls_dev_handle_t ddh = NULL; - char spa[MAXSPALEN]; dls_devnet_t *ddp; - dls_vlan_t *dvp; int err; + if ((drv = ddi_major_to_name(getmajor(dev))) == NULL) + return (EINVAL); + + (void) snprintf(name, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1); + /* * Hold this link to prevent it being detached in case of a * GLDv3 physical link. @@ -978,64 +936,49 @@ dls_devnet_open_by_dev(dev_t dev, dls_vlan_t **dvpp, dls_dl_handle_t *ddhp) if (getminor(dev) - 1 < MAC_MAX_MINOR) (void) softmac_hold_device(dev, &ddh); - /* - * Found the dls_vlan_t with the given dev. - */ - err = dls_vlan_hold_by_dev(dev, &dvp); - softmac_rele_device(ddh); - - if (err != 0) - return (err); - - (void) snprintf(spa, MAXSPALEN, "%s/%d", - dvp->dv_dlp->dl_name, dvp->dv_id); - rw_enter(&i_dls_devnet_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_devnet_hash, - (mod_hash_key_t)spa, (mod_hash_val_t *)&ddp)) != 0) { + (mod_hash_key_t)name, (mod_hash_val_t *)&ddp)) != 0) { ASSERT(err == MH_ERR_NOTFOUND); rw_exit(&i_dls_devnet_lock); - *ddhp = NULL; - *dvpp = dvp; - return (0); + softmac_rele_device(ddh); + return (ENOENT); } - + mutex_enter(&ddp->dd_mutex); ASSERT(ddp->dd_ref > 0); + if (ddp->dd_flags & DD_CONDEMNED) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + softmac_rele_device(ddh); + return (ENOENT); + } ddp->dd_ref++; + mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); + + softmac_rele_device(ddh); + *ddhp = ddp; - *dvpp = dvp; return (0); } -static void +void dls_devnet_rele(dls_devnet_t *ddp) { - rw_enter(&i_dls_devnet_lock, RW_WRITER); - ASSERT(ddp->dd_ref != 0); - if (--ddp->dd_ref != 0) { - rw_exit(&i_dls_devnet_lock); - return; - } - /* - * This should only happen for implicitly-created VLAN. 
- */ - ASSERT(ddp->dd_vid != VLAN_ID_NONE); - dls_devnet_unset_common(ddp); - rw_exit(&i_dls_devnet_lock); + mutex_enter(&ddp->dd_mutex); + ASSERT(ddp->dd_ref > 1); + ddp->dd_ref--; + mutex_exit(&ddp->dd_mutex); } static int -dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) +dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) { - char link_under[MAXLINKNAMELEN]; char drv[MAXLINKNAMELEN]; uint_t ppa; major_t major; dev_t phy_dev, tmp_dev; - uint_t vid; datalink_id_t linkid; - dls_devnet_t *ddp; dls_dev_handle_t ddh; int err; @@ -1056,35 +999,8 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) if (ddi_parse(link, drv, &ppa) != DDI_SUCCESS) return (ENOENT); - if ((vid = DLS_PPA2VID(ppa)) > VLAN_ID_MAX) - return (ENOENT); - - ppa = (uint_t)DLS_PPA2INST(ppa); - (void) snprintf(link_under, sizeof (link_under), "%s%d", drv, ppa); - - if (vid != VLAN_ID_NONE) { - /* - * Only global zone can implicitly create a VLAN. - */ - if (zid != GLOBAL_ZONEID) - return (ENOENT); - - /* - * This is potentially an implicitly-created VLAN. Hold the - * link this VLAN is created on. - */ - if (dls_mgmt_get_linkid(link_under, &linkid) == 0 && - dls_devnet_hold_tmp(linkid, &ddp) == 0) { - if (ddp->dd_vid != VLAN_ID_NONE) { - dls_devnet_rele_tmp(ddp); - return (ENOENT); - } - goto implicit; - } - } - /* - * If this link (or the link that an implicit vlan is created on) + * If this link: * (a) is a physical device, (b) this is the first boot, (c) the MAC * is not registered yet, and (d) we cannot find its linkid, then the * linkname is the same as the devname. @@ -1102,7 +1018,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) * At this time, the MAC should be registered, check its phy_dev using * the given name. */ - if ((err = dls_mgmt_get_linkid(link_under, &linkid)) != 0 || + if ((err = dls_mgmt_get_linkid(link, &linkid)) != 0 || (err = dls_mgmt_get_phydev(linkid, &tmp_dev)) != 0) { softmac_rele_device(ddh); return (err); @@ -1112,65 +1028,45 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) return (ENOENT); } - if (vid == VLAN_ID_NONE) { - /* - * For non-VLAN, we are done. - */ - err = dls_devnet_hold(linkid, ddpp); - softmac_rele_device(ddh); - return (err); - } - - /* - * If this is an implicit VLAN, temporarily hold this non-VLAN. - */ - VERIFY(dls_devnet_hold_tmp(linkid, &ddp) == 0); + err = dls_devnet_hold(linkid, ddpp); softmac_rele_device(ddh); - ASSERT(ddp->dd_vid == VLAN_ID_NONE); - - /* - * Again, this is potentially an implicitly-created VLAN. - */ - -implicit: - ASSERT(vid != VLAN_ID_NONE); - err = dls_devnet_set(ddp->dd_mac, vid, DATALINK_INVALID_LINKID, - linkid, link, ddpp); - dls_devnet_rele_tmp(ddp); return (err); } -/* - * Get linkid for the given dev. 
- */
 int
-dls_devnet_dev2linkid(dev_t dev, datalink_id_t *linkidp)
+dls_devnet_macname2linkid(const char *macname, datalink_id_t *linkidp)
 {
-   dls_vlan_t  *dvp;
    dls_devnet_t    *ddp;
-   char        spa[MAXSPALEN];
-   int     err;
-
-   if ((err = dls_vlan_hold_by_dev(dev, &dvp)) != 0)
-       return (err);
-
-   (void) snprintf(spa, MAXSPALEN, "%s/%d",
-       dvp->dv_dlp->dl_name, dvp->dv_id);

    rw_enter(&i_dls_devnet_lock, RW_READER);
-   if (mod_hash_find(i_dls_devnet_hash, (mod_hash_key_t)spa,
+   if (mod_hash_find(i_dls_devnet_hash, (mod_hash_key_t)macname,
        (mod_hash_val_t *)&ddp) != 0) {
        rw_exit(&i_dls_devnet_lock);
-       dls_vlan_rele(dvp);
        return (ENOENT);
    }

-   *linkidp = ddp->dd_vlanid;
+   *linkidp = ddp->dd_linkid;
    rw_exit(&i_dls_devnet_lock);
-   dls_vlan_rele(dvp);
    return (0);
 }
+
+/*
+ * Get linkid for the given dev.
+ */
+int
+dls_devnet_dev2linkid(dev_t dev, datalink_id_t *linkidp)
+{
+   char    macname[MAXNAMELEN];
+   char    *drv;
+
+   if ((drv = ddi_major_to_name(getmajor(dev))) == NULL)
+       return (EINVAL);
+
+   (void) snprintf(macname, MAXNAMELEN, "%s%d", drv, getminor(dev) - 1);
+   return (dls_devnet_macname2linkid(macname, linkidp));
+}
+
 /*
  * Get the link's physical dev_t. If this is a VLAN, get the dev_t of the
  * link this VLAN is created on.
  */
@@ -1213,6 +1109,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
    int     err = 0;
    dev_t       phydev = 0;
    dls_devnet_t    *ddp;
+   mac_perim_handle_t  mph = NULL;
    mac_handle_t    mh;
    mod_hash_val_t  val;
@@ -1232,6 +1129,14 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
    if (dls_mgmt_get_phydev(id1, &phydev) == 0)
        (void) softmac_hold_device(phydev, &ddh);

+   /*
+    * The framework does not hold locks across calls to the
+    * mac perimeter, hence enter the perimeter first. This also waits
+    * for the property loading to finish.
+    */
+   if ((err = mac_perim_enter_by_linkid(id1, &mph)) != 0)
+       goto done;
+
    rw_enter(&i_dls_devnet_lock, RW_WRITER);
    if ((err = mod_hash_find(i_dls_devnet_id_hash,
        (mod_hash_key_t)(uintptr_t)id1, (mod_hash_val_t *)&ddp)) != 0) {
@@ -1241,41 +1146,21 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
    }

    /*
-    * Let the property loading thread finish.
-    * Unfortunately, we have to drop i_dls_devnet_lock temporarily
-    * to avoid deadlocks, and ensure ddp is still in the hash after
-    * reacquiring it. Observe lock order as well.
-    */
-   mutex_enter(&ddp->dd_mutex);
-   if (ddp->dd_prop_taskid != NULL) {
-       rw_exit(&i_dls_devnet_lock);
-       while (ddp->dd_prop_taskid != NULL)
-           cv_wait(&ddp->dd_cv, &ddp->dd_mutex);
-       mutex_exit(&ddp->dd_mutex);
-       rw_enter(&i_dls_devnet_lock, RW_WRITER);
-
-       if ((err = mod_hash_find(i_dls_devnet_id_hash,
-           (mod_hash_key_t)(uintptr_t)id1,
-           (mod_hash_val_t *)&ddp)) != 0) {
-           ASSERT(err == MH_ERR_NOTFOUND);
-           err = ENOENT;
-           goto done;
-       }
-   } else {
-       mutex_exit(&ddp->dd_mutex);
-   }
-
-   /*
     * Return EBUSY if any applications have this link open.
     */
-   if ((ddp->dd_explicit && ddp->dd_ref > 1) ||
-       (!ddp->dd_explicit && ddp->dd_ref > 0)) {
+   if (ddp->dd_ref > 1) {
        err = EBUSY;
        goto done;
    }

    if (id2 == DATALINK_INVALID_LINKID) {
        (void) strlcpy(linkname, link, sizeof (linkname));
+
+       /* rename the mac client name and its flow, if it exists */
+       if ((err = mac_open(ddp->dd_mac, &mh)) != 0)
+           goto done;
+       (void) mac_rename_primary(mh, link);
+       mac_close(mh);
        goto done;
    }
@@ -1294,7 +1179,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
    /*
     * We release the reference of the MAC which mac_open() is
     * holding.
Note that this mac will not be unregistered - * because the physical device is hold. + * because the physical device is held. */ mac_close(mh); @@ -1302,7 +1187,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) * Check if there is any other MAC clients, if not, hold this mac * exclusively until we are done. */ - if ((err = mac_hold_exclusive(mh)) != 0) + if ((err = mac_mark_exclusive(mh)) != 0) goto done; /* @@ -1310,23 +1195,25 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) */ if ((err = mod_hash_find(i_dls_devnet_id_hash, (mod_hash_key_t)(uintptr_t)id2, &val)) != MH_ERR_NOTFOUND) { - mac_rele_exclusive(mh); + mac_unmark_exclusive(mh); err = EEXIST; goto done; } err = dls_mgmt_get_linkinfo(id2, linkname, NULL, NULL, NULL); if (err != 0) { - mac_rele_exclusive(mh); + mac_unmark_exclusive(mh); goto done; } (void) mod_hash_remove(i_dls_devnet_id_hash, (mod_hash_key_t)(uintptr_t)id1, &val); - ddp->dd_vlanid = id2; + ddp->dd_linkid = id2; (void) mod_hash_insert(i_dls_devnet_id_hash, - (mod_hash_key_t)(uintptr_t)ddp->dd_vlanid, (mod_hash_val_t)ddp); + (mod_hash_key_t)(uintptr_t)ddp->dd_linkid, (mod_hash_val_t)ddp); + + mac_unmark_exclusive(mh); /* load properties for new id */ mutex_enter(&ddp->dd_mutex); @@ -1335,8 +1222,6 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) dls_devnet_prop_task, ddp, TQ_SLEEP); mutex_exit(&ddp->dd_mutex); - mac_rele_exclusive(mh); - done: /* * Change the name of the kstat based on the new link name. @@ -1345,6 +1230,8 @@ done: dls_devnet_stat_rename(ddp, linkname); rw_exit(&i_dls_devnet_lock); + if (mph != NULL) + mac_perim_exit(mph); softmac_rele_device(ddh); return (err); } @@ -1355,26 +1242,30 @@ dls_devnet_setzid(const char *link, zoneid_t zid) dls_devnet_t *ddp; int err; zoneid_t old_zid; + mac_perim_handle_t mph; + + if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) + return (err); - if ((err = dls_devnet_hold_by_name(link, &ddp, GLOBAL_ZONEID)) != 0) + err = mac_perim_enter_by_macname(ddp->dd_mac, &mph); + if (err != 0) return (err); - mutex_enter(&ddp->dd_zid_mutex); if ((old_zid = ddp->dd_zid) == zid) { - mutex_exit(&ddp->dd_zid_mutex); + mac_perim_exit(mph); dls_devnet_rele(ddp); return (0); } - if ((err = dls_vlan_setzid(ddp->dd_mac, ddp->dd_vid, zid)) != 0) { - mutex_exit(&ddp->dd_zid_mutex); + if ((err = dls_link_setzid(ddp->dd_mac, zid)) != 0) { + mac_perim_exit(mph); dls_devnet_rele(ddp); return (err); } ddp->dd_zid = zid; devnet_need_rebuild = B_TRUE; - mutex_exit(&ddp->dd_zid_mutex); + mac_perim_exit(mph); /* * Keep this open reference only if it belonged to the global zone @@ -1402,9 +1293,7 @@ dls_devnet_getzid(datalink_id_t linkid, zoneid_t *zidp) if ((err = dls_devnet_hold_tmp(linkid, &ddp)) != 0) return (err); - mutex_enter(&ddp->dd_zid_mutex); *zidp = ddp->dd_zid; - mutex_exit(&ddp->dd_zid_mutex); dls_devnet_rele_tmp(ddp); return (0); @@ -1417,13 +1306,16 @@ int dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) { dls_devnet_t *ddp; - dls_vlan_t *dvp; + dls_link_t *dlp; zoneid_t zid = getzoneid(); int err; + mac_perim_handle_t mph; - if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0) + if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) return (err); + dls_devnet_prop_task_wait(ddp); + /* * Opening a link that does not belong to the current non-global zone * is not allowed. 
@@ -1433,16 +1325,22 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) return (ENOENT); } - err = dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, B_FALSE, B_TRUE); + err = mac_perim_enter_by_macname(ddp->dd_mac, &mph); if (err != 0) { dls_devnet_rele(ddp); return (err); } - dls_devnet_prop_task_wait(ddp); + err = dls_link_hold_create(ddp->dd_mac, &dlp); + mac_perim_exit(mph); + + if (err != 0) { + dls_devnet_rele(ddp); + return (err); + } *dhp = ddp; - *devp = dvp->dv_dev; + *devp = dls_link_dev(dlp); return (0); } @@ -1453,15 +1351,20 @@ void dls_devnet_close(dls_dl_handle_t dlh) { dls_devnet_t *ddp = dlh; - dls_vlan_t *dvp; + dls_link_t *dlp; + mac_perim_handle_t mph; + + VERIFY(mac_perim_enter_by_macname(ddp->dd_mac, &mph) == 0); + VERIFY(dls_link_hold(ddp->dd_mac, &dlp) == 0); /* - * The VLAN is hold in dls_open_devnet_link(). + * One rele for the hold placed in dls_devnet_open, another for + * the hold done just above */ - VERIFY((dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, B_FALSE, - B_FALSE)) == 0); - dls_vlan_rele(dvp); - dls_vlan_rele(dvp); + dls_link_rele(dlp); + dls_link_rele(dlp); + mac_perim_exit(mph); + dls_devnet_rele(ddp); } @@ -1481,15 +1384,27 @@ dls_devnet_rebuild() int dls_devnet_create(mac_handle_t mh, datalink_id_t linkid) { + dls_link_t *dlp; int err; + mac_perim_handle_t mph; - if ((err = dls_vlan_create(mac_name(mh), 0, B_FALSE)) != 0) - return (err); - - err = dls_devnet_set(mac_name(mh), 0, linkid, linkid, NULL, NULL); - if (err != 0) - (void) dls_vlan_destroy(mac_name(mh), 0); + mac_perim_enter_by_mh(mh, &mph); + /* + * Make this association before we call dls_link_hold_create as + * we need to use the linkid to get the user name for the link + * when we create the MAC client. + */ + if ((err = dls_devnet_set(mac_name(mh), linkid, NULL)) != 0) { + mac_perim_exit(mph); + return (err); + } + if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) { + (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE); + mac_perim_exit(mph); + return (err); + } + mac_perim_exit(mph); return (err); } @@ -1503,134 +1418,39 @@ int dls_devnet_recreate(mac_handle_t mh, datalink_id_t linkid) { ASSERT(linkid != DATALINK_INVALID_LINKID); - return (dls_devnet_set(mac_name(mh), 0, linkid, linkid, NULL, NULL)); + return (dls_devnet_set(mac_name(mh), linkid, NULL)); } int -dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp) +dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) { - int err; + int err; + mac_perim_handle_t mph; *idp = DATALINK_INVALID_LINKID; - err = dls_devnet_unset(mac_name(mh), 0, idp); + err = dls_devnet_unset(mac_name(mh), idp, wait); if (err != 0 && err != ENOENT) return (err); - if ((err = dls_vlan_destroy(mac_name(mh), 0)) == 0) - return (0); - - (void) dls_devnet_set(mac_name(mh), 0, *idp, *idp, NULL, NULL); - return (err); -} + mac_perim_enter_by_mh(mh, &mph); + err = dls_link_rele_by_name(mac_name(mh)); + mac_perim_exit(mph); -int -dls_devnet_create_vlan(datalink_id_t vlanid, datalink_id_t linkid, - uint16_t vid, boolean_t force) -{ - dls_devnet_t *lnddp, *ddp; - dls_vlan_t *dvp; - int err; - - /* - * Hold the link the VLAN is being created on (which must not be a - * VLAN). - */ - ASSERT(vid != VLAN_ID_NONE); - if ((err = dls_devnet_hold_tmp(linkid, &lnddp)) != 0) - return (err); - - if (lnddp->dd_vid != VLAN_ID_NONE) { - err = EINVAL; - goto done; - } - - /* - * A new link. 
- */ - err = dls_devnet_set(lnddp->dd_mac, vid, vlanid, linkid, NULL, &ddp); - if (err != 0) - goto done; - - /* - * Hold the dls_vlan_t (and create it if needed). - */ - err = dls_vlan_hold(ddp->dd_mac, ddp->dd_vid, &dvp, force, B_TRUE); - if (err != 0) - VERIFY(dls_devnet_unset(lnddp->dd_mac, vid, NULL) == 0); + if (err == 0) + return (0); -done: - dls_devnet_rele_tmp(lnddp); + (void) dls_devnet_set(mac_name(mh), *idp, NULL); return (err); } -int -dls_devnet_destroy_vlan(datalink_id_t vlanid) -{ - char macname[MAXNAMELEN]; - uint16_t vid; - dls_devnet_t *ddp; - dls_vlan_t *dvp; - int err; - - if ((err = dls_devnet_hold_tmp(vlanid, &ddp)) != 0) - return (err); - - if (ddp->dd_vid == VLAN_ID_NONE) { - dls_devnet_rele_tmp(ddp); - return (EINVAL); - } - - if (!ddp->dd_explicit) { - dls_devnet_rele_tmp(ddp); - return (EBUSY); - } - - (void) strncpy(macname, ddp->dd_mac, MAXNAMELEN); - vid = ddp->dd_vid; - - /* - * It is safe to release the temporary reference we just held, as the - * reference from VLAN creation is still held. - */ - dls_devnet_rele_tmp(ddp); - - if ((err = dls_devnet_unset(macname, vid, NULL)) != 0) - return (err); - - /* - * This VLAN has already been held as the result of VLAN creation. - */ - VERIFY(dls_vlan_hold(macname, vid, &dvp, B_FALSE, B_FALSE) == 0); - - /* - * Release the reference which was held when this VLAN was created, - * and the reference which was just held. - */ - dls_vlan_rele(dvp); - dls_vlan_rele(dvp); - return (0); -} - const char * dls_devnet_mac(dls_dl_handle_t ddh) { return (ddh->dd_mac); } -uint16_t -dls_devnet_vid(dls_dl_handle_t ddh) -{ - return (ddh->dd_vid); -} - datalink_id_t dls_devnet_linkid(dls_dl_handle_t ddh) { return (ddh->dd_linkid); } - -boolean_t -dls_devnet_is_explicit(dls_dl_handle_t ddh) -{ - return (ddh->dd_explicit); -} diff --git a/usr/src/uts/common/io/dls/dls_mod.c b/usr/src/uts/common/io/dls/dls_mod.c index b93befd45c..5f594a0ff9 100644 --- a/usr/src/uts/common/io/dls/dls_mod.c +++ b/usr/src/uts/common/io/dls/dls_mod.c @@ -23,18 +23,12 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Services Module */ -#include <sys/types.h> #include <sys/modctl.h> -#include <sys/mac.h> - -#include <sys/dls.h> -#include <sys/dls_impl.h> +#include <sys/dld_impl.h> static struct modlmisc i_dls_modlmisc = { &mod_miscops, @@ -54,8 +48,6 @@ static struct modlinkage i_dls_modlinkage = { static void i_dls_mod_init(void) { - dls_init(); - dls_vlan_init(); dls_link_init(); dls_mgmt_init(); } @@ -69,13 +61,6 @@ i_dls_mod_fini(void) return (err); dls_mgmt_fini(); - - err = dls_vlan_fini(); - ASSERT(err == 0); - - err = dls_fini(); - ASSERT(err == 0); - return (0); } diff --git a/usr/src/uts/common/io/dls/dls_soft_ring.c b/usr/src/uts/common/io/dls/dls_soft_ring.c deleted file mode 100644 index 078b9a9e4c..0000000000 --- a/usr/src/uts/common/io/dls/dls_soft_ring.c +++ /dev/null @@ -1,773 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * General Soft rings - Simulating Rx rings in S/W. - * - * This is a general purpose high-performance soft ring mechanism. It is - * similar to a taskq with a single worker thread. The dls creates a - * set of these rings to simulate the H/W Rx ring (DMA channels) some - * NICs have. The purpose is to present a common interface to IP - * so the individual squeues can control these rings and switch them - * between polling and interrupt mode. - * - * This code also serves as a fanout mechanism for fast NIC feeding slow - * CPU where incoming traffic can be separated into multiple soft rings - * based on capability negotiation with IP and IP also creates thread - * affinity to soft ring worker threads to CPU so that conenction to - * CPU/Squeue affinity is never broken. - * - * The soft rings can also be driven by a classifier which can direct - * traffic to individual soft rings based on the input from IP. - */ - -#include <sys/types.h> -#include <sys/cmn_err.h> -#include <sys/debug.h> -#include <sys/kmem.h> -#include <sys/cpuvar.h> -#include <sys/condvar_impl.h> -#include <sys/systm.h> -#include <sys/callb.h> -#include <sys/sdt.h> -#include <sys/ddi.h> -#include <sys/strsun.h> -#include <sys/strsubr.h> -#include <inet/common.h> -#include <inet/ip.h> -#include <inet/ipsec_impl.h> -#include <inet/sadb.h> -#include <inet/ipsecah.h> - -#include <sys/dls_impl.h> -#include <sys/dls_soft_ring.h> - -static void soft_ring_fire(void *); -static void soft_ring_drain(soft_ring_t *, clock_t); -static void soft_ring_worker(soft_ring_t *); -static void soft_ring_stop_workers(soft_ring_t **, int); -static void dls_taskq_stop_soft_ring(void *); - -typedef struct soft_ring_taskq { - soft_ring_t **ringp_list; - uint_t ring_size; -} soft_ring_taskq_t; - -kmem_cache_t *soft_ring_cache; - - -int soft_ring_workerwait_ms = 10; -int soft_ring_max_q_cnt = (4 * 1024 * 1024); - -/* The values above converted to ticks */ -static int soft_ring_workerwait_tick = 0; - -#define SOFT_RING_WORKER_WAKEUP(ringp) { \ - timeout_id_t tid = (ringp)->s_ring_tid; \ - \ - ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ - /* \ - * Queue isn't being processed, so take \ - * any post enqueue actions needed before leaving. \ - */ \ - if (tid != 0) { \ - /* \ - * Waiting for an enter() to process mblk(s). \ - */ \ - clock_t waited = lbolt - (ringp)->s_ring_awaken; \ - \ - if (TICK_TO_MSEC(waited) >= (ringp)->s_ring_wait) { \ - /* \ - * Times up and have a worker thread \ - * waiting for work, so schedule it. \ - */ \ - (ringp)->s_ring_tid = 0; \ - cv_signal(&(ringp)->s_ring_async); \ - mutex_exit(&(ringp)->s_ring_lock); \ - (void) untimeout(tid); \ - } else { \ - mutex_exit(&(ringp)->s_ring_lock); \ - } \ - } else if ((ringp)->s_ring_wait != 0) { \ - (ringp)->s_ring_awaken = lbolt; \ - (ringp)->s_ring_tid = timeout(soft_ring_fire, (ringp), \ - (ringp)->s_ring_wait); \ - mutex_exit(&(ringp)->s_ring_lock); \ - } else { \ - /* \ - * Schedule the worker thread. 
\ - */ \ - cv_signal(&(ringp)->s_ring_async); \ - mutex_exit(&(ringp)->s_ring_lock); \ - } \ - ASSERT(MUTEX_NOT_HELD(&(ringp)->s_ring_lock)); \ -} - - -#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt) { \ - /* \ - * Enqueue our mblk chain. \ - */ \ - ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ - \ - if ((ringp)->s_ring_last != NULL) \ - (ringp)->s_ring_last->b_next = (mp); \ - else \ - (ringp)->s_ring_first = (mp); \ - (ringp)->s_ring_last = (tail); \ - (ringp)->s_ring_count += (cnt); \ - ASSERT((ringp)->s_ring_count > 0); \ -} - -void -soft_ring_init(void) -{ - soft_ring_cache = kmem_cache_create("soft_ring_cache", - sizeof (soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0); - - soft_ring_workerwait_tick = - MSEC_TO_TICK_ROUNDUP(soft_ring_workerwait_ms); -} - -/* ARGSUSED */ -soft_ring_t * -soft_ring_create(char *name, processorid_t bind, clock_t wait, - uint_t type, pri_t pri) -{ - soft_ring_t *ringp; - - ringp = kmem_cache_alloc(soft_ring_cache, KM_NOSLEEP); - if (ringp == NULL) - return (NULL); - - bzero(ringp, sizeof (soft_ring_t)); - (void) strncpy(ringp->s_ring_name, name, S_RING_NAMELEN + 1); - ringp->s_ring_name[S_RING_NAMELEN] = '\0'; - mutex_init(&ringp->s_ring_lock, NULL, MUTEX_DEFAULT, NULL); - - ringp->s_ring_type = type; - ringp->s_ring_bind = bind; - if (bind != S_RING_BIND_NONE) - soft_ring_bind(ringp, bind); - ringp->s_ring_wait = MSEC_TO_TICK(wait); - - ringp->s_ring_worker = thread_create(NULL, 0, soft_ring_worker, - ringp, 0, &p0, TS_RUN, pri); - - return (ringp); -} - -soft_ring_t ** -soft_ring_set_create(char *name, processorid_t bind, clock_t wait, - uint_t type, pri_t pri, int ring_size) -{ - int i; - soft_ring_t **ringp_list; - - if ((ringp_list = - (soft_ring_t **) kmem_zalloc(sizeof (soft_ring_t *) * ring_size, - KM_NOSLEEP)) != NULL) { - for (i = 0; i < ring_size; i++) { - ringp_list[i] = soft_ring_create(name, bind, wait, - type, pri); - if (ringp_list[i] == NULL) - break; - } - if (i != ring_size) { - soft_ring_stop_workers(ringp_list, ring_size); - soft_ring_set_destroy(ringp_list, ring_size); - ringp_list = NULL; - } - } - return (ringp_list); -} - -static void -soft_ring_stop_workers(soft_ring_t **ringp_set, int ring_size) -{ - int i; - soft_ring_t *ringp; - timeout_id_t tid; - kt_did_t t_did = 0; - - for (i = 0; (i < ring_size) && (ringp_set[i] != NULL); i++) { - ringp = ringp_set[i]; - - soft_ring_unbind((void *)ringp); - mutex_enter(&ringp->s_ring_lock); - if ((tid = ringp->s_ring_tid) != 0) - (void) untimeout(tid); - - ringp->s_ring_tid = 0; - - if (!(ringp->s_ring_state & S_RING_DEAD)) { - ringp->s_ring_state |= S_RING_DESTROY; - t_did = ringp->s_ring_worker->t_did; - - - /* Wake the worker so it can exit */ - cv_signal(&(ringp)->s_ring_async); - } - mutex_exit(&ringp->s_ring_lock); - - /* - * Here comes the tricky part. IP and driver ensure - * that packet flow has stopped but worker thread - * might still be draining the soft ring. We have - * already set the S_RING_DESTROY flag. We wait till - * the worker thread takes notice and stops processing - * the soft_ring and exits. It sets S_RING_DEAD on - * exiting. 
- */ - if (t_did) - thread_join(t_did); - } -} - -void -soft_ring_set_destroy(soft_ring_t **ringp_set, int ring_size) -{ - int i; - mblk_t *mp; - soft_ring_t *ringp; - - for (i = 0; (i < ring_size) && (ringp_set[i] != NULL); i++) { - ringp = ringp_set[i]; - - mutex_enter(&ringp->s_ring_lock); - - ASSERT(ringp->s_ring_state & S_RING_DEAD); - - while ((mp = ringp->s_ring_first) != NULL) { - ringp->s_ring_first = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - } - ringp->s_ring_last = NULL; - mutex_exit(&ringp->s_ring_lock); - - /* - * IP/driver ensure that no packets are flowing - * when we are destroying the soft rings otherwise bad - * things will happen. - */ - kmem_cache_free(soft_ring_cache, ringp); - ringp_set[i] = NULL; - } - kmem_free(ringp_set, sizeof (soft_ring_t *) * ring_size); -} - -/* ARGSUSED */ -void -soft_ring_bind(void *arg, processorid_t bind) -{ - cpu_t *cp; - soft_ring_t *ringp = (soft_ring_t *)arg; - - mutex_enter(&ringp->s_ring_lock); - if (ringp->s_ring_state & S_RING_BOUND) { - mutex_exit(&ringp->s_ring_lock); - return; - } - - ringp->s_ring_state |= S_RING_BOUND; - ringp->s_ring_bind = bind; - mutex_exit(&ringp->s_ring_lock); - - cp = cpu[bind]; - mutex_enter(&cpu_lock); - if (cpu_is_online(cp)) { - thread_affinity_set(ringp->s_ring_worker, ringp->s_ring_bind); - } - mutex_exit(&cpu_lock); -} - -void -soft_ring_unbind(void *arg) -{ - soft_ring_t *ringp = (soft_ring_t *)arg; - - mutex_enter(&ringp->s_ring_lock); - if (!(ringp->s_ring_state & S_RING_BOUND)) { - mutex_exit(&ringp->s_ring_lock); - return; - } - - ringp->s_ring_state &= ~S_RING_BOUND; - ringp->s_ring_bind = S_RING_BIND_NONE; - mutex_exit(&ringp->s_ring_lock); - - thread_affinity_clear(ringp->s_ring_worker); -} - -/* - * soft_ring_enter() - enter soft_ring sqp with mblk mp (which can be - * a chain), while tail points to the end and cnt in number of - * mblks in the chain. - * - * For a chain of single packet (i.e. mp == tail), go through the - * fast path if no one is processing the soft_ring and nothing is queued. - * - * The proc and arg for each mblk is already stored in the mblk in - * appropriate places. - */ -/* ARGSUSED */ -static void -soft_ring_process(soft_ring_t *ringp, - mblk_t *mp_chain, mblk_t *tail, uint_t count) -{ - void *arg1, *arg2; - s_ring_proc_t proc; - - ASSERT(ringp != NULL); - ASSERT(mp_chain != NULL); - ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); - - mutex_enter(&ringp->s_ring_lock); - - ringp->s_ring_total_inpkt += count; - if (!(ringp->s_ring_state & S_RING_PROC) && - !(ringp->s_ring_type == S_RING_WORKER_ONLY)) { - /* - * See if anything is already queued. If we are the - * first packet, do inline processing else queue the - * packet and do the drain. - */ - if (ringp->s_ring_first == NULL && count == 1) { - /* - * Fast-path, ok to process and nothing queued. - */ - ringp->s_ring_run = curthread; - ringp->s_ring_state |= (S_RING_PROC); - - /* - * We are the chain of 1 packet so - * go through this fast path. - */ - ASSERT(mp_chain->b_next == NULL); - proc = ringp->s_ring_upcall; - arg1 = ringp->s_ring_upcall_arg1; - arg2 = ringp->s_ring_upcall_arg2; - - mutex_exit(&ringp->s_ring_lock); - (*proc)(arg1, arg2, mp_chain, NULL); - - ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); - mutex_enter(&ringp->s_ring_lock); - ringp->s_ring_run = NULL; - ringp->s_ring_state &= ~S_RING_PROC; - if (ringp->s_ring_first == NULL) { - /* - * We processed inline our packet and - * nothing new has arrived. We are done. 
- */ - mutex_exit(&ringp->s_ring_lock); - return; - } - } else { - SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, count); - } - - /* - * We are here because either we couldn't do inline - * processing (because something was already queued), - * or we had a chanin of more than one packet, - * or something else arrived after we were done with - * inline processing. - */ - ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); - ASSERT(ringp->s_ring_first != NULL); - - - soft_ring_drain(ringp, -1); - mutex_exit(&ringp->s_ring_lock); - return; - } else { - /* - * Queue is already being processed. Just enqueue - * the packet and go away. - */ - if (ringp->s_ring_count > soft_ring_max_q_cnt) { - freemsgchain(mp_chain); - DLS_BUMP_STAT(dlss_soft_ring_pkt_drop, count); - } else - SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, count); - if (!(ringp->s_ring_state & S_RING_PROC)) { - SOFT_RING_WORKER_WAKEUP(ringp); - } else { - ASSERT(ringp->s_ring_run != NULL); - mutex_exit(&ringp->s_ring_lock); - } - return; - } -} - -/* - * PRIVATE FUNCTIONS - */ - -static void -soft_ring_fire(void *arg) -{ - soft_ring_t *ringp = arg; - - mutex_enter(&ringp->s_ring_lock); - if (ringp->s_ring_tid == 0) { - mutex_exit(&ringp->s_ring_lock); - return; - } - - ringp->s_ring_tid = 0; - - if (!(ringp->s_ring_state & S_RING_PROC)) { - cv_signal(&ringp->s_ring_async); - } - mutex_exit(&ringp->s_ring_lock); -} - -/* ARGSUSED */ -static void -soft_ring_drain(soft_ring_t *ringp, clock_t expire) -{ - mblk_t *mp; - s_ring_proc_t proc; - void *arg1, *arg2; - timeout_id_t tid; - - ringp->s_ring_run = curthread; - ASSERT(mutex_owned(&ringp->s_ring_lock)); - ASSERT(!(ringp->s_ring_state & S_RING_PROC)); - - if ((tid = ringp->s_ring_tid) != 0) - ringp->s_ring_tid = 0; - - ringp->s_ring_state |= S_RING_PROC; - - - proc = ringp->s_ring_upcall; - arg1 = ringp->s_ring_upcall_arg1; - arg2 = ringp->s_ring_upcall_arg2; - - while (ringp->s_ring_first != NULL) { - mp = ringp->s_ring_first; - ringp->s_ring_first = NULL; - ringp->s_ring_last = NULL; - ringp->s_ring_count = 0; - mutex_exit(&ringp->s_ring_lock); - - if (tid != 0) { - (void) untimeout(tid); - tid = 0; - } - - (*proc)(arg1, arg2, mp, NULL); - - mutex_enter(&ringp->s_ring_lock); - } - - ringp->s_ring_state &= ~S_RING_PROC; - ringp->s_ring_run = NULL; -} - -static void -soft_ring_worker(soft_ring_t *ringp) -{ - kmutex_t *lock = &ringp->s_ring_lock; - kcondvar_t *async = &ringp->s_ring_async; - callb_cpr_t cprinfo; - - CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "soft_ring"); - mutex_enter(lock); - - for (;;) { - while (ringp->s_ring_first == NULL || - (ringp->s_ring_state & S_RING_PROC)) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); - if (ringp->s_ring_state & S_RING_DESTROY) - goto destroy; -still_wait: - cv_wait(async, lock); - if (ringp->s_ring_state & S_RING_DESTROY) { -destroy: - if (ringp->s_ring_state & S_RING_DESTROY) { - ringp->s_ring_state |= S_RING_DEAD; - CALLB_CPR_EXIT(&cprinfo); - thread_exit(); - } - } - if (ringp->s_ring_state & S_RING_PROC) { - goto still_wait; - } - CALLB_CPR_SAFE_END(&cprinfo, lock); - } - soft_ring_drain(ringp, -1); - } -} - -void -dls_soft_ring_disable(dls_channel_t dc) -{ - dls_impl_t *dip = (dls_impl_t *)dc; - soft_ring_t **ringp_list = NULL; - int ring_size; - - rw_enter(&(dip->di_lock), RW_READER); - if (dip->di_soft_ring_list != NULL) { - ringp_list = dip->di_soft_ring_list; - ring_size = dip->di_soft_ring_size; - } - rw_exit(&(dip->di_lock)); - - if (ringp_list != NULL) - soft_ring_stop_workers(ringp_list, ring_size); -} - -static void 
-dls_taskq_stop_soft_ring(void *arg) -{ - soft_ring_taskq_t *ring_taskq; - soft_ring_t **ringp_list; - int ring_size; - - ring_taskq = (soft_ring_taskq_t *)arg; - ringp_list = ring_taskq->ringp_list; - ring_size = ring_taskq->ring_size; - kmem_free(ring_taskq, sizeof (soft_ring_taskq_t)); - - soft_ring_stop_workers(ringp_list, ring_size); - soft_ring_set_destroy(ringp_list, ring_size); -} - -boolean_t -dls_soft_ring_enable(dls_channel_t dc, dl_capab_dls_t *soft_ringp) -{ - dls_impl_t *dip; - int i; - soft_ring_t **softring_set; - soft_ring_t *softring; - mac_rx_fifo_t mrf; - soft_ring_taskq_t *ring_taskq; - char name[64]; - - dip = (dls_impl_t *)dc; - - rw_enter(&(dip->di_lock), RW_WRITER); - - if (dip->di_soft_ring_list != NULL) { - /* - * Both ds_lock and di_lock are held as writer. - * As soft_ring_stop_workers() blocks for the - * worker thread(s) to complete, there is a possibility - * that the worker thread(s) could be in the process - * of draining the queue and is blocked waiting for - * either ds_lock or di_lock. Moreover the NIC interrupt - * thread could be blocked in dls_accept(). - * To avoid deadlock condition, taskq thread would be - * created to handle soft_ring_stop_workers() and - * blocking if required which would avoid holding - * both ds_lock and di_lock. - * NOTE: we cannot drop either locks here, due to - * weird race conditions seen. - */ - ring_taskq = (soft_ring_taskq_t *) - kmem_zalloc(sizeof (soft_ring_taskq_t), KM_NOSLEEP); - if (ring_taskq == NULL) { - rw_exit(&(dip->di_lock)); - return (B_FALSE); - } - ring_taskq->ringp_list = dip->di_soft_ring_list; - ring_taskq->ring_size = dip->di_soft_ring_size; - if (taskq_dispatch(system_taskq, dls_taskq_stop_soft_ring, - ring_taskq, TQ_NOSLEEP) == NULL) { - rw_exit(&(dip->di_lock)); - kmem_free(ring_taskq, sizeof (soft_ring_taskq_t)); - return (B_FALSE); - } - dip->di_soft_ring_list = NULL; - } - dip->di_soft_ring_size = 0; - - bzero(name, sizeof (name)); - (void) snprintf(name, sizeof (name), "dls_soft_ring_%p", (void *)dip); - dip->di_soft_ring_list = soft_ring_set_create(name, S_RING_BIND_NONE, - 0, S_RING_WORKER_ONLY, minclsyspri, soft_ringp->dls_ring_cnt); - - if (dip->di_soft_ring_list == NULL) { - rw_exit(&(dip->di_lock)); - return (B_FALSE); - } - - dip->di_soft_ring_size = soft_ringp->dls_ring_cnt; - softring_set = dip->di_soft_ring_list; - - dip->di_ring_add = (mac_resource_add_t)soft_ringp->dls_ring_add; - dip->di_rx = (dls_rx_t)soft_ringp->dls_ring_assign; - dip->di_rx_arg = (void *)soft_ringp->dls_rx_handle; - - bzero(&mrf, sizeof (mac_rx_fifo_t)); - mrf.mrf_type = MAC_RX_FIFO; - for (i = 0; i < soft_ringp->dls_ring_cnt; i++) { - softring = softring_set[i]; - mrf.mrf_arg = softring; - softring->s_ring_upcall_arg1 = - (void *)soft_ringp->dls_rx_handle; - softring->s_ring_upcall_arg2 = - dip->di_ring_add((void *)soft_ringp->dls_rx_handle, - (mac_resource_t *)&mrf); - softring->s_ring_upcall = - (s_ring_proc_t)soft_ringp->dls_rx; - } - - /* - * Note that soft_ring is enabled. This prevents further DLIOCHDRINFO - * ioctls from overwriting the receive function pointer. 
- */ - rw_exit(&(dip->di_lock)); - return (B_TRUE); -} - -int dls_bad_ip_pkt = 0; - -static mblk_t * -dls_skip_mblk(mblk_t *bp, mblk_t *mp, int *skip_lenp) -{ - while (MBLKL(bp) <= *skip_lenp) { - *skip_lenp -= MBLKL(bp); - bp = bp->b_cont; - if (bp == NULL) { - dls_bad_ip_pkt++; - freemsg(mp); - return (NULL); - } - } - return (bp); -} - -#define HASH32(x) (((x) >> 24) ^ ((x) >> 16) ^ ((x) >> 8) ^ (x)) -#define COMPUTE_INDEX(key, sz) (key % sz) - -/* - * dls_soft_ring_fanout(): - */ -/* ARGSUSED */ -void -dls_soft_ring_fanout(void *rx_handle, void *rx_cookie, mblk_t *mp_chain, - mac_header_info_t *mhip) -{ - mblk_t *mp, *bp, *head, *tail; - ipha_t *ipha; - dls_impl_t *dip = (dls_impl_t *)rx_handle; - int indx, saved_indx; - int hash = 0; - int skip_len; - uint8_t protocol; - int count = 0; - - head = tail = NULL; - - while (mp_chain != NULL) { - bp = mp = mp_chain; - mp_chain = mp_chain->b_next; - mp->b_next = NULL; - if ((MBLKL(mp) < sizeof (ipha_t)) || !OK_32PTR(mp->b_rptr)) { - mp = msgpullup(bp, sizeof (ipha_t)); - freemsg(bp); - if (mp == NULL) { - dls_bad_ip_pkt++; - continue; - } - bp = mp; - } - - ipha = (ipha_t *)mp->b_rptr; - skip_len = IPH_HDR_LENGTH(ipha); - protocol = ipha->ipha_protocol; - again: - switch (protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_SCTP: - case IPPROTO_ESP: - /* - * Note that for ESP, we fanout on SPI and it is at the - * same offset as the 2x16-bit ports. So it is clumped - * along with TCP, UDP and SCTP. - */ - if (MBLKL(bp) <= skip_len) { - bp = dls_skip_mblk(bp, mp, &skip_len); - if (bp == NULL) - continue; - } - - hash = HASH32(*(uint32_t *)(bp->b_rptr + skip_len)); - break; - - case IPPROTO_AH: { - ah_t *ah; - uint_t ah_length; - - if (MBLKL(bp) <= skip_len) { - bp = dls_skip_mblk(bp, mp, &skip_len); - if (bp == NULL) - continue; - } - - ah = (ah_t *)(bp->b_rptr + skip_len); - protocol = ah->ah_nexthdr; - ah_length = AH_TOTAL_LEN(ah); - skip_len += ah_length; - goto again; - } - - default: - /* - * Send the packet to a ring based on src/dest addresses - */ - hash = - (HASH32(ipha->ipha_src) ^ HASH32(ipha->ipha_dst)); - break; - } - - indx = COMPUTE_INDEX(hash, dip->di_soft_ring_size); - if (head == NULL) { - saved_indx = indx; - head = tail = mp; - count++; - } else if (indx == saved_indx) { - tail->b_next = mp; - tail = mp; - count++; - } else { - soft_ring_process(dip->di_soft_ring_list[saved_indx], - head, tail, count); - head = tail = mp; - saved_indx = indx; - count = 1; - } - } - if (head != NULL) - soft_ring_process(dip->di_soft_ring_list[saved_indx], - head, tail, count); -} diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 99f41d0c7d..a6f89a8b49 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c +++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -23,22 +23,12 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Data-Link Services Module */ -#include <sys/types.h> -#include <sys/sysmacros.h> -#include <sys/atomic.h> -#include <sys/kstat.h> -#include <sys/vlan.h> -#include <sys/mac.h> +#include <sys/dld_impl.h> #include <sys/mac_ether.h> -#include <sys/ctype.h> -#include <sys/dls.h> -#include <sys/dls_impl.h> static mac_stat_info_t i_dls_si[] = { { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, @@ -66,35 +56,18 @@ static mac_stat_info_t i_dls_si[] = { #define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) /* - * Private functions. 
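Stripped of the mblk walking, the fanout policy in dls_soft_ring_fanout() above reduces to one computation: fold 32 bits of flow identity with HASH32() and index modulo the ring count. For TCP, UDP, SCTP and ESP the identity word is the pair of 16-bit ports (for ESP the SPI, which sits at the same offset); AH headers are unwrapped first; everything else falls back to source XOR destination address. A self-contained sketch of just the arithmetic:

#include <sys/types.h>

#define	HASH32(x)	(((x) >> 24) ^ ((x) >> 16) ^ ((x) >> 8) ^ (x))

/*
 * have_ports is B_TRUE when the protocol carries a 32-bit port/SPI
 * word; ports holds that word, src/dst the IPv4 addresses.
 */
static int
fanout_index(uint32_t src, uint32_t dst, uint32_t ports,
    boolean_t have_ports, uint_t nrings)
{
	uint32_t hash;

	hash = have_ports ? HASH32(ports) : (HASH32(src) ^ HASH32(dst));
	return ((int)(hash % nrings));
}

Note how the caller above batches consecutive packets that hash to the same index into one chain before calling soft_ring_process(), so a burst belonging to a single flow costs one enqueue rather than one per packet.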
- */ - -static int -i_dls_mac_stat_update(kstat_t *ksp, int rw) -{ - dls_vlan_t *dvp = ksp->ks_private; - - return (dls_stat_update(ksp, dvp, rw)); -} - -/* * Exported functions. */ int -dls_stat_update(kstat_t *ksp, dls_vlan_t *dvp, int rw) +dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - dls_link_t *dlp = dvp->dv_dlp; kstat_named_t *knp; uint_t i; uint64_t val; - int err; if (rw != KSTAT_READ) return (EACCES); - if ((err = dls_mac_hold(dlp)) != 0) - return (err); - knp = (kstat_named_t *)ksp->ks_data; for (i = 0; i < STAT_INFO_COUNT; i++) { val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); @@ -124,7 +97,6 @@ dls_stat_update(kstat_t *ksp, dls_vlan_t *dvp, int rw) } knp++; knp->value.ui32 = dlp->dl_unknowns; - dls_mac_rele(dlp); return (0); } @@ -158,45 +130,3 @@ dls_stat_create(const char *module, int instance, const char *name, *kspp = ksp; return (0); } - -void -dls_mac_stat_create(dls_vlan_t *dvp) -{ - kstat_t *ksp = NULL; - major_t major; - - /* - * Create the legacy kstats to provide backward compatibility. - * These kstats need to be created even when this link does not - * have a link name, i.e., when the VLAN is accessed using its - * /dev node. - * - * Note that we only need to create the legacy kstats for GLDv3 - * physical links, aggregation links which are created using - * the 'key' option, and any VLAN links created over them. - * This can be determined by checking its dv_ppa. - */ - ASSERT(dvp->dv_ksp == NULL); - if (dvp->dv_ppa >= MAC_MAX_MINOR) - return; - - major = getmajor(dvp->dv_dev); - ASSERT(GLDV3_DRV(major) && (dvp->dv_ksp == NULL)); - - if (dls_stat_create(ddi_major_to_name(major), - dvp->dv_id * 1000 + dvp->dv_ppa, NULL, - i_dls_mac_stat_update, dvp, &ksp) != 0) { - return; - } - ASSERT(ksp != NULL); - dvp->dv_ksp = ksp; -} - -void -dls_mac_stat_destroy(dls_vlan_t *dvp) -{ - if (dvp->dv_ksp != NULL) { - kstat_delete(dvp->dv_ksp); - dvp->dv_ksp = NULL; - } -} diff --git a/usr/src/uts/common/io/dls/dls_vlan.c b/usr/src/uts/common/io/dls/dls_vlan.c deleted file mode 100644 index 9df000e86a..0000000000 --- a/usr/src/uts/common/io/dls/dls_vlan.c +++ /dev/null @@ -1,561 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Data-Link Services Module - */ - -#include <sys/types.h> -#include <sys/sysmacros.h> -#include <sys/modhash.h> -#include <sys/stat.h> -#include <sys/kstat.h> -#include <sys/vlan.h> -#include <sys/mac.h> -#include <sys/ctype.h> -#include <sys/dls.h> -#include <sys/dls_impl.h> - -static kmem_cache_t *i_dls_vlan_cachep; -static mod_hash_t *i_dls_vlan_hash; -static mod_hash_t *i_dls_vlan_dev_hash; -static krwlock_t i_dls_vlan_lock; -static uint_t i_dls_vlan_count; - -#define VLAN_HASHSZ 67 /* prime */ - -/* - * Private functions. - */ - -/*ARGSUSED*/ -static int -i_dls_vlan_constructor(void *buf, void *arg, int kmflag) -{ - dls_vlan_t *dvp = buf; - - bzero(buf, sizeof (dls_vlan_t)); - mutex_init(&dvp->dv_lock, NULL, MUTEX_DEFAULT, NULL); - return (0); -} - -/*ARGSUSED*/ -static void -i_dls_vlan_destructor(void *buf, void *arg) -{ - dls_vlan_t *dvp = buf; - - ASSERT(dvp->dv_ref == 0); - ASSERT(dvp->dv_zone_ref == 0); - mutex_destroy(&dvp->dv_lock); -} - -/* - * Module initialization functions. - */ -void -dls_vlan_init(void) -{ - /* - * Create a kmem_cache of dls_vlan_t structures. - */ - i_dls_vlan_cachep = kmem_cache_create("dls_vlan_cache", - sizeof (dls_vlan_t), 0, i_dls_vlan_constructor, - i_dls_vlan_destructor, NULL, NULL, NULL, 0); - ASSERT(i_dls_vlan_cachep != NULL); - - /* - * Create a hash table, keyed by dv_spa, of dls_vlan_t. - */ - i_dls_vlan_hash = mod_hash_create_extended("dls_vlan_hash", - VLAN_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor, - mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); - - /* - * Create a hash table, keyed by dv_dev, of dls_vlan_t. - */ - i_dls_vlan_dev_hash = mod_hash_create_ptrhash("dls_vlan_dev_hash", - VLAN_HASHSZ, mod_hash_null_valdtor, sizeof (dev_t)); - - rw_init(&i_dls_vlan_lock, NULL, RW_DEFAULT, NULL); - i_dls_vlan_count = 0; -} - -int -dls_vlan_fini(void) -{ - if (i_dls_vlan_count > 0) - return (EBUSY); - - /* - * Destroy the hash table - */ - mod_hash_destroy_hash(i_dls_vlan_hash); - mod_hash_destroy_hash(i_dls_vlan_dev_hash); - rw_destroy(&i_dls_vlan_lock); - - /* - * Destroy the kmem_cache. - */ - kmem_cache_destroy(i_dls_vlan_cachep); - return (0); -} - -/* - * Exported functions. - */ - -/* - * If vid is VLAN_ID_NONE, then the minor_t to access this dls_vlan_t is - * ppa + 1, otherwise, we need to allocate the minor_t in this function. - * - * If ppa is greater than DLS_MAX_PPA, it means that we do not need to create - * the VLAN minor node for this MAC, as this MAC is (a) a legacy device, (b) - * an aggr created without the "key" argument, or (c) a new type of link - * whose ppa is allocated by mac_minor_hold() in mac_register(). - */ -int -dls_vlan_create(const char *macname, uint16_t vid, boolean_t force) -{ - char node[MAXPATHLEN]; - char spa[MAXSPALEN]; - char *driver; - dls_link_t *dlp; - dls_vlan_t *dvp; - minor_t minor = 0; - mac_handle_t mh; - int ppa; - dev_info_t *dip; - uint32_t margin = VLAN_TAGSZ; - int err = 0; - - if ((err = mac_open(macname, &mh)) != 0) - return (err); - - /* - * First check whether VLANs are able to be created on this MAC. - */ - if (vid != VLAN_ID_NONE) { - if ((mac_info(mh)->mi_media != DL_ETHER) || - (mac_info(mh)->mi_nativemedia != DL_ETHER)) { - mac_close(mh); - return (EINVAL); - } - if (!force && - ((err = mac_margin_add(mh, &margin, B_FALSE)) != 0)) { - mac_close(mh); - return (err); - } - } - - /* - * Get a reference to a dls_link_t representing the MAC. This call - * will create one if necessary. 
- */ - if ((err = dls_link_hold(macname, &dlp)) != 0) { - if (vid != VLAN_ID_NONE && !force) - VERIFY(mac_margin_remove(mh, margin) == 0); - mac_close(mh); - return (err); - } - - rw_enter(&i_dls_vlan_lock, RW_WRITER); - - /* - * Try to find this VLAN in i_dls_vlan_hash first. The spa - * is in the <macname/vid> form. - */ - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); - if ((err = mod_hash_find(i_dls_vlan_hash, - (mod_hash_key_t)spa, (mod_hash_val_t)&dvp)) == 0) { - err = EEXIST; - goto fail; - } - - ppa = mac_minor(mh) - 1; - dip = mac_devinfo_get(mh); - - if (vid == VLAN_ID_NONE) { - /* - * Derives minor number directly from non-VLAN link's PPA. - */ - minor = ppa + 1; - } else if ((minor = mac_minor_hold(B_TRUE)) == 0) { - /* - * Allocate minor number from minor_arenap for VLANs. - */ - err = ENOMEM; - goto fail; - } - - /* - * First create its minor node for non-legacy links, including VLANs - * and non-VLANs. This is for /dev nodes backward compatibility. - */ - if (vid != VLAN_ID_NONE && ppa < MAC_MAX_MINOR) { - - driver = (char *)ddi_driver_name(dip); - - /* Create a style-1 DLPI device */ - (void) snprintf(node, MAXPATHLEN, "%s%d", driver, - vid * 1000 + ppa); - if (ddi_create_minor_node(dip, node, S_IFCHR, minor, - DDI_NT_NET, 0) != DDI_SUCCESS) { - err = EINVAL; - goto fail; - } - } - - dvp = kmem_cache_alloc(i_dls_vlan_cachep, KM_SLEEP); - dvp->dv_id = vid; - dvp->dv_dlp = dlp; - dvp->dv_dev = makedevice(ddi_driver_major(dip), minor); - dvp->dv_dip = dip; - dvp->dv_ppa = ppa; - dvp->dv_force = force; - dvp->dv_ref = 0; - dvp->dv_zone_ref = 0; - dvp->dv_zid = GLOBAL_ZONEID; - (void) strlcpy(dvp->dv_spa, spa, MAXSPALEN); - dls_mac_stat_create(dvp); - - err = mod_hash_insert(i_dls_vlan_hash, - (mod_hash_key_t)dvp->dv_spa, (mod_hash_val_t)dvp); - ASSERT(err == 0); - - err = mod_hash_insert(i_dls_vlan_dev_hash, - (mod_hash_key_t)dvp->dv_dev, (mod_hash_val_t)dvp); - ASSERT(err == 0); - - i_dls_vlan_count++; - rw_exit(&i_dls_vlan_lock); - - /* - * Hold the underlying MAC for VLANs to keep the margin request. - * We cannot hold the mac for non-VLANs, because a reference would - * prevent the device from detaching. - */ - if (vid != VLAN_ID_NONE) - VERIFY(dls_mac_hold(dvp->dv_dlp) == 0); - - mac_close(mh); - return (0); - -fail: - rw_exit(&i_dls_vlan_lock); - if (vid != VLAN_ID_NONE && minor != 0) - mac_minor_rele(minor); - dls_link_rele(dlp); - if (vid != VLAN_ID_NONE && !force) - VERIFY(mac_margin_remove(mh, margin) == 0); - mac_close(mh); - return (err); -} - -int -dls_vlan_destroy(const char *macname, uint16_t vid) -{ - char spa[MAXSPALEN]; - dls_vlan_t *dvp; - mod_hash_val_t val; - int err; - - /* - * Try to find this VLAN in i_dls_vlan_hash first. The spa - * is in the <macname/vid> form. - */ - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); - - rw_enter(&i_dls_vlan_lock, RW_WRITER); - - if ((err = mod_hash_find(i_dls_vlan_hash, - (mod_hash_key_t)spa, (mod_hash_val_t)&dvp)) != 0) { - rw_exit(&i_dls_vlan_lock); - return (ENOENT); - } - - /* - * Check to see if it is referenced by any dls_impl_t. - */ - if (dvp->dv_ref != 0) { - rw_exit(&i_dls_vlan_lock); - return (EBUSY); - } - - ASSERT(dvp->dv_zone_ref == 0); - - /* - * Remove and destroy the hash table entry. 
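Two naming conventions from dls_vlan_create() recur throughout this file and are easy to lose in the diff: the hash key ("spa") is the string <macname>/<vid>, and the legacy /dev minor node for a VLAN encodes the VLAN id and PPA in a single decimal, <driver><vid * 1000 + ppa>. A sketch only, with buffer sizing left to the caller:

#include <sys/systm.h>	/* snprintf */

static void
vlan_names(const char *macname, const char *driver, uint16_t vid, int ppa,
    char *spa, size_t spalen, char *node, size_t nodelen)
{
	(void) snprintf(spa, spalen, "%s/%d", macname, vid);
	(void) snprintf(node, nodelen, "%s%d", driver, vid * 1000 + ppa);
}

For macname "bge0" (driver "bge", ppa 0) and vid 2 this yields the key "bge0/2" and the style-1 node "bge2000", the same dv_id * 1000 + dv_ppa arithmetic that dls_vlan_destroy() repeats below when removing the node.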
- */ - err = mod_hash_remove(i_dls_vlan_hash, - (mod_hash_key_t)dvp->dv_spa, (mod_hash_val_t *)&val); - ASSERT(err == 0); - ASSERT(dvp == (dls_vlan_t *)val); - - err = mod_hash_remove(i_dls_vlan_dev_hash, - (mod_hash_key_t)dvp->dv_dev, (mod_hash_val_t *)&val); - ASSERT(err == 0); - ASSERT(dvp == (dls_vlan_t *)val); - - if (vid != VLAN_ID_NONE && dvp->dv_ppa < MAC_MAX_MINOR) { - char node[MAXPATHLEN]; - char *driver; - - /* - * Remove the minor nodes for this link. - */ - driver = (char *)ddi_driver_name(dvp->dv_dip); - (void) snprintf(node, MAXPATHLEN, "%s%d", driver, - vid * 1000 + dvp->dv_ppa); - ddi_remove_minor_node(dvp->dv_dip, node); - } - - dls_mac_stat_destroy(dvp); - - ASSERT(i_dls_vlan_count > 0); - i_dls_vlan_count--; - rw_exit(&i_dls_vlan_lock); - - if (vid != VLAN_ID_NONE) { - if (!dvp->dv_force) { - (void) mac_margin_remove(dvp->dv_dlp->dl_mh, - VLAN_TAGSZ); - } - dls_mac_rele(dvp->dv_dlp); - } - - /* - * Release minor to dls_minor_arenap for VLANs - */ - if (vid != VLAN_ID_NONE) - mac_minor_rele(getminor(dvp->dv_dev)); - - /* - * Release the dls_link_t. This will destroy the dls_link_t and - * release the MAC if there are no more dls_vlan_t. - */ - dls_link_rele(dvp->dv_dlp); - kmem_cache_free(i_dls_vlan_cachep, dvp); - return (0); -} - -int -dls_vlan_hold(const char *macname, uint16_t vid, dls_vlan_t **dvpp, - boolean_t force, boolean_t create_vlan) -{ - char spa[MAXSPALEN]; - dls_vlan_t *dvp; - boolean_t vlan_created; - int err = 0; - - (void) snprintf(spa, MAXSPALEN, "%s/%d", macname, vid); - -again: - rw_enter(&i_dls_vlan_lock, RW_WRITER); - if ((err = mod_hash_find(i_dls_vlan_hash, - (mod_hash_key_t)spa, (mod_hash_val_t)&dvp)) != 0) { - - ASSERT(err == MH_ERR_NOTFOUND); - - vlan_created = B_FALSE; - if (!create_vlan || vid == VLAN_ID_NONE) { - rw_exit(&i_dls_vlan_lock); - return (ENOENT); - } - rw_exit(&i_dls_vlan_lock); - - err = dls_vlan_create(macname, vid, force); - if ((err != 0) && (err != EEXIST)) - return (err); - - /* - * At this point someone else could do a dls_vlan_hold and - * dls_vlan_rele on this new vlan and causes it to be - * destroyed. This will at worst cause us to spin a few - * times. - */ - vlan_created = (err != EEXIST); - goto again; - } - - dvp->dv_ref++; - rw_exit(&i_dls_vlan_lock); - - if ((err = dls_mac_hold(dvp->dv_dlp)) != 0) { - rw_enter(&i_dls_vlan_lock, RW_WRITER); - dvp->dv_ref--; - rw_exit(&i_dls_vlan_lock); - if (vlan_created) - (void) dls_vlan_destroy(macname, vid); - return (err); - } - - *dvpp = dvp; - return (0); -} - -int -dls_vlan_hold_by_dev(dev_t dev, dls_vlan_t **dvpp) -{ - dls_vlan_t *dvp; - int err; - - rw_enter(&i_dls_vlan_lock, RW_WRITER); - if ((err = mod_hash_find(i_dls_vlan_dev_hash, (mod_hash_key_t)dev, - (mod_hash_val_t *)&dvp)) != 0) { - ASSERT(err == MH_ERR_NOTFOUND); - rw_exit(&i_dls_vlan_lock); - return (ENOENT); - } - - dvp->dv_ref++; - rw_exit(&i_dls_vlan_lock); - - if ((err = dls_mac_hold(dvp->dv_dlp)) != 0) { - rw_enter(&i_dls_vlan_lock, RW_WRITER); - dvp->dv_ref--; - rw_exit(&i_dls_vlan_lock); - return (err); - } - - *dvpp = dvp; - return (0); -} - -/* - * Free the dvp if this is a VLAN and this is the last reference. 
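dls_vlan_hold() above resolves the lookup-or-create race with a retry loop rather than by holding the table lock across creation: on a miss it drops the lock, calls dls_vlan_create(), treats EEXIST as having lost the race, and loops back to look up again. The shape of that pattern, condensed with hypothetical obj_t, lookup() and create() stand-ins and a plain mutex in place of the rwlock:

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/mutex.h>

typedef struct obj { uint_t ref; } obj_t;

extern kmutex_t table_lock;
extern obj_t *lookup(const char *);	/* called with table_lock held */
extern int create(const char *);	/* may return EEXIST */

static int
hold_or_create(const char *key, obj_t **objp)
{
	obj_t *obj;
	int err;

again:
	mutex_enter(&table_lock);
	if ((obj = lookup(key)) != NULL) {
		obj->ref++;
		mutex_exit(&table_lock);
		*objp = obj;
		return (0);
	}
	mutex_exit(&table_lock);

	if ((err = create(key)) != 0 && err != EEXIST)
		return (err);
	goto again;	/* at worst we spin a few times, as noted above */
}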
- */ -void -dls_vlan_rele(dls_vlan_t *dvp) -{ - char macname[MAXNAMELEN]; - uint16_t vid; - boolean_t destroy_vlan = B_FALSE; - - dls_mac_rele(dvp->dv_dlp); - - rw_enter(&i_dls_vlan_lock, RW_WRITER); - if (--dvp->dv_ref != 0) { - rw_exit(&i_dls_vlan_lock); - return; - } - - if (dvp->dv_id != VLAN_ID_NONE) { - destroy_vlan = B_TRUE; - (void) strncpy(macname, dvp->dv_dlp->dl_name, MAXNAMELEN); - vid = dvp->dv_id; - } - rw_exit(&i_dls_vlan_lock); - - if (destroy_vlan) - (void) dls_vlan_destroy(macname, vid); -} - -int -dls_vlan_setzid(const char *mac, uint16_t vid, zoneid_t zid) -{ - dls_vlan_t *dvp; - int err; - zoneid_t old_zid; - - if ((err = dls_vlan_hold(mac, vid, &dvp, B_FALSE, B_TRUE)) != 0) - return (err); - - mutex_enter(&dvp->dv_lock); - if ((old_zid = dvp->dv_zid) == zid) { - mutex_exit(&dvp->dv_lock); - goto done; - } - - /* - * Check whether this dvp is used by its own zones, if yes, - * we cannot change its zoneid. - */ - if (dvp->dv_zone_ref != 0) { - mutex_exit(&dvp->dv_lock); - err = EBUSY; - goto done; - } - - if (zid == GLOBAL_ZONEID) { - /* - * Move the link from the local zone to the global zone, - * and release the reference to this link. At the same time - * reset the link's active state so that an aggregation is - * allowed to be created over it. - */ - dvp->dv_zid = zid; - mutex_exit(&dvp->dv_lock); - dls_mac_active_clear(dvp->dv_dlp); - dls_vlan_rele(dvp); - goto done; - } else if (old_zid == GLOBAL_ZONEID) { - /* - * Move the link from the global zone to the local zone, - * and hold a reference to this link. Also, set the link - * to the "active" state so that the global zone is - * not able to create an aggregation over this link. - * TODO: revisit once we allow creating aggregations - * within a local zone. - */ - if (!dls_mac_active_set(dvp->dv_dlp)) { - mutex_exit(&dvp->dv_lock); - err = EBUSY; - goto done; - } - dvp->dv_zid = zid; - mutex_exit(&dvp->dv_lock); - return (0); - } else { - /* - * Move the link from a local zone to another local zone. - */ - dvp->dv_zid = zid; - mutex_exit(&dvp->dv_lock); - } - -done: - dls_vlan_rele(dvp); - return (err); -} - -/* - * Find dev_info_t based on the minor node of the link. - */ -dev_info_t * -dls_finddevinfo(dev_t dev) -{ - dls_vlan_t *dvp; - dev_info_t *dip; - - if (dls_vlan_hold_by_dev(dev, &dvp) != 0) - return (NULL); - - dip = dvp->dv_dip; - dls_vlan_rele(dvp); - return (dip); -} diff --git a/usr/src/uts/common/io/dmfe/dmfe_impl.h b/usr/src/uts/common/io/dmfe/dmfe_impl.h index 6792f540bd..978229574d 100644 --- a/usr/src/uts/common/io/dmfe/dmfe_impl.h +++ b/usr/src/uts/common/io/dmfe/dmfe_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_DMFE_IMPL_H #define _SYS_DMFE_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -56,7 +54,7 @@ extern "C" { #include <sys/sunddi.h> #include <sys/miiregs.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include "dmfe.h" diff --git a/usr/src/uts/common/io/dmfe/dmfe_main.c b/usr/src/uts/common/io/dmfe/dmfe_main.c index 152c14f1e8..c231f61ec4 100644 --- a/usr/src/uts/common/io/dmfe/dmfe_main.c +++ b/usr/src/uts/common/io/dmfe/dmfe_main.c @@ -207,12 +207,11 @@ static int dmfe_m_promisc(void *, boolean_t); static int dmfe_m_multicst(void *, boolean_t, const uint8_t *); static int dmfe_m_unicst(void *, const uint8_t *); static void dmfe_m_ioctl(void *, queue_t *, mblk_t *); -static boolean_t dmfe_m_getcapab(void *, mac_capab_t, void *); static mblk_t *dmfe_m_tx(void *, mblk_t *); static int dmfe_m_stat(void *, uint_t, uint64_t *); static mac_callbacks_t dmfe_m_callbacks = { - (MC_IOCTL | MC_GETCAPAB), + (MC_IOCTL), dmfe_m_stat, dmfe_m_start, dmfe_m_stop, @@ -220,9 +219,8 @@ static mac_callbacks_t dmfe_m_callbacks = { dmfe_m_multicst, dmfe_m_unicst, dmfe_m_tx, - NULL, dmfe_m_ioctl, - dmfe_m_getcapab, + NULL, }; @@ -1621,46 +1619,6 @@ dmfe_m_promisc(void *arg, boolean_t on) return (0); } -/*ARGSUSED*/ -static boolean_t -dmfe_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) -{ - /* - * Note that the chip could support some form of polling and - * multiaddress support. We should look into adding polling - * support later, once Solaris is better positioned to take - * advantage of it, although it may be of little use since - * even a lowly 500MHz US-IIe should be able to keep up with - * 100Mbps. (Esp. if the packets are not unreasonably sized.) - * - * Multiaddress support, however, is likely to be of more - * utility with crossbow and virtualized NICs. Although, the - * fact that dmfe is only supported on low-end US-IIe hardware - * makes one wonder whether VNICs are likely to be used on - * such platforms. The chip certainly supports the notion, - * since it can be run in HASH-ONLY mode. (Though this would - * require software to drop unicast packets that are - * incorrectly received due to hash collision of the - * destination mac address.) - * - * Interestingly enough, modern Davicom chips (the 9102D) - * support full IP checksum offload, though its unclear - * whether any of these chips are used on any systems that can - * run Solaris. - * - * If this driver is ever supported on x86 hardware, then - * these assumptions should be revisited. 
- */ - switch (cap) { - case MAC_CAPAB_POLL: - case MAC_CAPAB_MULTIADDRESS: - case MAC_CAPAB_HCKSUM: - default: - return (B_FALSE); - } -} - - #undef DMFE_DBG diff --git a/usr/src/uts/common/io/e1000g/e1000g_main.c b/usr/src/uts/common/io/e1000g/e1000g_main.c index 8bde171cbb..e7fe619c3e 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_main.c +++ b/usr/src/uts/common/io/e1000g/e1000g_main.c @@ -64,8 +64,6 @@ static uint_t e1000g_intr_pciexpress(caddr_t); static uint_t e1000g_intr(caddr_t); static void e1000g_intr_work(struct e1000g *, uint32_t); #pragma inline(e1000g_intr_work) -static uint32_t e1000g_get_itr(uint32_t, uint32_t, uint32_t); -#pragma inline(e1000g_get_itr) static int e1000g_init(struct e1000g *); static int e1000g_start(struct e1000g *, boolean_t); static void e1000g_stop(struct e1000g *, boolean_t); @@ -73,11 +71,6 @@ static int e1000g_m_start(void *); static void e1000g_m_stop(void *); static int e1000g_m_promisc(void *, boolean_t); static boolean_t e1000g_m_getcapab(void *, mac_capab_t, void *); -static int e1000g_m_unicst(void *, const uint8_t *); -static int e1000g_m_unicst_add(void *, mac_multi_addr_t *); -static int e1000g_m_unicst_remove(void *, mac_addr_slot_t); -static int e1000g_m_unicst_modify(void *, mac_multi_addr_t *); -static int e1000g_m_unicst_get(void *, mac_multi_addr_t *); static int e1000g_m_multicst(void *, boolean_t, const uint8_t *); static void e1000g_m_ioctl(void *, queue_t *, mblk_t *); static int e1000g_m_setprop(void *, const char *, mac_prop_id_t, @@ -98,7 +91,7 @@ static int e1000g_register_mac(struct e1000g *); static boolean_t e1000g_rx_drain(struct e1000g *); static boolean_t e1000g_tx_drain(struct e1000g *); static void e1000g_init_unicst(struct e1000g *); -static int e1000g_unicst_set(struct e1000g *, const uint8_t *, mac_addr_slot_t); +static int e1000g_unicst_set(struct e1000g *, const uint8_t *, int); /* * Local routines @@ -172,10 +165,8 @@ mac_priv_prop_t e1000g_priv_props[] = { {"_rx_intr_abs_delay", MAC_PROP_PERM_RW}, {"_intr_throttling_rate", MAC_PROP_PERM_RW}, {"_intr_adaptive", MAC_PROP_PERM_RW}, - {"_tx_recycle_thresh", MAC_PROP_PERM_RW}, {"_adv_pause_cap", MAC_PROP_PERM_READ}, {"_adv_asym_pause_cap", MAC_PROP_PERM_READ}, - {"_tx_recycle_num", MAC_PROP_PERM_RW} }; #define E1000G_MAX_PRIV_PROPS \ (sizeof (e1000g_priv_props)/sizeof (mac_priv_prop_t)) @@ -245,9 +236,8 @@ static mac_callbacks_t e1000g_m_callbacks = { e1000g_m_stop, e1000g_m_promisc, e1000g_m_multicst, - e1000g_m_unicst, - e1000g_m_tx, NULL, + e1000g_m_tx, e1000g_m_ioctl, e1000g_m_getcapab, NULL, @@ -607,6 +597,7 @@ e1000g_register_mac(struct e1000g *Adapter) mac->m_margin = VLAN_TAGSZ; mac->m_priv_props = e1000g_priv_props; mac->m_priv_prop_count = E1000G_MAX_PRIV_PROPS; + mac->m_v12n = MAC_VIRT_LEVEL1; err = mac_register(mac, &Adapter->mh); mac_free(mac); @@ -935,17 +926,17 @@ e1000g_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd) if (Adapter == NULL) return (DDI_FAILURE); + rx_drain = e1000g_rx_drain(Adapter); + if (!rx_drain && !e1000g_force_detach) + return (DDI_FAILURE); + if (mac_unregister(Adapter->mh) != 0) { e1000g_log(Adapter, CE_WARN, "Unregister MAC failed"); return (DDI_FAILURE); } Adapter->attach_progress &= ~ATTACH_PROGRESS_MAC; - - if (Adapter->chip_state != E1000G_STOP) - e1000g_stop(Adapter, B_TRUE); - - rx_drain = e1000g_rx_drain(Adapter); + ASSERT(Adapter->chip_state == E1000G_STOP); /* * If e1000g_force_detach is enabled, driver detach is safe. 
@@ -955,9 +946,6 @@ e1000g_detach(dev_info_t *devinfo, ddi_detach_cmd_t cmd)
 	 */
 	if (e1000g_force_detach) {
 		e1000g_free_priv_devi_node(Adapter, rx_drain);
-	} else {
-		if (!rx_drain)
-			return (DDI_FAILURE);
 	}
 
 	e1000g_unattach(devinfo, Adapter);
@@ -1122,6 +1110,8 @@ e1000g_init_locks(struct e1000g *Adapter)
 	    MUTEX_DRIVER, DDI_INTR_PRI(Adapter->intr_pri));
 	mutex_init(&rx_ring->freelist_lock, NULL,
 	    MUTEX_DRIVER, DDI_INTR_PRI(Adapter->intr_pri));
+	mutex_init(&rx_ring->recycle_lock, NULL,
+	    MUTEX_DRIVER, DDI_INTR_PRI(Adapter->intr_pri));
 }
 
 static void
@@ -1138,6 +1128,7 @@ e1000g_destroy_locks(struct e1000g *Adapter)
 	rx_ring = Adapter->rx_ring;
 	mutex_destroy(&rx_ring->rx_lock);
 	mutex_destroy(&rx_ring->freelist_lock);
+	mutex_destroy(&rx_ring->recycle_lock);
 
 	mutex_destroy(&Adapter->link_lock);
 	mutex_destroy(&Adapter->watchdog_lock);
@@ -1432,6 +1423,8 @@ e1000g_init(struct e1000g *Adapter)
 		goto init_fail;
 	}
 
+	Adapter->poll_mode = e1000g_poll_mode;
+
 	rw_exit(&Adapter->chip_lock);
 	return (DDI_SUCCESS);
 
@@ -1549,6 +1542,106 @@ e1000g_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
 	}
 }
 
+/*
+ * The default value of e1000g_poll_mode == 0 assumes that the NIC is
+ * capable of supporting only one interrupt and we shouldn't disable
+ * the physical interrupt. In this case we let the interrupt come and
+ * we queue the packets in the rx ring itself if we are in polling
+ * mode (better latency but slightly lower performance and a very
+ * high interrupt count in mpstat, which is harmless).
+ *
+ * e1000g_poll_mode == 1 assumes that we have per Rx ring interrupt
+ * which can be disabled in poll mode. This gives better overall
+ * throughput (compared to the mode above), shows very low interrupt
+ * count but has slightly higher latency since we pick the packets when
+ * the poll thread does polling.
+ *
+ * Currently, this flag should be enabled only while doing performance
+ * measurement or when it can be guaranteed that the entire NIC going
+ * into poll mode will not harm any traffic, such as cluster heartbeats.
+ */
+int e1000g_poll_mode = 0;
+
+/*
+ * Called from the upper layers when the driver is in polling mode to
+ * pick up any queued packets. Care should be taken not to block
+ * this thread.
+ */
+static mblk_t *e1000g_poll_ring(void *arg, int bytes_to_pickup)
+{
+	e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)arg;
+	mblk_t *mp = NULL;
+	mblk_t *tail;
+	uint_t sz = 0;
+	struct e1000g *adapter;
+
+	adapter = rx_ring->adapter;
+
+	mutex_enter(&rx_ring->rx_lock);
+	ASSERT(rx_ring->poll_flag);
+
+	/*
+	 * Get any packets that have arrived. Works only if we
+	 * actually disable the physical adapter/rx_ring interrupt
+	 * (e1000g_poll_mode == 1). In case e1000g_poll_mode == 0,
+	 * packets will have already been added to the poll list
+	 * by the interrupt (see e1000g_intr_work()).
+	 */
+	if (adapter->poll_mode) {
+		mp = e1000g_receive(rx_ring, &tail, &sz);
+		if (mp != NULL) {
+			if (rx_ring->poll_list_head == NULL)
+				rx_ring->poll_list_head = mp;
+			else
+				rx_ring->poll_list_tail->b_next = mp;
+			rx_ring->poll_list_tail = tail;
+			rx_ring->poll_list_sz += sz;
+		}
+	}
+
+	mp = rx_ring->poll_list_head;
+	if (mp == NULL) {
+		mutex_exit(&rx_ring->rx_lock);
+		return (NULL);
+	}
+
+	/* Check if we can send up the entire chain */
+	if (bytes_to_pickup >= rx_ring->poll_list_sz) {
+		mp = rx_ring->poll_list_head;
+		rx_ring->poll_list_head = NULL;
+		rx_ring->poll_list_tail = NULL;
+		rx_ring->poll_list_sz = 0;
+		mutex_exit(&rx_ring->rx_lock);
+		return (mp);
+	}
+
+	/*
+	 * We need to find out how much of the chain we can send up. We
+	 * are guaranteed that at least one packet will go up since
+	 * we already checked that.
+	 */
+	tail = mp;
+	sz = 0;
+	while (mp != NULL) {
+		sz += MBLKL(mp);
+		if (sz > bytes_to_pickup) {
+			sz -= MBLKL(mp);
+			break;
+		}
+		tail = mp;
+		mp = mp->b_next;
+	}
+
+	mp = rx_ring->poll_list_head;
+	rx_ring->poll_list_head = tail->b_next;
+	if (rx_ring->poll_list_head == NULL)
+		rx_ring->poll_list_tail = NULL;
+	rx_ring->poll_list_sz -= sz;
+	tail->b_next = NULL;
+	mutex_exit(&rx_ring->rx_lock);
+	return (mp);
+}
+
 static int
 e1000g_m_start(void *arg)
 {
@@ -1912,7 +2005,6 @@ e1000g_intr_work(struct e1000g *Adapter, uint32_t icr)
 	struct e1000_hw *hw;
 	hw = &Adapter->shared;
 	e1000g_tx_ring_t *tx_ring = Adapter->tx_ring;
-	uint32_t itr;
 
 	Adapter->rx_pkt_cnt = 0;
 	Adapter->tx_pkt_cnt = 0;
@@ -1929,16 +2021,79 @@
 	}
 
 	if (icr & E1000_ICR_RXT0) {
-		mblk_t *mp;
+		mblk_t *mp;
+		uint_t sz = 0;
+		mblk_t *tmp, *tail = NULL;
+		e1000g_rx_ring_t *rx_ring;
 
-		mutex_enter(&Adapter->rx_ring->rx_lock);
-		mp = e1000g_receive(Adapter);
-		mutex_exit(&Adapter->rx_ring->rx_lock);
+		rx_ring = Adapter->rx_ring;
+		mutex_enter(&rx_ring->rx_lock);
+		/*
+		 * If the real interrupt for the Rx ring was
+		 * not disabled (e1000g_poll_mode == 0), then
+		 * we still pick up the packets and queue them
+		 * on the Rx ring if we were in polling mode. This
+		 * enables the polling thread to pick up packets
+		 * really fast in polling mode and helps improve
+		 * latency.
+		 */
+		mp = e1000g_receive(rx_ring, &tail, &sz);
 		rw_exit(&Adapter->chip_lock);
 
-		if (mp != NULL)
-			mac_rx(Adapter->mh, Adapter->mrh, mp);
+		if (mp != NULL) {
+			ASSERT(tail != NULL);
+			if (!rx_ring->poll_flag) {
+				/*
+				 * If not polling, see if something was
+				 * already queued. Take care not to
+				 * reorder packets.
+				 */
+				if (rx_ring->poll_list_head == NULL) {
+					mutex_exit(&rx_ring->rx_lock);
+					mac_rx_ring(Adapter->mh, rx_ring->mrh,
+					    mp, rx_ring->ring_gen_num);
+				} else {
+					tmp = rx_ring->poll_list_head;
+					rx_ring->poll_list_head = NULL;
+					rx_ring->poll_list_tail->b_next = mp;
+					rx_ring->poll_list_tail = NULL;
+					rx_ring->poll_list_sz = 0;
+					mutex_exit(&rx_ring->rx_lock);
+					mac_rx_ring(Adapter->mh, rx_ring->mrh,
+					    tmp, rx_ring->ring_gen_num);
+				}
+			} else {
+				/*
+				 * We are in polling mode. Put the
+				 * processed packets on the poll list.
+				 */
+				if (rx_ring->poll_list_head == NULL)
+					rx_ring->poll_list_head = mp;
+				else
+					rx_ring->poll_list_tail->b_next = mp;
+				rx_ring->poll_list_tail = tail;
+				rx_ring->poll_list_sz += sz;
+				mutex_exit(&rx_ring->rx_lock);
+			}
+		} else if (!rx_ring->poll_flag &&
+		    rx_ring->poll_list_head != NULL) {
+			/*
+			 * Nothing new has arrived (then why
+			 * was the interrupt raised??). Check
+			 * if something was queued from the
+			 * last time.
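The loop that just ended is a byte-budget splitter: at least one packet always goes up, further packets are appended while the running MBLKL() total stays within bytes_to_pickup, and the chain is cut after the last packet that fits, with the remainder left queued. Pulled out of the ring code (and, like the original, counting only the leading fragment of each packet), the split looks like this:

#include <sys/stream.h>
#include <sys/strsun.h>		/* MBLKL */

static mblk_t *
chain_split(mblk_t **headp, size_t *list_szp, size_t budget)
{
	mblk_t *head = *headp;
	mblk_t *tail = head;
	size_t sz = MBLKL(head);	/* first packet always goes up */
	mblk_t *next;

	while ((next = tail->b_next) != NULL &&
	    sz + MBLKL(next) <= budget) {
		sz += MBLKL(next);
		tail = next;
	}

	*headp = tail->b_next;		/* remainder stays queued */
	tail->b_next = NULL;
	*list_szp -= sz;		/* bytes handed up */
	return (head);
}

One deliberate difference in this sketch: an oversized first packet is still debited from the queued-byte count, whereas the original's accounting skips it in that corner case.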
+			 */
+			tmp = rx_ring->poll_list_head;
+			rx_ring->poll_list_head = NULL;
+			rx_ring->poll_list_tail = NULL;
+			rx_ring->poll_list_sz = 0;
+			mutex_exit(&rx_ring->rx_lock);
+			mac_rx_ring(Adapter->mh, rx_ring->mrh,
+			    tmp, rx_ring->ring_gen_num);
+		} else {
+			mutex_exit(&rx_ring->rx_lock);
+		}
 	} else
 		rw_exit(&Adapter->chip_lock);
 
@@ -1952,7 +2107,6 @@
 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_intr);
 		rw_exit(&Adapter->chip_lock);
 
-		/* Schedule the re-transmit */
 		if (tx_ring->resched_needed &&
 		    (tx_ring->tbd_avail > DEFAULT_TX_UPDATE_THRESHOLD)) {
 			tx_ring->resched_needed = B_FALSE;
@@ -1961,15 +2115,6 @@
 		}
 	}
 
-	if (Adapter->intr_adaptive) {
-		itr = e1000g_get_itr(Adapter->rx_pkt_cnt, Adapter->tx_pkt_cnt,
-		    Adapter->intr_throttling_rate);
-		if (itr) {
-			E1000_WRITE_REG(hw, E1000_ITR, itr);
-			Adapter->intr_throttling_rate = itr;
-		}
-	}
-
 	/*
 	 * The Receive Sequence errors RXSEQ and the link status change LSC
 	 * are checked to detect that the cable has been pulled out. For
@@ -2040,40 +2185,6 @@
 	}
 }
 
-static uint32_t
-e1000g_get_itr(uint32_t rx_packet, uint32_t tx_packet, uint32_t cur_itr)
-{
-	uint32_t new_itr;
-
-	/*
-	 * Determine a propper itr according to rx/tx packet count
-	 * per interrupt, the value of itr are based on document
-	 * and testing.
-	 */
-	if ((rx_packet < DEFAULT_INTR_PACKET_LOW) ||
-	    (tx_packet < DEFAULT_INTR_PACKET_LOW)) {
-		new_itr = DEFAULT_INTR_THROTTLING_LOW;
-		goto itr_done;
-	}
-	if ((rx_packet > DEFAULT_INTR_PACKET_HIGH) ||
-	    (tx_packet > DEFAULT_INTR_PACKET_HIGH)) {
-		new_itr = DEFAULT_INTR_THROTTLING_LOW;
-		goto itr_done;
-	}
-	if (cur_itr < DEFAULT_INTR_THROTTLING_HIGH) {
-		new_itr = cur_itr + (DEFAULT_INTR_THROTTLING_HIGH >> 2);
-		if (new_itr > DEFAULT_INTR_THROTTLING_HIGH)
-			new_itr = DEFAULT_INTR_THROTTLING_HIGH;
-	} else
-		new_itr = DEFAULT_INTR_THROTTLING_HIGH;
-
-itr_done:
-	if (cur_itr == new_itr)
-		return (0);
-	else
-		return (new_itr);
-}
-
 static void
 e1000g_init_unicst(struct e1000g *Adapter)
 {
@@ -2082,45 +2193,33 @@
 
 	hw = &Adapter->shared;
 
-	if (!Adapter->unicst_init) {
+	if (Adapter->init_count == 0) {
 		/* Initialize the multiple unicast addresses */
 		Adapter->unicst_total = MAX_NUM_UNICAST_ADDRESSES;
 
+		/* Workaround for an erratum of the 82571 chipset */
 		if ((hw->mac.type == e1000_82571) &&
 		    (e1000_get_laa_state_82571(hw) == B_TRUE))
 			Adapter->unicst_total--;
 
-		Adapter->unicst_avail = Adapter->unicst_total - 1;
+		Adapter->unicst_avail = Adapter->unicst_total;
 
-		/* Store the default mac address */
-		e1000_rar_set(hw, hw->mac.addr, 0);
-		if ((hw->mac.type == e1000_82571) &&
-		    (e1000_get_laa_state_82571(hw) == B_TRUE))
-			e1000_rar_set(hw, hw->mac.addr, LAST_RAR_ENTRY);
-
-		bcopy(hw->mac.addr, Adapter->unicst_addr[0].mac.addr,
-		    ETHERADDRL);
-		Adapter->unicst_addr[0].mac.set = 1;
-
-		for (slot = 1; slot < Adapter->unicst_total; slot++)
-			Adapter->unicst_addr[slot].mac.set = 0;
-
-		Adapter->unicst_init = B_TRUE;
+		for (slot = 0; slot < Adapter->unicst_total; slot++) {
+			/* Clear both the flag and MAC address */
+			Adapter->unicst_addr[slot].reg.high = 0;
+			Adapter->unicst_addr[slot].reg.low = 0;
+		}
 	} else {
-		/* Recover the default mac address */
-		bcopy(Adapter->unicst_addr[0].mac.addr, hw->mac.addr,
-		    ETHERADDRL);
-
-		/* Store the default mac address */
-		e1000_rar_set(hw, hw->mac.addr, 0);
+		/* Workaround for an erratum of the 82571 chipset */
 		if ((hw->mac.type == e1000_82571) &&
 		    (e1000_get_laa_state_82571(hw) == B_TRUE))
 			e1000_rar_set(hw, hw->mac.addr, LAST_RAR_ENTRY);
 
 		/* Re-configure the RAR registers */
-		for (slot = 1; slot < Adapter->unicst_total; slot++)
-			e1000_rar_set(hw,
-			    Adapter->unicst_addr[slot].mac.addr, slot);
+		for (slot = 0; slot < Adapter->unicst_total; slot++)
+			if (Adapter->unicst_addr[slot].mac.set == 1)
+				e1000_rar_set(hw,
+				    Adapter->unicst_addr[slot].mac.addr, slot);
 	}
 
 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK)
@@ -2128,22 +2227,8 @@
 }
 
 static int
-e1000g_m_unicst(void *arg, const uint8_t *mac_addr)
-{
-	struct e1000g *Adapter;
-
-	Adapter = (struct e1000g *)arg;
-
-	/* Store the default MAC address */
-	bcopy(mac_addr, Adapter->shared.mac.addr, ETHERADDRL);
-
-	/* Set MAC address in address slot 0, which is the default address */
-	return (e1000g_unicst_set(Adapter, mac_addr, 0));
-}
-
-static int
 e1000g_unicst_set(struct e1000g *Adapter, const uint8_t *mac_addr,
-    mac_addr_slot_t slot)
+    int slot)
 {
 	struct e1000_hw *hw;
@@ -2166,14 +2251,36 @@
 		E1000_WRITE_REG(hw, E1000_RCTL, E1000_RCTL_RST);
 		msec_delay(5);
 	}
+	if (mac_addr == NULL) {
+		E1000_WRITE_REG_ARRAY(hw, E1000_RA, slot << 1, 0);
+		E1000_WRITE_FLUSH(hw);
+		E1000_WRITE_REG_ARRAY(hw, E1000_RA, (slot << 1) + 1, 0);
+		E1000_WRITE_FLUSH(hw);
+		/* Clear both the flag and MAC address */
+		Adapter->unicst_addr[slot].reg.high = 0;
+		Adapter->unicst_addr[slot].reg.low = 0;
+	} else {
+		bcopy(mac_addr, Adapter->unicst_addr[slot].mac.addr,
+		    ETHERADDRL);
+		e1000_rar_set(hw, (uint8_t *)mac_addr, slot);
+		Adapter->unicst_addr[slot].mac.set = 1;
+	}
 
-	bcopy(mac_addr, Adapter->unicst_addr[slot].mac.addr, ETHERADDRL);
-	e1000_rar_set(hw, (uint8_t *)mac_addr, slot);
-
+	/* Workaround for an erratum of the 82571 chipset */
 	if (slot == 0) {
 		if ((hw->mac.type == e1000_82571) &&
 		    (e1000_get_laa_state_82571(hw) == B_TRUE))
-			e1000_rar_set(hw, (uint8_t *)mac_addr, LAST_RAR_ENTRY);
+			if (mac_addr == NULL) {
+				E1000_WRITE_REG_ARRAY(hw, E1000_RA,
+				    slot << 1, 0);
+				E1000_WRITE_FLUSH(hw);
+				E1000_WRITE_REG_ARRAY(hw, E1000_RA,
+				    (slot << 1) + 1, 0);
+				E1000_WRITE_FLUSH(hw);
+			} else {
+				e1000_rar_set(hw, (uint8_t *)mac_addr,
+				    LAST_RAR_ENTRY);
+			}
 	}
 
 	/*
@@ -2192,7 +2299,6 @@
 	}
 
 	rw_exit(&Adapter->chip_lock);
-
 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
 		return (EIO);
@@ -2201,163 +2307,6 @@
 	return (0);
 }
 
-/*
- * e1000g_m_unicst_add() - will find an unused address slot, set the
- * address value to the one specified, reserve that slot and enable
- * the NIC to start filtering on the new MAC address.
- * Returns 0 on success.
- */
-static int
-e1000g_m_unicst_add(void *arg, mac_multi_addr_t *maddr)
-{
-	struct e1000g *Adapter = (struct e1000g *)arg;
-	mac_addr_slot_t slot;
-	int err;
-
-	if (mac_unicst_verify(Adapter->mh,
-	    maddr->mma_addr, maddr->mma_addrlen) == B_FALSE)
-		return (EINVAL);
-
-	rw_enter(&Adapter->chip_lock, RW_WRITER);
-	if (Adapter->unicst_avail == 0) {
-		/* no slots available */
-		rw_exit(&Adapter->chip_lock);
-		return (ENOSPC);
-	}
-
-	/*
-	 * Primary/default address is in slot 0. The next addresses
-	 * are the multiple MAC addresses. So multiple MAC address 0
-	 * is in slot 1, 1 in slot 2, and so on.
So the first multiple - * MAC address resides in slot 1. - */ - for (slot = 1; slot < Adapter->unicst_total; slot++) { - if (Adapter->unicst_addr[slot].mac.set == 0) { - Adapter->unicst_addr[slot].mac.set = 1; - break; - } - } - - ASSERT((slot > 0) && (slot < Adapter->unicst_total)); - - Adapter->unicst_avail--; - rw_exit(&Adapter->chip_lock); - - maddr->mma_slot = slot; - - if ((err = e1000g_unicst_set(Adapter, maddr->mma_addr, slot)) != 0) { - rw_enter(&Adapter->chip_lock, RW_WRITER); - Adapter->unicst_addr[slot].mac.set = 0; - Adapter->unicst_avail++; - rw_exit(&Adapter->chip_lock); - } - - return (err); -} - -/* - * e1000g_m_unicst_remove() - removes a MAC address that was added by a - * call to e1000g_m_unicst_add(). The slot number that was returned in - * e1000g_m_unicst_add() is passed in the call to remove the address. - * Returns 0 on success. - */ -static int -e1000g_m_unicst_remove(void *arg, mac_addr_slot_t slot) -{ - struct e1000g *Adapter = (struct e1000g *)arg; - int err; - - if ((slot <= 0) || (slot >= Adapter->unicst_total)) - return (EINVAL); - - rw_enter(&Adapter->chip_lock, RW_WRITER); - if (Adapter->unicst_addr[slot].mac.set == 1) { - Adapter->unicst_addr[slot].mac.set = 0; - Adapter->unicst_avail++; - rw_exit(&Adapter->chip_lock); - - /* Copy the default address to the passed slot */ - if ((err = e1000g_unicst_set(Adapter, - Adapter->unicst_addr[0].mac.addr, slot)) != 0) { - rw_enter(&Adapter->chip_lock, RW_WRITER); - Adapter->unicst_addr[slot].mac.set = 1; - Adapter->unicst_avail--; - rw_exit(&Adapter->chip_lock); - } - return (err); - } - rw_exit(&Adapter->chip_lock); - - return (EINVAL); -} - -/* - * e1000g_m_unicst_modify() - modifies the value of an address that - * has been added by e1000g_m_unicst_add(). The new address, address - * length and the slot number that was returned in the call to add - * should be passed to e1000g_m_unicst_modify(). mma_flags should be - * set to 0. Returns 0 on success. - */ -static int -e1000g_m_unicst_modify(void *arg, mac_multi_addr_t *maddr) -{ - struct e1000g *Adapter = (struct e1000g *)arg; - mac_addr_slot_t slot; - - if (mac_unicst_verify(Adapter->mh, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) - return (EINVAL); - - slot = maddr->mma_slot; - - if ((slot <= 0) || (slot >= Adapter->unicst_total)) - return (EINVAL); - - rw_enter(&Adapter->chip_lock, RW_WRITER); - if (Adapter->unicst_addr[slot].mac.set == 1) { - rw_exit(&Adapter->chip_lock); - - return (e1000g_unicst_set(Adapter, maddr->mma_addr, slot)); - } - rw_exit(&Adapter->chip_lock); - - return (EINVAL); -} - -/* - * e1000g_m_unicst_get() - will get the MAC address and all other - * information related to the address slot passed in mac_multi_addr_t. - * mma_flags should be set to 0 in the call. 
- * On return, mma_flags can take the following values:
- * 1) MMAC_SLOT_UNUSED
- * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR
- * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR
- * 4) MMAC_SLOT_USED
- */
-static int
-e1000g_m_unicst_get(void *arg, mac_multi_addr_t *maddr)
-{
-	struct e1000g *Adapter = (struct e1000g *)arg;
-	mac_addr_slot_t slot;
-
-	slot = maddr->mma_slot;
-
-	if ((slot <= 0) || (slot >= Adapter->unicst_total))
-		return (EINVAL);
-
-	rw_enter(&Adapter->chip_lock, RW_WRITER);
-	if (Adapter->unicst_addr[slot].mac.set == 1) {
-		bcopy(Adapter->unicst_addr[slot].mac.addr,
-		    maddr->mma_addr, ETHERADDRL);
-		maddr->mma_flags = MMAC_SLOT_USED;
-	} else {
-		maddr->mma_flags = MMAC_SLOT_UNUSED;
-	}
-	rw_exit(&Adapter->chip_lock);
-
-	return (0);
-}
-
 static int
 multicst_add(struct e1000g *Adapter, const uint8_t *multiaddr)
 {
@@ -2586,6 +2535,274 @@ e1000g_m_promisc(void *arg, boolean_t on)
 	return (0);
 }
 
+/*
+ * Entry points to enable and disable interrupts at the granularity of
+ * a group.
+ * Turns the poll_mode for the whole adapter on and off to enable or
+ * override the ring-level polling control over the hardware interrupts.
+ */
+static int
+e1000g_rx_group_intr_enable(mac_intr_handle_t arg)
+{
+	struct e1000g *adapter = (struct e1000g *)arg;
+	e1000g_rx_ring_t *rx_ring = adapter->rx_ring;
+
+	/*
+	 * Later interrupts at the granularity of this ring will
+	 * invoke mac_rx() with NULL, indicating the need for
+	 * software classification.
+	 * We have a single ring usable per adapter now, so we only need to
+	 * reset the rx handle for that one.
+	 * When more RX rings can be used, we should update each one of them.
+	 */
+	mutex_enter(&rx_ring->rx_lock);
+	rx_ring->mrh = NULL;
+	adapter->poll_mode = B_FALSE;
+	mutex_exit(&rx_ring->rx_lock);
+	return (0);
+}
+
+static int
+e1000g_rx_group_intr_disable(mac_intr_handle_t arg)
+{
+	struct e1000g *adapter = (struct e1000g *)arg;
+	e1000g_rx_ring_t *rx_ring = adapter->rx_ring;
+
+	mutex_enter(&rx_ring->rx_lock);
+
+	/*
+	 * Later interrupts at the granularity of this ring will
+	 * invoke mac_rx() with the handle for this ring.
+	 */
+	adapter->poll_mode = B_TRUE;
+	rx_ring->mrh = rx_ring->mrh_init;
+	mutex_exit(&rx_ring->rx_lock);
+	return (0);
+}
+
+/*
+ * Entry points to enable and disable interrupts at the granularity of
+ * a ring.
+ * The adapter's poll_mode controls whether we actually proceed with
+ * hardware interrupt toggling.
+ */ +static int +e1000g_rx_ring_intr_enable(mac_intr_handle_t intrh) +{ + e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)intrh; + struct e1000g *adapter = rx_ring->adapter; + struct e1000_hw *hw = &adapter->shared; + uint32_t intr_mask; + boolean_t poll_mode; + + mutex_enter(&rx_ring->rx_lock); + rx_ring->poll_flag = 0; + poll_mode = adapter->poll_mode; + mutex_exit(&rx_ring->rx_lock); + + if (poll_mode) { + /* Rx interrupt enabling for MSI and legacy */ + intr_mask = E1000_READ_REG(hw, E1000_IMS); + intr_mask |= E1000_IMS_RXT0; + E1000_WRITE_REG(hw, E1000_IMS, intr_mask); + E1000_WRITE_FLUSH(hw); + + /* Trigger a Rx interrupt to check Rx ring */ + E1000_WRITE_REG(hw, E1000_ICS, E1000_IMS_RXT0); + E1000_WRITE_FLUSH(hw); + } + return (0); +} + +static int +e1000g_rx_ring_intr_disable(mac_intr_handle_t intrh) +{ + e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)intrh; + struct e1000g *adapter = rx_ring->adapter; + struct e1000_hw *hw = &adapter->shared; + boolean_t poll_mode; + + /* + * Once the adapter can support per Rx ring interrupt, + * we should disable the real interrupt instead of just setting + * the flag. + */ + mutex_enter(&rx_ring->rx_lock); + rx_ring->poll_flag = 1; + poll_mode = adapter->poll_mode; + mutex_exit(&rx_ring->rx_lock); + + if (poll_mode) { + /* Rx interrupt disabling for MSI and legacy */ + E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0); + E1000_WRITE_FLUSH(hw); + } + return (0); +} + +/* + * e1000g_unicst_find - Find the slot for the specified unicast address + */ +static int +e1000g_unicst_find(struct e1000g *Adapter, const uint8_t *mac_addr) +{ + int slot; + + ASSERT(mutex_owned(&Adapter->gen_lock)); + + for (slot = 0; slot < Adapter->unicst_total; slot++) { + if (Adapter->unicst_addr[slot].mac.set == 1) { + if (bcmp(Adapter->unicst_addr[slot].mac.addr, + mac_addr, ETHERADDRL) == 0) + return (slot); + } else + continue; + } + + return (-1); +} + +/* + * Entry points to add and remove a MAC address to a ring group. + * The caller takes care of adding and removing the MAC addresses + * to the filter via these two routines. 
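Seen from the consumer side, the ring-level entry points above combine with e1000g_poll_ring() into a simple duty cycle: disable the ring's interrupt, pull packets in budget-sized chunks until the ring runs dry, then re-enable the interrupt. A hypothetical caller-side sketch; deliver() stands in for the upstream hand-off, and the real sequencing lives in the MAC soft ring machinery rather than in the driver:

extern void deliver(mblk_t *);		/* hypothetical hand-off */

static void
poll_cycle(e1000g_rx_ring_t *rx_ring, int budget)
{
	mac_intr_handle_t ih = (mac_intr_handle_t)rx_ring;
	mblk_t *chain;

	(void) e1000g_rx_ring_intr_disable(ih);
	while ((chain = e1000g_poll_ring(rx_ring, budget)) != NULL)
		deliver(chain);
	(void) e1000g_rx_ring_intr_enable(ih);
}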
+ */
+
+static int
+e1000g_addmac(void *arg, const uint8_t *mac_addr)
+{
+	struct e1000g *Adapter = (struct e1000g *)arg;
+	int slot;
+
+	mutex_enter(&Adapter->gen_lock);
+
+	if (e1000g_unicst_find(Adapter, mac_addr) != -1) {
+		/* The same address is already in a slot */
+		mutex_exit(&Adapter->gen_lock);
+		return (0);
+	}
+
+	if (Adapter->unicst_avail == 0) {
+		/* no slots available */
+		mutex_exit(&Adapter->gen_lock);
+		return (ENOSPC);
+	}
+
+	/* Search for a free slot */
+	for (slot = 0; slot < Adapter->unicst_total; slot++) {
+		if (Adapter->unicst_addr[slot].mac.set == 0)
+			break;
+	}
+	ASSERT(slot < Adapter->unicst_total);
+
+	e1000g_unicst_set(Adapter, mac_addr, slot);
+	Adapter->unicst_avail--;
+
+	mutex_exit(&Adapter->gen_lock);
+
+	return (0);
+}
+
+static int
+e1000g_remmac(void *arg, const uint8_t *mac_addr)
+{
+	struct e1000g *Adapter = (struct e1000g *)arg;
+	int slot;
+
+	mutex_enter(&Adapter->gen_lock);
+
+	slot = e1000g_unicst_find(Adapter, mac_addr);
+	if (slot == -1) {
+		mutex_exit(&Adapter->gen_lock);
+		return (EINVAL);
+	}
+
+	ASSERT(Adapter->unicst_addr[slot].mac.set);
+
+	/* Clear this slot */
+	e1000g_unicst_set(Adapter, NULL, slot);
+	Adapter->unicst_avail++;
+
+	mutex_exit(&Adapter->gen_lock);
+
+	return (0);
+}
+
+static int
+e1000g_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+{
+	e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)rh;
+
+	mutex_enter(&rx_ring->rx_lock);
+	rx_ring->ring_gen_num = mr_gen_num;
+	mutex_exit(&rx_ring->rx_lock);
+	return (0);
+}
+
+/*
+ * Callback function for the MAC layer to register all rings.
+ *
+ * The hardware supports a single group with currently only one ring
+ * available.
+ * Though not offering virtualization ability per se, exposing the
+ * group/ring still enables the polling and interrupt toggling.
+ */
+void
+e1000g_fill_ring(void *arg, mac_ring_type_t rtype, const int grp_index,
+    const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+	struct e1000g *Adapter = (struct e1000g *)arg;
+	e1000g_rx_ring_t *rx_ring = Adapter->rx_ring;
+	mac_intr_t *mintr;
+
+	/*
+	 * We advertised only RX group/rings, so the MAC framework shouldn't
+	 * ask for anything else.
+	 */
+	ASSERT(rtype == MAC_RING_TYPE_RX && grp_index == 0 && ring_index == 0);
+
+	rx_ring->mrh = rx_ring->mrh_init = rh;
+	infop->mri_driver = (mac_ring_driver_t)rx_ring;
+	infop->mri_start = e1000g_ring_start;
+	infop->mri_stop = NULL;
+	infop->mri_poll = e1000g_poll_ring;
+
+	/* Ring level interrupts */
+	mintr = &infop->mri_intr;
+	mintr->mi_handle = (mac_intr_handle_t)rx_ring;
+	mintr->mi_enable = e1000g_rx_ring_intr_enable;
+	mintr->mi_disable = e1000g_rx_ring_intr_disable;
+}
+
+static void
+e1000g_fill_group(void *arg, mac_ring_type_t rtype, const int grp_index,
+    mac_group_info_t *infop, mac_group_handle_t gh)
+{
+	struct e1000g *Adapter = (struct e1000g *)arg;
+	mac_intr_t *mintr;
+
+	/*
+	 * We advertised a single RX ring. Getting a request for anything else
+	 * signifies a bug in the MAC framework.
+ */ + ASSERT(rtype == MAC_RING_TYPE_RX && grp_index == 0); + + Adapter->rx_group = gh; + + infop->mgi_driver = (mac_group_driver_t)Adapter; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = e1000g_addmac; + infop->mgi_remmac = e1000g_remmac; + infop->mgi_count = 1; + + /* Group level interrupts */ + mintr = &infop->mgi_intr; + mintr->mi_handle = (mac_intr_handle_t)Adapter; + mintr->mi_enable = e1000g_rx_group_intr_enable; + mintr->mi_disable = e1000g_rx_group_intr_disable; +} + static boolean_t e1000g_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { @@ -2602,34 +2819,6 @@ e1000g_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); break; } - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE stating that we support polling is sufficient. - */ - break; - - case MAC_CAPAB_MULTIADDRESS: { - multiaddress_capab_t *mmacp = cap_data; - - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. - */ - mmacp->maddr_naddr = Adapter->unicst_total - 1; - mmacp->maddr_naddrfree = Adapter->unicst_avail; - /* No multiple factory addresses, set mma_flag to 0 */ - mmacp->maddr_flag = 0; - mmacp->maddr_handle = Adapter; - mmacp->maddr_add = e1000g_m_unicst_add; - mmacp->maddr_remove = e1000g_m_unicst_remove; - mmacp->maddr_modify = e1000g_m_unicst_modify; - mmacp->maddr_get = e1000g_m_unicst_get; - mmacp->maddr_reserve = NULL; - break; - } case MAC_CAPAB_LSO: { mac_capab_lso_t *cap_lso = cap_data; @@ -2642,7 +2831,20 @@ e1000g_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); break; } + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; + /* No TX rings exposed yet */ + if (cap_rings->mr_type != MAC_RING_TYPE_RX) + return (B_FALSE); + + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = 1; + cap_rings->mr_gnum = 1; + cap_rings->mr_rget = e1000g_fill_ring; + cap_rings->mr_gget = e1000g_fill_group; + break; + } default: return (B_FALSE); } @@ -3124,32 +3326,6 @@ e1000g_set_priv_prop(struct e1000g *Adapter, const char *pr_name, } return (err); } - if (strcmp(pr_name, "_tx_recycle_thresh") == 0) { - if (pr_val == NULL) { - err = EINVAL; - return (err); - } - (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); - if (result < MIN_TX_RECYCLE_THRESHOLD || - result > MAX_TX_RECYCLE_THRESHOLD) - err = EINVAL; - else - Adapter->tx_recycle_thresh = (uint32_t)result; - return (err); - } - if (strcmp(pr_name, "_tx_recycle_num") == 0) { - if (pr_val == NULL) { - err = EINVAL; - return (err); - } - (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); - if (result < MIN_TX_RECYCLE_NUM || - result > MAX_TX_RECYCLE_NUM) - err = EINVAL; - else - Adapter->tx_recycle_num = (uint32_t)result; - return (err); - } return (ENOTSUP); } @@ -3236,18 +3412,6 @@ e1000g_get_priv_prop(struct e1000g *Adapter, const char *pr_name, err = 0; goto done; } - if (strcmp(pr_name, "_tx_recycle_thresh") == 0) { - value = (is_default ? DEFAULT_TX_RECYCLE_THRESHOLD : - Adapter->tx_recycle_thresh); - err = 0; - goto done; - } - if (strcmp(pr_name, "_tx_recycle_num") == 0) { - value = (is_default ? 
DEFAULT_TX_RECYCLE_NUM : - Adapter->tx_recycle_num); - err = 0; - goto done; - } done: if (err == 0) { (void) snprintf(pr_val, pr_valsize, "%d", value); @@ -3368,22 +3532,6 @@ e1000g_get_conf(struct e1000g *Adapter) B_TRUE : B_FALSE; /* - * Tx recycle threshold - */ - Adapter->tx_recycle_thresh = - e1000g_get_prop(Adapter, "tx_recycle_thresh", - MIN_TX_RECYCLE_THRESHOLD, MAX_TX_RECYCLE_THRESHOLD, - DEFAULT_TX_RECYCLE_THRESHOLD); - - /* - * Tx recycle descriptor number - */ - Adapter->tx_recycle_num = - e1000g_get_prop(Adapter, "tx_recycle_num", - MIN_TX_RECYCLE_NUM, MAX_TX_RECYCLE_NUM, - DEFAULT_TX_RECYCLE_NUM); - - /* * Hardware checksum enable/disable parameter */ Adapter->tx_hcksum_enable = @@ -3672,6 +3820,23 @@ e1000g_reset_link(struct e1000g *Adapter) } static void +e1000g_timer_tx_resched(struct e1000g *Adapter) +{ + e1000g_tx_ring_t *tx_ring = Adapter->tx_ring; + + if (tx_ring->resched_needed && + ((ddi_get_lbolt() - tx_ring->resched_timestamp) > + drv_usectohz(1000000)) && + (Adapter->chip_state == E1000G_START) && + (tx_ring->tbd_avail >= DEFAULT_TX_NO_RESOURCE)) { + tx_ring->resched_needed = B_FALSE; + mac_tx_update(Adapter->mh); + E1000G_STAT(tx_ring->stat_reschedule); + E1000G_STAT(tx_ring->stat_timer_reschedule); + } +} + +static void e1000g_local_timer(void *ws) { struct e1000g *Adapter = (struct e1000g *)ws; @@ -3683,10 +3848,11 @@ e1000g_local_timer(void *ws) if (Adapter->chip_state == E1000G_ERROR) { Adapter->reset_count++; - if (e1000g_global_reset(Adapter)) + if (e1000g_global_reset(Adapter)) { ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_RESTORED); - else + e1000g_timer_tx_resched(Adapter); + } else ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_LOST); return; @@ -3697,10 +3863,11 @@ e1000g_local_timer(void *ws) "Tx stall detected. Activate automatic recovery.\n"); e1000g_fm_ereport(Adapter, DDI_FM_DEVICE_STALL); Adapter->reset_count++; - if (e1000g_reset_adapter(Adapter)) + if (e1000g_reset_adapter(Adapter)) { ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_RESTORED); - else + e1000g_timer_tx_resched(Adapter); + } else ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_LOST); return; @@ -3769,6 +3936,8 @@ e1000g_local_timer(void *ws) if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED); + else + e1000g_timer_tx_resched(Adapter); restart_watchdog_timer(Adapter); } diff --git a/usr/src/uts/common/io/e1000g/e1000g_rx.c b/usr/src/uts/common/io/e1000g/e1000g_rx.c index 3bb4a5e90f..15d22b8c9a 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_rx.c +++ b/usr/src/uts/common/io/e1000g/e1000g_rx.c @@ -20,7 +20,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDLv1. + * Use is subject to license terms. */ /* @@ -147,10 +147,16 @@ e1000g_rxfree_func(p_rx_sw_packet_t packet) } } - mutex_enter(&rx_ring->freelist_lock); - QUEUE_PUSH_TAIL(&rx_ring->free_list, &packet->Link); - rx_ring->avail_freepkt++; - mutex_exit(&rx_ring->freelist_lock); + /* + * Enqueue the recycled packets in a recycle queue. When freelist + * dries up, move the entire chain of packets from recycle queue + * to freelist. This helps in avoiding per packet mutex contention + * around freelist. 
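The consuming side of this recycle scheme is e1000g_get_buf() in the next hunk: the free path touches only recycle_lock, while the allocation path holds freelist_lock and, when its freelist runs dry, splices the entire recycle list across in one constant-time QUEUE_SWITCH(). In miniature, with hypothetical ring_t, q_pop() and q_switch() stand-ins for the driver's queue primitives:

typedef struct ring {
	kmutex_t	freelist_lock;	/* allocation side */
	kmutex_t	recycle_lock;	/* free side */
	void		*free_list;
	void		*recycle_list;
} ring_t;

extern void *q_pop(void **);		/* stand-in for QUEUE_POP_HEAD */
extern void q_switch(void **, void **);	/* stand-in for QUEUE_SWITCH, O(1) */

static void *
buf_get(ring_t *r)
{
	void *pkt;

	mutex_enter(&r->freelist_lock);
	if ((pkt = q_pop(&r->free_list)) == NULL) {
		/* Freelist dry: splice in everything recycled so far. */
		mutex_enter(&r->recycle_lock);
		q_switch(&r->free_list, &r->recycle_list);
		mutex_exit(&r->recycle_lock);
		pkt = q_pop(&r->free_list);
	}
	mutex_exit(&r->freelist_lock);
	return (pkt);
}

The two locks are only ever held together on this refill path, so the steady-state producer (packet free) and consumer (buffer allocation) no longer contend on every packet.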
+	 */
+	mutex_enter(&rx_ring->recycle_lock);
+	QUEUE_PUSH_TAIL(&rx_ring->recycle_list, &packet->Link);
+	rx_ring->recycle_freepkt++;
+	mutex_exit(&rx_ring->recycle_lock);
 
 	rw_exit(&e1000g_rx_detach_lock);
 }
@@ -236,6 +242,8 @@ e1000g_rx_setup(struct e1000g *Adapter)
 
 		/* Init the list of "Free Receive Buffer" */
 		QUEUE_INIT_LIST(&rx_ring->free_list);
+		/* Init the recycle list of "Free Receive Buffer" */
+		QUEUE_INIT_LIST(&rx_ring->recycle_list);
 
 		/*
 		 * Setup Receive list and the Free list. Note that
 		 * the both were allocated in one packet area.
@@ -263,6 +271,7 @@
 			    &packet->Link);
 		}
 		rx_ring->avail_freepkt = Adapter->rx_freelist_num;
+		rx_ring->recycle_freepkt = 0;
 
 		Adapter->rx_buffer_setup = B_TRUE;
 	} else {
@@ -414,8 +423,23 @@ e1000g_get_buf(e1000g_rx_ring_t *rx_ring)
 	mutex_enter(&rx_ring->freelist_lock);
 	packet = (p_rx_sw_packet_t)
 	    QUEUE_POP_HEAD(&rx_ring->free_list);
-	if (packet != NULL)
+	if (packet != NULL) {
 		rx_ring->avail_freepkt--;
+	} else {
+		/*
+		 * If the freelist has no packets, check the recycle list
+		 * to see if there are any available descriptors there.
+		 */
+		mutex_enter(&rx_ring->recycle_lock);
+		QUEUE_SWITCH(&rx_ring->free_list, &rx_ring->recycle_list);
+		rx_ring->avail_freepkt = rx_ring->recycle_freepkt;
+		rx_ring->recycle_freepkt = 0;
+		mutex_exit(&rx_ring->recycle_lock);
+		packet = (p_rx_sw_packet_t)
+		    QUEUE_POP_HEAD(&rx_ring->free_list);
+		if (packet != NULL)
+			rx_ring->avail_freepkt--;
+	}
 	mutex_exit(&rx_ring->freelist_lock);
 
 	return (packet);
@@ -427,7 +451,7 @@
  * This routine will process packets received in an interrupt
 */
 mblk_t *
-e1000g_receive(struct e1000g *Adapter)
+e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz)
 {
 	struct e1000_hw *hw;
 	mblk_t *nmp;
@@ -443,7 +467,7 @@
 	boolean_t accept_frame;
 	boolean_t end_of_packet;
 	boolean_t need_copy;
-	e1000g_rx_ring_t *rx_ring;
+	struct e1000g *Adapter;
 	dma_buffer_t *rx_buf;
 	uint16_t cksumflags;
 
@@ -452,9 +476,10 @@
 	pkt_count = 0;
 	desc_count = 0;
 	cksumflags = 0;
+	*sz = 0;
 
+	Adapter = rx_ring->adapter;
 	hw = &Adapter->shared;
-	rx_ring = Adapter->rx_ring;
 
 	/* Sync the Rx descriptor DMA buffers */
 	(void) ddi_dma_sync(rx_ring->rbd_dma_handle,
@@ -805,6 +830,8 @@ rx_end_of_packet:
 			ret_nmp = rx_ring->rx_mblk;
 		}
 		ret_nmp->b_next = NULL;
+		*tail = ret_nmp;
+		*sz += length;
 
 		rx_ring->rx_mblk = NULL;
 		rx_ring->rx_mblk_tail = NULL;
diff --git a/usr/src/uts/common/io/e1000g/e1000g_stat.c b/usr/src/uts/common/io/e1000g/e1000g_stat.c
index 7df4317e9e..0c67c914a5 100644
--- a/usr/src/uts/common/io/e1000g/e1000g_stat.c
+++ b/usr/src/uts/common/io/e1000g/e1000g_stat.c
@@ -20,7 +20,7 @@
 
 /*
  * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDLv1.
+ * Use is subject to license terms.
*/ /* @@ -185,7 +185,8 @@ e1000g_update_stats(kstat_t *ksp, int rw) e1000g_ksp->rx_none.value.ul = rx_ring->stat_none; e1000g_ksp->rx_multi_desc.value.ul = rx_ring->stat_multi_desc; e1000g_ksp->rx_no_freepkt.value.ul = rx_ring->stat_no_freepkt; - e1000g_ksp->rx_avail_freepkt.value.ul = rx_ring->avail_freepkt; + e1000g_ksp->rx_avail_freepkt.value.ul = rx_ring->avail_freepkt + + rx_ring->recycle_freepkt; e1000g_ksp->tx_under_size.value.ul = tx_ring->stat_under_size; e1000g_ksp->tx_exceed_frags.value.ul = tx_ring->stat_exceed_frags; diff --git a/usr/src/uts/common/io/e1000g/e1000g_sw.h b/usr/src/uts/common/io/e1000g/e1000g_sw.h index 605440cd48..e7c56a5877 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_sw.h +++ b/usr/src/uts/common/io/e1000g/e1000g_sw.h @@ -54,7 +54,7 @@ extern "C" { #include <sys/kstat.h> #include <sys/modctl.h> #include <sys/errno.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/vlan.h> #include <sys/ddi.h> @@ -114,8 +114,6 @@ extern "C" { #define MAX_INTR_THROTTLING 65535 #define MAX_RX_BCOPY_THRESHOLD E1000_RX_BUFFER_SIZE_2K #define MAX_TX_BCOPY_THRESHOLD E1000_TX_BUFFER_SIZE_2K -#define MAX_TX_RECYCLE_THRESHOLD MAX_NUM_TX_DESCRIPTOR -#define MAX_TX_RECYCLE_NUM MAX_NUM_TX_DESCRIPTOR #define MIN_NUM_TX_DESCRIPTOR 80 #define MIN_NUM_RX_DESCRIPTOR 80 @@ -129,8 +127,6 @@ extern "C" { #define MIN_INTR_THROTTLING 0 #define MIN_RX_BCOPY_THRESHOLD 0 #define MIN_TX_BCOPY_THRESHOLD ETHERMIN -#define MIN_TX_RECYCLE_THRESHOLD 0 -#define MIN_TX_RECYCLE_NUM MAX_TX_DESC_PER_PACKET #define DEFAULT_NUM_RX_DESCRIPTOR 2048 #define DEFAULT_NUM_TX_DESCRIPTOR 2048 @@ -143,13 +139,11 @@ extern "C" { #define MIN_INTR_PER_SEC 3000 #define DEFAULT_INTR_PACKET_LOW 5 #define DEFAULT_INTR_PACKET_HIGH 128 -#define DEFAULT_TX_RECYCLE_THRESHOLD 512 #else #define MAX_INTR_PER_SEC 15000 #define MIN_INTR_PER_SEC 4000 #define DEFAULT_INTR_PACKET_LOW 10 #define DEFAULT_INTR_PACKET_HIGH 48 -#define DEFAULT_TX_RECYCLE_THRESHOLD DEFAULT_TX_NO_RESOURCE #endif #define DEFAULT_RX_INTR_DELAY 0 @@ -162,7 +156,6 @@ extern "C" { #define DEFAULT_RX_BCOPY_THRESHOLD 128 #define DEFAULT_TX_BCOPY_THRESHOLD 512 -#define DEFAULT_TX_RECYCLE_NUM 64 #define DEFAULT_TX_UPDATE_THRESHOLD 256 #define DEFAULT_TX_NO_RESOURCE MAX_TX_DESC_PER_PACKET @@ -402,6 +395,14 @@ extern "C" { (_LH1)->Blink = ((PSINGLE_LIST_LINK)(_LH2)->Blink); \ } + +#define QUEUE_SWITCH(_LH1, _LH2) \ + if ((_LH2)->Flink) { \ + (_LH1)->Flink = (_LH2)->Flink; \ + (_LH1)->Blink = (_LH2)->Blink; \ + (_LH2)->Flink = (_LH2)->Blink = (PSINGLE_LIST_LINK)0; \ + } + /* * Property lookups */ @@ -717,6 +718,7 @@ typedef struct _e1000g_tx_ring { * reschedule when tx resource is available */ boolean_t resched_needed; + clock_t resched_timestamp; uint32_t stall_watchdog; uint32_t recycle_fail; mblk_list_t mblks; @@ -727,6 +729,7 @@ typedef struct _e1000g_tx_ring { uint32_t stat_no_desc; uint32_t stat_send_fail; uint32_t stat_reschedule; + uint32_t stat_timer_reschedule; uint32_t stat_over_size; #ifdef E1000G_DEBUG uint32_t stat_under_size; @@ -752,6 +755,7 @@ typedef struct _e1000g_tx_ring { typedef struct _e1000g_rx_ring { kmutex_t rx_lock; kmutex_t freelist_lock; + kmutex_t recycle_lock; /* * Descriptor queue definitions */ @@ -768,13 +772,23 @@ typedef struct _e1000g_rx_ring { p_rx_sw_packet_t packet_area; LIST_DESCRIBER recv_list; LIST_DESCRIBER free_list; + LIST_DESCRIBER recycle_list; p_rx_sw_packet_t pending_list; uint32_t pending_count; uint32_t avail_freepkt; + uint32_t recycle_freepkt; uint32_t rx_mblk_len; mblk_t 
*rx_mblk; mblk_t *rx_mblk_tail; + mac_ring_handle_t mrh; + mac_ring_handle_t mrh_init; + uint64_t ring_gen_num; + mblk_t *poll_list_head; + mblk_t *poll_list_tail; + uint_t poll_list_sz; + boolean_t poll_flag; + /* * Statistics */ @@ -833,8 +847,6 @@ typedef struct e1000g { boolean_t intr_adaptive; boolean_t tx_intr_enable; - uint32_t tx_recycle_thresh; - uint32_t tx_recycle_num; uint32_t tx_intr_delay; uint32_t tx_intr_abs_delay; uint32_t rx_intr_delay; @@ -853,6 +865,9 @@ typedef struct e1000g { e1000g_rx_ring_t rx_ring[1]; e1000g_tx_ring_t tx_ring[1]; + mac_group_handle_t rx_group; + + kmutex_t gen_lock; /* General lock for the whole struct e1000g */ /* * Rx and Tx packet count for interrupt adaptive setting @@ -909,6 +924,8 @@ typedef struct e1000g { kstat_t *e1000g_ksp; + boolean_t poll_mode; + uint16_t phy_ctrl; /* contents of PHY_CTRL */ uint16_t phy_status; /* contents of PHY_STATUS */ uint16_t phy_an_adv; /* contents of PHY_AUTONEG_ADV */ @@ -980,7 +997,7 @@ void e1000g_free_tx_swpkt(p_tx_sw_packet_t packet); void e1000g_tx_freemsg(e1000g_tx_ring_t *tx_ring); uint_t e1000g_tx_softint_worker(caddr_t arg1, caddr_t arg2); mblk_t *e1000g_m_tx(void *arg, mblk_t *mp); -mblk_t *e1000g_receive(struct e1000g *Adapter); +mblk_t *e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz); void e1000g_rxfree_func(p_rx_sw_packet_t packet); int e1000g_m_stat(void *arg, uint_t stat, uint64_t *val); @@ -1008,6 +1025,7 @@ extern boolean_t e1000g_force_detach; extern uint32_t e1000g_mblks_pending; extern krwlock_t e1000g_rx_detach_lock; extern private_devi_list_t *e1000g_private_devi_list; +extern int e1000g_poll_mode; #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/e1000g/e1000g_tx.c b/usr/src/uts/common/io/e1000g/e1000g_tx.c index 4255c098b4..d67b67ff63 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_tx.c +++ b/usr/src/uts/common/io/e1000g/e1000g_tx.c @@ -20,7 +20,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDLv1. + * Use is subject to license terms. */ /* @@ -211,8 +211,7 @@ e1000g_send(struct e1000g *Adapter, mblk_t *mp) * Descriptors... As you may run short of them before getting any * transmit interrupt... */ - if (tx_ring->resched_needed || - (tx_ring->tbd_avail < Adapter->tx_recycle_thresh)) { + if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) { (void) e1000g_recycle(tx_ring); E1000G_DEBUG_STAT(tx_ring->stat_recycle); @@ -406,6 +405,7 @@ tx_send_failed: * Enable Transmit interrupts, so that the interrupt routine can * call mac_tx_update() when transmit descriptors become available. */ + tx_ring->resched_timestamp = ddi_get_lbolt(); tx_ring->resched_needed = B_TRUE; if (!Adapter->tx_intr_enable) e1000g_mask_tx_interrupt(Adapter); @@ -434,6 +434,7 @@ tx_no_resource: * Enable Transmit interrupts, so that the interrupt routine can * call mac_tx_update() when transmit descriptors become available. 
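Both failure paths in e1000g_send() now stamp resched_timestamp before raising resched_needed, which is what lets the new e1000g_timer_tx_resched() (added to e1000g_main.c above) recover a reschedule whose tx interrupt never arrived. A small user-space sketch of that guard follows; TICKS_PER_SEC stands in for drv_usectohz(1000000), TX_RESUME_THRESHOLD for DEFAULT_TX_NO_RESOURCE, and the chip-state check is omitted.

#include <stdbool.h>
#include <stdio.h>

#define TICKS_PER_SEC		100	/* stand-in for drv_usectohz(1000000) */
#define TX_RESUME_THRESHOLD	4	/* stand-in for DEFAULT_TX_NO_RESOURCE */

struct tx_ring {
	bool	resched_needed;		/* set on the tx failure paths */
	long	resched_timestamp;	/* in ticks, like ddi_get_lbolt() */
	int	tbd_avail;		/* free tx descriptors */
};

/*
 * Sketch of the e1000g_timer_tx_resched() check: resume transmission
 * only when a reschedule has been pending for more than a second and
 * descriptors have actually been recycled in the meantime.
 */
static bool
timer_tx_resched(struct tx_ring *tr, long now)
{
	if (tr->resched_needed &&
	    (now - tr->resched_timestamp) > TICKS_PER_SEC &&
	    tr->tbd_avail >= TX_RESUME_THRESHOLD) {
		tr->resched_needed = false;
		return (true);	/* the driver would call mac_tx_update() */
	}
	return (false);
}

int
main(void)
{
	struct tx_ring tr = { true, 0, 8 };

	(void) printf("tick 50:  resume=%d\n", timer_tx_resched(&tr, 50));
	(void) printf("tick 150: resume=%d\n", timer_tx_resched(&tr, 150));
	return (0);
}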
*/ + tx_ring->resched_timestamp = ddi_get_lbolt(); tx_ring->resched_needed = B_TRUE; if (!Adapter->tx_intr_enable) e1000g_mask_tx_interrupt(Adapter); @@ -449,9 +450,14 @@ e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context, uintptr_t ip_start; uintptr_t tcp_start; mblk_t *nmp; + uint32_t lsoflags; + uint32_t mss; bzero(cur_context, sizeof (context_data_t)); + /* first check lso information */ + lso_info_get(mp, &mss, &lsoflags); + /* retrieve checksum info */ hcksum_retrieve(mp, NULL, NULL, &cur_context->cksum_start, &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags); @@ -464,45 +470,48 @@ e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context, cur_context->ether_header_size = sizeof (struct ether_header); - if (cur_context->cksum_flags & HW_LSO) { - if ((cur_context->mss = DB_LSOMSS(mp)) != 0) { - /* free the invaid packet */ - if (!((cur_context->cksum_flags & HCK_PARTIALCKSUM) && - (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) { - return (B_FALSE); - } - cur_context->lso_flag = B_TRUE; - /* - * Some fields are cleared for the hardware to fill - * in. We don't assume Ethernet header, IP header and - * TCP header are always in the same mblk fragment, - * while we assume each header is always within one - * mblk fragment and Ethernet header is always in the - * first mblk fragment. - */ - nmp = mp; - ip_start = (uintptr_t)(nmp->b_rptr) - + cur_context->ether_header_size; - if (ip_start >= (uintptr_t)(nmp->b_wptr)) { - ip_start = (uintptr_t)nmp->b_cont->b_rptr - + (ip_start - (uintptr_t)(nmp->b_wptr)); - nmp = nmp->b_cont; - } - tcp_start = ip_start + - IPH_HDR_LENGTH((ipha_t *)ip_start); - if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { - tcp_start = (uintptr_t)nmp->b_cont->b_rptr - + (tcp_start - (uintptr_t)(nmp->b_wptr)); - nmp = nmp->b_cont; - } - cur_context->hdr_len = cur_context->ether_header_size - + IPH_HDR_LENGTH((ipha_t *)ip_start) - + TCP_HDR_LENGTH((tcph_t *)tcp_start); - ((ipha_t *)ip_start)->ipha_length = 0; - ((ipha_t *)ip_start)->ipha_hdr_checksum = 0; - /* calculate the TCP packet payload length */ - cur_context->pay_len = msg_size - cur_context->hdr_len; + if (lsoflags & HW_LSO) { + ASSERT(mss != 0); + + /* free the invalid packet */ + if (mss == 0 || + !((cur_context->cksum_flags & HCK_PARTIALCKSUM) && + (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) { + return (B_FALSE); + } + cur_context->mss = (uint16_t)mss; + cur_context->lso_flag = B_TRUE; + + /* + * Some fields are cleared for the hardware to fill + * in. We don't assume Ethernet header, IP header and + * TCP header are always in the same mblk fragment, + * while we assume each header is always within one + * mblk fragment and Ethernet header is always in the + * first mblk fragment. 
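The rewritten branch above fetches the MSS and LSO flags with lso_info_get() rather than the removed DB_LSOMSS() macro, and rejects an LSO packet whose MSS is zero or whose checksum flags are incomplete. A self-contained sketch of just that validation; the flag values here are invented stand-ins for the kernel's HW_LSO, HCK_PARTIALCKSUM and HCK_IPV4_HDRCKSUM bits.

#include <stdbool.h>
#include <stdio.h>

#define HW_LSO			0x04	/* invented stand-in values */
#define HCK_PARTIALCKSUM	0x01
#define HCK_IPV4_HDRCKSUM	0x02

/*
 * Sketch of the new check in e1000g_retrieve_context(): an LSO packet
 * must carry a nonzero MSS and both checksum offload flags, otherwise
 * the caller frees it as invalid.
 */
static bool
lso_context_ok(unsigned int lsoflags, unsigned int mss,
    unsigned int cksum_flags)
{
	if (!(lsoflags & HW_LSO))
		return (true);		/* not LSO: nothing to verify */
	return (mss != 0 &&
	    (cksum_flags & HCK_PARTIALCKSUM) &&
	    (cksum_flags & HCK_IPV4_HDRCKSUM));
}

int
main(void)
{
	(void) printf("%d\n", lso_context_ok(HW_LSO, 1460,
	    HCK_PARTIALCKSUM | HCK_IPV4_HDRCKSUM));	/* 1: accepted */
	(void) printf("%d\n", lso_context_ok(HW_LSO, 0,
	    HCK_PARTIALCKSUM | HCK_IPV4_HDRCKSUM));	/* 0: dropped */
	return (0);
}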
+ */ + nmp = mp; + ip_start = (uintptr_t)(nmp->b_rptr) + + cur_context->ether_header_size; + if (ip_start >= (uintptr_t)(nmp->b_wptr)) { + ip_start = (uintptr_t)nmp->b_cont->b_rptr + + (ip_start - (uintptr_t)(nmp->b_wptr)); + nmp = nmp->b_cont; } + tcp_start = ip_start + + IPH_HDR_LENGTH((ipha_t *)ip_start); + if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { + tcp_start = (uintptr_t)nmp->b_cont->b_rptr + + (tcp_start - (uintptr_t)(nmp->b_wptr)); + nmp = nmp->b_cont; + } + cur_context->hdr_len = cur_context->ether_header_size + + IPH_HDR_LENGTH((ipha_t *)ip_start) + + TCP_HDR_LENGTH((tcph_t *)tcp_start); + ((ipha_t *)ip_start)->ipha_length = 0; + ((ipha_t *)ip_start)->ipha_hdr_checksum = 0; + /* calculate the TCP packet payload length */ + cur_context->pay_len = msg_size - cur_context->hdr_len; } return (B_TRUE); } @@ -816,7 +825,6 @@ e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list, return (desc_count); } - /* * e1000g_tx_setup - setup tx data structures * @@ -955,7 +963,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) mblk_t *nmp; struct e1000_tx_desc *descriptor; int desc_count; - int is_intr; /* * This function will examine each TxSwPacket in the 'used' queue @@ -972,13 +979,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) return (0); } - is_intr = servicing_interrupt(); - - if (is_intr) - mutex_enter(&tx_ring->usedlist_lock); - else if (mutex_tryenter(&tx_ring->usedlist_lock) == 0) - return (0); - desc_count = 0; QUEUE_INIT_LIST(&pending_list); @@ -987,7 +987,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) 0, 0, DDI_DMA_SYNC_FORKERNEL); if (e1000g_check_dma_handle( tx_ring->tbd_dma_handle) != DDI_FM_OK) { - mutex_exit(&tx_ring->usedlist_lock); ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED); Adapter->chip_state = E1000G_ERROR; return (0); @@ -996,6 +995,7 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) /* * While there are still TxSwPackets in the used queue check them */ + mutex_enter(&tx_ring->usedlist_lock); while ((packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) { @@ -1030,9 +1030,6 @@ e1000g_recycle(e1000g_tx_ring_t *tx_ring) descriptor + 1; desc_count += packet->num_desc; - - if (is_intr && (desc_count >= Adapter->tx_recycle_num)) - break; } else { /* * Found a sw packet that the e1000g is not done diff --git a/usr/src/uts/common/io/hxge/hxge.h b/usr/src/uts/common/io/hxge/hxge.h index 837cbbc90c..37183afc7d 100644 --- a/usr/src/uts/common/io/hxge/hxge.h +++ b/usr/src/uts/common/io/hxge/hxge.h @@ -202,7 +202,6 @@ typedef struct _hxge_stats_t { hxge_pfc_stats_t pfc_stats; /* pfc stats */ hxge_port_stats_t port_stats; /* port stats */ - hxge_mmac_stats_t mmac_stats; /* Multi mac. 
stats */ hxge_peu_sys_stats_t peu_sys_stats; /* PEU system stats */ } hxge_stats_t, *p_hxge_stats_t; @@ -357,7 +356,6 @@ struct _hxge_t { uint32_t hxge_port_rbr_size; uint32_t hxge_port_rcr_size; uint32_t hxge_port_tx_ring_size; - hxge_mmac_t hxge_mmac_info; kmutex_t pio_lock; hxge_timeout timeout; diff --git a/usr/src/uts/common/io/hxge/hxge_impl.h b/usr/src/uts/common/io/hxge/hxge_impl.h index 57ad2c9a21..67bab83787 100644 --- a/usr/src/uts/common/io/hxge/hxge_impl.h +++ b/usr/src/uts/common/io/hxge/hxge_impl.h @@ -68,8 +68,7 @@ extern "C" { #include <sys/netlb.h> #include <sys/ddi_intr.h> -#include <sys/mac.h> -#include <sys/mac_impl.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> /* @@ -315,32 +314,6 @@ typedef struct _dev_regs_t { unsigned char *hxge_romp; /* fcode pointer */ } dev_regs_t, *p_dev_regs_t; -typedef struct _nxge_mac_addr_t { - ether_addr_t addr; - uint_t flags; -} hxge_mac_addr_t; - -/* - * Driver alternate mac address structure. - */ -typedef struct _hxge_mmac_t { - uint8_t total_factory_macs; - uint8_t num_mmac; - uint8_t num_factory_mmac; - hxge_mac_addr_t mac_pool[16]; - ether_addr_t factory_mac_pool[16]; - uint8_t naddrfree; /* number of alt mac addr available */ -} hxge_mmac_t; - -/* - * mmac stats structure - */ -typedef struct _hxge_mmac_stats_t { - uint8_t mmac_max_cnt; - uint8_t mmac_avail_cnt; - struct ether_addr mmac_avail_pool[16]; -} hxge_mmac_stats_t, *p_hxge_mmac_stats_t; - #include <hxge_common_impl.h> #include <hxge_common.h> #include <hxge_rxdma.h> diff --git a/usr/src/uts/common/io/hxge/hxge_kstats.c b/usr/src/uts/common/io/hxge/hxge_kstats.c index 9e3a86e953..1629c7c828 100644 --- a/usr/src/uts/common/io/hxge/hxge_kstats.c +++ b/usr/src/uts/common/io/hxge/hxge_kstats.c @@ -261,50 +261,6 @@ hxge_kstat_index_t hxge_pfc_stats[] = { }; typedef enum { - MMAC_MAX_ADDR, - MMAC_AVAIL_ADDR, - MMAC_ADDR_POOL1, - MMAC_ADDR_POOL2, - MMAC_ADDR_POOL3, - MMAC_ADDR_POOL4, - MMAC_ADDR_POOL5, - MMAC_ADDR_POOL6, - MMAC_ADDR_POOL7, - MMAC_ADDR_POOL8, - MMAC_ADDR_POOL9, - MMAC_ADDR_POOL10, - MMAC_ADDR_POOL11, - MMAC_ADDR_POOL12, - MMAC_ADDR_POOL13, - MMAC_ADDR_POOL14, - MMAC_ADDR_POOL15, - MMAC_ADDR_POOL16, - MMAC_STATS_END -} hxge_mmac_stat_index_t; - -hxge_kstat_index_t hxge_mmac_stats[] = { - {MMAC_MAX_ADDR, KSTAT_DATA_UINT64, "max_mmac_addr"}, - {MMAC_AVAIL_ADDR, KSTAT_DATA_UINT64, "avail_mmac_addr"}, - {MMAC_ADDR_POOL1, KSTAT_DATA_UINT64, "mmac_addr_1"}, - {MMAC_ADDR_POOL2, KSTAT_DATA_UINT64, "mmac_addr_2"}, - {MMAC_ADDR_POOL3, KSTAT_DATA_UINT64, "mmac_addr_3"}, - {MMAC_ADDR_POOL4, KSTAT_DATA_UINT64, "mmac_addr_4"}, - {MMAC_ADDR_POOL5, KSTAT_DATA_UINT64, "mmac_addr_5"}, - {MMAC_ADDR_POOL6, KSTAT_DATA_UINT64, "mmac_addr_6"}, - {MMAC_ADDR_POOL7, KSTAT_DATA_UINT64, "mmac_addr_7"}, - {MMAC_ADDR_POOL8, KSTAT_DATA_UINT64, "mmac_addr_8"}, - {MMAC_ADDR_POOL9, KSTAT_DATA_UINT64, "mmac_addr_9"}, - {MMAC_ADDR_POOL10, KSTAT_DATA_UINT64, "mmac_addr_10"}, - {MMAC_ADDR_POOL11, KSTAT_DATA_UINT64, "mmac_addr_11"}, - {MMAC_ADDR_POOL12, KSTAT_DATA_UINT64, "mmac_addr_12"}, - {MMAC_ADDR_POOL13, KSTAT_DATA_UINT64, "mmac_addr_13"}, - {MMAC_ADDR_POOL14, KSTAT_DATA_UINT64, "mmac_addr_14"}, - {MMAC_ADDR_POOL15, KSTAT_DATA_UINT64, "mmac_addr_15"}, - {MMAC_ADDR_POOL16, KSTAT_DATA_UINT64, "mmac_addr_16"}, - {MMAC_STATS_END, NULL, NULL}, -}; - -typedef enum { SPC_ACC_ERR = 0, TDC_PIOACC_ERR, RDC_PIOACC_ERR, @@ -580,75 +536,6 @@ hxge_pfc_stat_update(kstat_t *ksp, int rw) return (0); } -static uint64_t -hxge_mac_octet_to_u64(struct ether_addr addr) -{ - int i; - uint64_t addr64 = 0; - 
- for (i = ETHERADDRL - 1; i >= 0; i--) { - addr64 <<= 8; - addr64 |= addr.ether_addr_octet[i]; - } - return (addr64); -} - -/* ARGSUSED */ -int -hxge_mmac_stat_update(kstat_t *ksp, int rw) -{ - p_hxge_t hxgep; - p_hxge_mmac_kstat_t mmac_kstatsp; - p_hxge_mmac_stats_t statsp; - - hxgep = (p_hxge_t)ksp->ks_private; - if (hxgep == NULL) - return (-1); - - HXGE_DEBUG_MSG((hxgep, KST_CTL, "==> hxge_mmac_stat_update")); - - mmac_kstatsp = (p_hxge_mmac_kstat_t)ksp->ks_data; - statsp = (p_hxge_mmac_stats_t)&hxgep->statsp->mmac_stats; - - mmac_kstatsp->mmac_max_addr_cnt.value.ul = statsp->mmac_max_cnt; - mmac_kstatsp->mmac_avail_addr_cnt.value.ul = statsp->mmac_avail_cnt; - mmac_kstatsp->mmac_addr1.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[0]); - mmac_kstatsp->mmac_addr2.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[1]); - mmac_kstatsp->mmac_addr3.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[2]); - mmac_kstatsp->mmac_addr4.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[3]); - mmac_kstatsp->mmac_addr5.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[4]); - mmac_kstatsp->mmac_addr6.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[5]); - mmac_kstatsp->mmac_addr7.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[6]); - mmac_kstatsp->mmac_addr8.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[7]); - mmac_kstatsp->mmac_addr9.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[8]); - mmac_kstatsp->mmac_addr10.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[9]); - mmac_kstatsp->mmac_addr11.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[10]); - mmac_kstatsp->mmac_addr12.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[11]); - mmac_kstatsp->mmac_addr13.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[12]); - mmac_kstatsp->mmac_addr14.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[13]); - mmac_kstatsp->mmac_addr15.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[14]); - mmac_kstatsp->mmac_addr16.value.ul = - hxge_mac_octet_to_u64(statsp->mmac_avail_pool[15]); - - HXGE_DEBUG_MSG((hxgep, KST_CTL, "<== hxge_mmac_stat_update")); - return (0); -} - /* ARGSUSED */ int hxge_peu_sys_stat_update(kstat_t *ksp, int rw) @@ -722,7 +609,6 @@ hxge_setup_kstats(p_hxge_t hxgep) p_hxge_port_kstat_t hxgekp; size_t hxge_kstat_sz; char stat_name[64]; - char mmac_name[64]; int i; HXGE_DEBUG_MSG((hxgep, KST_CTL, "==> hxge_setup_kstats")); @@ -779,14 +665,6 @@ hxge_setup_kstats(p_hxge_t hxgep) if (hxgep->statsp->vmac_ksp == NULL) cmn_err(CE_WARN, "kstat_create failed for vmac"); - /* Setup MMAC statistics */ - (void) sprintf(mmac_name, "MMAC Stats%d", hxgep->instance); - hxgep->statsp->mmac_ksp = hxge_setup_local_kstat(hxgep, - hxgep->instance, "MMAC", - &hxge_mmac_stats[0], MMAC_STATS_END, hxge_mmac_stat_update); - if (hxgep->statsp->mmac_ksp == NULL) - cmn_err(CE_WARN, "kstat_create failed for mmac"); - /* Setup PEU System statistics */ hxgep->statsp->peu_sys_ksp = hxge_setup_local_kstat(hxgep, hxgep->instance, "PEU", &hxge_peu_sys_stats[0], diff --git a/usr/src/uts/common/io/hxge/hxge_main.c b/usr/src/uts/common/io/hxge/hxge_main.c index b58bf49d8d..47a61060bf 100644 --- a/usr/src/uts/common/io/hxge/hxge_main.c +++ b/usr/src/uts/common/io/hxge/hxge_main.c @@ -151,13 +151,8 @@ static int hxge_m_unicst(void *, const uint8_t *); static int hxge_m_multicst(void *, boolean_t, const uint8_t *); static int hxge_m_promisc(void *, boolean_t); static void hxge_m_ioctl(void 
*, queue_t *, mblk_t *); -static void hxge_m_resources(void *); static hxge_status_t hxge_mac_register(p_hxge_t hxgep); -static int hxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr); -static int hxge_m_mmac_remove(void *arg, mac_addr_slot_t slot); -static int hxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr); -static int hxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr); static boolean_t hxge_m_getcapab(void *, mac_capab_t, void *); static boolean_t hxge_param_locked(mac_prop_id_t pr_num); static int hxge_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num, @@ -196,7 +191,7 @@ mac_priv_prop_t hxge_priv_props[] = { #define MAX_DUMP_SZ 256 #define HXGE_M_CALLBACK_FLAGS \ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) + (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) extern mblk_t *hxge_m_tx(void *arg, mblk_t *mp); extern hxge_status_t hxge_pfc_set_default_mac_addr(p_hxge_t hxgep); @@ -210,7 +205,6 @@ static mac_callbacks_t hxge_m_callbacks = { hxge_m_multicst, hxge_m_unicst, hxge_m_tx, - hxge_m_resources, hxge_m_ioctl, hxge_m_getcapab, NULL, @@ -2697,386 +2691,17 @@ hxge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) HXGE_DEBUG_MSG((hxgep, NEMO_CTL, "<== hxge_m_ioctl")); } -extern void hxge_rx_hw_blank(void *arg, time_t ticks, uint_t count); - -static void -hxge_m_resources(void *arg) -{ - p_hxge_t hxgep = arg; - mac_rx_fifo_t mrf; - p_rx_rcr_rings_t rcr_rings; - p_rx_rcr_ring_t *rcr_p; - p_rx_rcr_ring_t rcrp; - uint32_t i, ndmas; - int status; - - HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_m_resources")); - - MUTEX_ENTER(hxgep->genlock); - - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_m_resources: " - "hxge_init failed")); - MUTEX_EXIT(hxgep->genlock); - return; - } - } - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = hxge_rx_hw_blank; - mrf.mrf_arg = (void *)hxgep; - - mrf.mrf_normal_blank_time = RXDMA_RCR_TO_DEFAULT; - mrf.mrf_normal_pkt_count = RXDMA_RCR_PTHRES_DEFAULT; - - rcr_rings = hxgep->rx_rcr_rings; - rcr_p = rcr_rings->rcr_rings; - ndmas = rcr_rings->ndmas; - - /* - * Export our receive resources to the MAC layer. - */ - for (i = 0; i < ndmas; i++) { - rcrp = (void *)(p_rx_rcr_ring_t)rcr_p[i]; - rcrp->rcr_mac_handle = - mac_resource_add(hxgep->mach, (mac_resource_t *)&mrf); - - HXGE_DEBUG_MSG((hxgep, RX_CTL, - "==> hxge_m_resources: vdma %d dma %d " - "rcrptr 0x%016llx mac_handle 0x%016llx", - i, rcrp->rdc, rcr_p[i], rcrp->rcr_mac_handle)); - } - - MUTEX_EXIT(hxgep->genlock); - - HXGE_DEBUG_MSG((hxgep, RX_CTL, "<== hxge_m_resources")); -} - -/* - * Set an alternate MAC address - */ -static int -hxge_altmac_set(p_hxge_t hxgep, uint8_t *maddr, mac_addr_slot_t slot) -{ - uint64_t address; - uint64_t tmp; - hpi_status_t status; - uint8_t addrn; - int i; - - /* - * Convert a byte array to a 48 bit value. 
- * Need to check endianess if in doubt - */ - address = 0; - for (i = 0; i < ETHERADDRL; i++) { - tmp = maddr[i]; - address <<= 8; - address |= tmp; - } - - addrn = (uint8_t)slot; - status = hpi_pfc_set_mac_address(hxgep->hpi_handle, addrn, address); - if (status != HPI_SUCCESS) - return (EIO); - - return (0); -} - -static void -hxge_mmac_kstat_update(p_hxge_t hxgep, mac_addr_slot_t slot) -{ - p_hxge_mmac_stats_t mmac_stats; - int i; - hxge_mmac_t *mmac_info; - - mmac_info = &hxgep->hxge_mmac_info; - mmac_stats = &hxgep->statsp->mmac_stats; - mmac_stats->mmac_max_cnt = mmac_info->num_mmac; - mmac_stats->mmac_avail_cnt = mmac_info->naddrfree; - - for (i = 0; i < ETHERADDRL; i++) { - mmac_stats->mmac_avail_pool[slot].ether_addr_octet[i] = - mmac_info->mac_pool[slot].addr[(ETHERADDRL - 1) - i]; - } -} - -/* - * Find an unused address slot, set the address value to the one specified, - * enable the port to start filtering on the new MAC address. - * Returns: 0 on success. - */ -int -hxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr) -{ - p_hxge_t hxgep = arg; - mac_addr_slot_t slot; - hxge_mmac_t *mmac_info; - int err; - hxge_status_t status; - - mutex_enter(hxgep->genlock); - - /* - * Make sure that hxge is initialized, if _start() has - * not been called. - */ - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - mutex_exit(hxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &hxgep->hxge_mmac_info; - if (mmac_info->naddrfree == 0) { - mutex_exit(hxgep->genlock); - return (ENOSPC); - } - - if (!mac_unicst_verify(hxgep->mach, maddr->mma_addr, - maddr->mma_addrlen)) { - mutex_exit(hxgep->genlock); - return (EINVAL); - } - - /* - * Search for the first available slot. Because naddrfree - * is not zero, we are guaranteed to find one. - * Slot 0 is for unique (primary) MAC. The first alternate - * MAC slot is slot 1. - */ - for (slot = 1; slot < mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - - ASSERT(slot < mmac_info->num_mmac); - if ((err = hxge_altmac_set(hxgep, maddr->mma_addr, slot)) != 0) { - mutex_exit(hxgep->genlock); - return (err); - } - bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, ETHERADDRL); - mmac_info->mac_pool[slot].flags |= MMAC_SLOT_USED; - mmac_info->naddrfree--; - hxge_mmac_kstat_update(hxgep, slot); - - maddr->mma_slot = slot; - - mutex_exit(hxgep->genlock); - return (0); -} - -/* - * Remove the specified mac address and update - * the h/w not to filter the mac address anymore. - * Returns: 0, on success. - */ -int -hxge_m_mmac_remove(void *arg, mac_addr_slot_t slot) -{ - p_hxge_t hxgep = arg; - hxge_mmac_t *mmac_info; - int err = 0; - hxge_status_t status; - - mutex_enter(hxgep->genlock); - - /* - * Make sure that hxge is initialized, if _start() has - * not been called. - */ - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - mutex_exit(hxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &hxgep->hxge_mmac_info; - if (slot <= 0 || slot >= mmac_info->num_mmac) { - mutex_exit(hxgep->genlock); - return (EINVAL); - } - - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - if (hpi_pfc_mac_addr_disable(hxgep->hpi_handle, slot) == - HPI_SUCCESS) { - mmac_info->mac_pool[slot].flags &= ~MMAC_SLOT_USED; - mmac_info->naddrfree++; - /* - * Clear mac_pool[slot].addr so that kstat shows 0 - * alternate MAC address if the slot is not used. 
- */ - bzero(mmac_info->mac_pool[slot].addr, ETHERADDRL); - hxge_mmac_kstat_update(hxgep, slot); - } else { - err = EIO; - } - } else { - err = EINVAL; - } - - mutex_exit(hxgep->genlock); - return (err); -} - -/* - * Modify a mac address added by hxge_mmac_add(). - * Returns: 0, on success. - */ -int -hxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr) -{ - p_hxge_t hxgep = arg; - mac_addr_slot_t slot; - hxge_mmac_t *mmac_info; - int err = 0; - hxge_status_t status; - - if (!mac_unicst_verify(hxgep->mach, maddr->mma_addr, - maddr->mma_addrlen)) - return (EINVAL); - - slot = maddr->mma_slot; - - mutex_enter(hxgep->genlock); - - /* - * Make sure that hxge is initialized, if _start() has - * not been called. - */ - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - mutex_exit(hxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &hxgep->hxge_mmac_info; - if (slot <= 0 || slot >= mmac_info->num_mmac) { - mutex_exit(hxgep->genlock); - return (EINVAL); - } - - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - if ((err = hxge_altmac_set(hxgep, maddr->mma_addr, - slot)) == 0) { - bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, - ETHERADDRL); - hxge_mmac_kstat_update(hxgep, slot); - } - } else { - err = EINVAL; - } - - mutex_exit(hxgep->genlock); - return (err); -} - -/* - * static int - * hxge_m_mmac_get() - Get the MAC address and other information - * related to the slot. mma_flags should be set to 0 in the call. - * Note: although kstat shows MAC address as zero when a slot is - * not used, Crossbow expects hxge_m_mmac_get to copy factory MAC - * to the caller as long as the slot is not using a user MAC address. - * The following table shows the rules, - * - * USED VENDOR mma_addr - * ------------------------------------------------------------ - * (1) Slot uses a user MAC: yes no user MAC - * (2) Slot uses a factory MAC: yes yes factory MAC - * (3) Slot is not used but is - * factory MAC capable: no yes factory MAC - * (4) Slot is not used and is - * not factory MAC capable: no no 0 - * ------------------------------------------------------------ - */ -int -hxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr) -{ - hxge_t *hxgep = arg; - mac_addr_slot_t slot; - hxge_mmac_t *mmac_info; - hxge_status_t status; - - slot = maddr->mma_slot; - - mutex_enter(hxgep->genlock); - - /* - * Make sure that hxge is initialized, if _start() has - * not been called. - */ - if (!(hxgep->drv_state & STATE_HW_INITIALIZED)) { - status = hxge_init(hxgep); - if (status != HXGE_OK) { - mutex_exit(hxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &hxgep->hxge_mmac_info; - if (slot <= 0 || slot >= mmac_info->num_mmac) { - mutex_exit(hxgep->genlock); - return (EINVAL); - } - - maddr->mma_flags = 0; - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - maddr->mma_flags |= MMAC_SLOT_USED; - bcopy(mmac_info->mac_pool[slot].addr, - maddr->mma_addr, ETHERADDRL); - maddr->mma_addrlen = ETHERADDRL; - } - - mutex_exit(hxgep->genlock); - return (0); -} - /*ARGSUSED*/ boolean_t hxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { - p_hxge_t hxgep = (p_hxge_t)arg; uint32_t *txflags = cap_data; - multiaddress_capab_t *mmacp = cap_data; switch (cap) { case MAC_CAPAB_HCKSUM: *txflags = HCKSUM_INET_PARTIAL; break; - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning B_TRUE - * stating that we support polling is sufficient. 
- */ - break; - - case MAC_CAPAB_MULTIADDRESS: - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. - */ - mmacp->maddr_naddr = PFC_N_MAC_ADDRESSES - 1; - mmacp->maddr_naddrfree = hxgep->hxge_mmac_info.naddrfree; - mmacp->maddr_flag = 0; /* No multiple factory macs */ - mmacp->maddr_handle = hxgep; - mmacp->maddr_add = hxge_m_mmac_add; - mmacp->maddr_remove = hxge_m_mmac_remove; - mmacp->maddr_modify = hxge_m_mmac_modify; - mmacp->maddr_get = hxge_m_mmac_get; - mmacp->maddr_reserve = NULL; /* No multiple factory macs */ - break; default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/hxge/hxge_rxdma.c b/usr/src/uts/common/io/hxge/hxge_rxdma.c index 0c3747f6bd..2de507a8e9 100644 --- a/usr/src/uts/common/io/hxge/hxge_rxdma.c +++ b/usr/src/uts/common/io/hxge/hxge_rxdma.c @@ -1228,10 +1228,8 @@ hxge_rx_pkts_vring(p_hxge_t hxgep, uint_t vindex, p_hxge_ldv_t ldvp, #ifdef HXGE_DEBUG HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_rx_pkts_vring:calling mac_rx (NEMO) " - "LEN %d mp $%p mp->b_next $%p rcrp $%p " - "mac_handle $%p", - (mp->b_wptr - mp->b_rptr), mp, mp->b_next, - rcrp, rcrp->rcr_mac_handle)); + "LEN %d mp $%p mp->b_next $%p rcrp $%p", + (mp->b_wptr - mp->b_rptr), mp, mp->b_next, rcrp)); HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_rx_pkts_vring: dump packets " "(mp $%p b_rptr $%p b_wptr $%p):\n %s", @@ -1257,7 +1255,7 @@ hxge_rx_pkts_vring(p_hxge_t hxgep, uint_t vindex, p_hxge_ldv_t ldvp, HXGE_DEBUG_MSG((hxgep, RX_CTL, "==> hxge_rx_pkts_vring: send packet to stack")); - mac_rx(hxgep->mach, rcrp->rcr_mac_handle, mp); + mac_rx(hxgep->mach, NULL, mp); HXGE_DEBUG_MSG((hxgep, RX_CTL, "<== hxge_rx_pkts_vring")); } diff --git a/usr/src/uts/common/io/hxge/hxge_rxdma.h b/usr/src/uts/common/io/hxge/hxge_rxdma.h index c5277ca590..0d1808a67c 100644 --- a/usr/src/uts/common/io/hxge/hxge_rxdma.h +++ b/usr/src/uts/common/io/hxge/hxge_rxdma.h @@ -344,7 +344,6 @@ typedef struct _rx_rcr_ring_t { uint32_t intr_timeout; uint32_t intr_threshold; uint64_t max_receive_pkts; - mac_resource_handle_t rcr_mac_handle; uint32_t rcvd_pkt_bytes; /* Received bytes of a packet */ } rx_rcr_ring_t, *p_rx_rcr_ring_t; diff --git a/usr/src/uts/common/io/hxge/hxge_virtual.c b/usr/src/uts/common/io/hxge/hxge_virtual.c index b1eff782aa..bbc65993d0 100644 --- a/usr/src/uts/common/io/hxge/hxge_virtual.c +++ b/usr/src/uts/common/io/hxge/hxge_virtual.c @@ -36,7 +36,6 @@ static void hxge_set_hw_dma_config(p_hxge_t); static void hxge_set_hw_class_config(p_hxge_t); static void hxge_ldgv_setup(p_hxge_ldg_t *ldgp, p_hxge_ldv_t *ldvp, uint8_t ldv, uint8_t endldg, int *ngrps); -static hxge_status_t hxge_mmac_init(p_hxge_t); extern uint16_t hxge_rcr_timeout; extern uint16_t hxge_rcr_threshold; @@ -894,35 +893,11 @@ hxge_intr_mask_mgmt_set(p_hxge_t hxgep, boolean_t on) static hxge_status_t hxge_get_mac_addr_properties(p_hxge_t hxgep) { - uint32_t num_macs; - hxge_status_t status; - HXGE_DEBUG_MSG((hxgep, DDI_CTL, "==> hxge_get_mac_addr_properties ")); (void) hxge_pfc_mac_addrs_get(hxgep); hxgep->ouraddr = hxgep->factaddr; - /* - * Get the number of MAC addresses the Hydra supports per blade. - */ - if (hxge_pfc_num_macs_get(hxgep, &num_macs) == HXGE_OK) { - hxgep->hxge_mmac_info.num_mmac = (uint8_t)num_macs; - } else { - HXGE_ERROR_MSG((NULL, HXGE_ERR_CTL, - "hxge_get_mac_addr_properties: get macs failed")); - return (HXGE_ERROR); - } - - /* - * Initialize alt. mac addr. 
in the mac pool - */ - status = hxge_mmac_init(hxgep); - if (status != HXGE_OK) { - HXGE_ERROR_MSG((NULL, HXGE_ERR_CTL, - "hxge_get_mac_addr_properties: init mmac failed")); - return (HXGE_ERROR); - } - HXGE_DEBUG_MSG((hxgep, DDI_CTL, "<== hxge_get_mac_addr_properties ")); return (HXGE_OK); } @@ -971,49 +946,3 @@ hxge_ldgv_setup(p_hxge_ldg_t *ldgp, p_hxge_ldv_t *ldvp, uint8_t ldv, HXGE_DEBUG_MSG((NULL, INT_CTL, "<== hxge_ldgv_setup")); } - -/* - * Note: This function assumes the following distribution of mac - * addresses for a hydra blade: - * - * ------------- - * 0| |0 - local-mac-address for blade - * ------------- - * | |1 - Start of alt. mac addr. for blade - * | | - * | | - * | |15 - * -------------- - */ - -static hxge_status_t -hxge_mmac_init(p_hxge_t hxgep) -{ - int slot; - hxge_mmac_t *mmac_info; - - mmac_info = (hxge_mmac_t *)&hxgep->hxge_mmac_info; - - /* Set flags for unique MAC */ - mmac_info->mac_pool[0].flags |= MMAC_SLOT_USED | MMAC_VENDOR_ADDR; - mmac_info->num_factory_mmac = 1; - - /* - * Skip the factory/default address which is in slot 0. - * Initialze all other mac addr. to "AVAILABLE" state. - * Clear flags of all alternate MAC slots. - */ - for (slot = 1; slot < mmac_info->num_mmac; slot++) { - (void) hpi_pfc_clear_mac_address(hxgep->hpi_handle, slot); - mmac_info->mac_pool[slot].flags = 0; - } - - /* Exclude the factory mac address */ - mmac_info->naddrfree = mmac_info->num_mmac - 1; - - /* Initialize the first two parameters for mmac kstat */ - hxgep->statsp->mmac_stats.mmac_max_cnt = mmac_info->num_mmac; - hxgep->statsp->mmac_stats.mmac_avail_cnt = mmac_info->naddrfree; - - return (HXGE_OK); -} diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd.c b/usr/src/uts/common/io/ib/clients/ibd/ibd.c index 099e2036c8..7992e1007b 100644 --- a/usr/src/uts/common/io/ib/clients/ibd/ibd.c +++ b/usr/src/uts/common/io/ib/clients/ibd/ibd.c @@ -37,6 +37,7 @@ #include <sys/strsun.h> #include <sys/strsubr.h> #include <sys/dlpi.h> +#include <sys/mac_provider.h> #include <sys/pattr.h> /* for HCK_PARTIALCKSUM */ #include <sys/sysmacros.h> /* for offsetof */ @@ -310,7 +311,6 @@ static mac_callbacks_t ib_m_callbacks = { ibd_m_unicst, ibd_m_tx, NULL, - NULL, ibd_m_getcapab }; @@ -4102,13 +4102,6 @@ ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); break; } - case MAC_CAPAB_POLL: - /* - * Fallthrough to default, as we don't support GLDv3 - * polling. When blanking is implemented, we will need to - * change this to return B_TRUE in addition to registering - * an mc_resources callback. - */ default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/igb/igb.conf b/usr/src/uts/common/io/igb/igb.conf index c2ae8d4cd3..93860209f0 100644 --- a/usr/src/uts/common/io/igb/igb.conf +++ b/usr/src/uts/common/io/igb/igb.conf @@ -1,19 +1,17 @@ # # CDDL HEADER START # -# Copyright(c) 2007-2008 Intel Corporation. All rights reserved. # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # -# You can obtain a copy of the license at: -# http://www.opensolaris.org/os/licensing. +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # -# When using or redistributing this file, you may do so under the -# License only. No other modification of this header is permitted. 
-# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] @@ -21,11 +19,11 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms of the CDDL. +# Copyright(c) 2007-2008 Intel Corporation. All rights reserved. # +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # # Driver.conf file for Intel 1Gb ethernet driver (igb) @@ -121,29 +119,29 @@ # flow_control = 3; # # -------------------- Transmit/Receive Queues -------------------- -# tx_queue_number -# The number of the transmit queues -# Allowed values: 1 - 4 -# Default value: 1 # # tx_ring_size # The number of the transmit descriptors per transmit queue # Allowed values: 64 - 4096 # Default value: 512 # -# rx_queue_number -# The number of the receive queues -# Allowed values: 1 - 4 -# Default value: 1 -# # rx_ring_size # The number of the receive descriptors per receive queue # Allowed values: 64 - 4096 # Default value: 512 # +# mr_enable +# Enable multiple rx queues and tx queues +# Allowed values: 0, 1 +# Default value: 1 +# +# rx_group_number +# The number of the receive ring groups +# Allowed values: 1 - 4 +# Default value: 1 +# +# Note: If the specified value of rx_group_number is not supported by the +# hardware, rx_group_number will be downgraded to an acceptable value. # # -------- How to set parameters for a particular interface --------- # The example below shows how to locate the device path and set a parameter diff --git a/usr/src/uts/common/io/igb/igb_gld.c b/usr/src/uts/common/io/igb/igb_gld.c index d897a484e3..c1213647ec 100644 --- a/usr/src/uts/common/io/igb/igb_gld.c +++ b/usr/src/uts/common/io/igb/igb_gld.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL.
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "igb_sw.h" @@ -555,37 +555,6 @@ igb_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr) } /* - * Set a new device unicast address. - */ -int -igb_m_unicst(void *arg, const uint8_t *mac_addr) -{ - igb_t *igb = (igb_t *)arg; - int result; - - mutex_enter(&igb->gen_lock); - - if (igb->igb_state & IGB_SUSPENDED) { - mutex_exit(&igb->gen_lock); - return (ECANCELED); - } - - /* - * Store the new MAC address. - */ - bcopy(mac_addr, igb->hw.mac.addr, ETHERADDRL); - - /* - * Set MAC address in address slot 0, which is the default address. - */ - result = igb_unicst_set(igb, mac_addr, 0); - - mutex_exit(&igb->gen_lock); - - return (result); -} - -/* * Pass on M_IOCTL messages passed to the DLD, and support * private IOCTLs for debugging and ndd. */ @@ -654,18 +623,16 @@ igb_m_ioctl(void *arg, queue_t *q, mblk_t *mp) } } - /* - * Find an unused address slot, set the address to it, reserve - * this slot and enable the device to start filtering on the - * new address. + * Add a MAC address to the target RX group. */ -int -igb_m_unicst_add(void *arg, mac_multi_addr_t *maddr) +static int +igb_addmac(void *arg, const uint8_t *mac_addr) { - igb_t *igb = (igb_t *)arg; - mac_addr_slot_t slot; - int err; + igb_rx_group_t *rx_group = (igb_rx_group_t *)arg; + igb_t *igb = rx_group->igb; + struct e1000_hw *hw = &igb->hw; + int i, slot; mutex_enter(&igb->gen_lock); @@ -674,12 +641,6 @@ igb_m_unicst_add(void *arg, mac_multi_addr_t *maddr) return (ECANCELED); } - if (mac_unicst_verify(igb->mac_hdl, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) { - mutex_exit(&igb->gen_lock); - return (EINVAL); - } - if (igb->unicst_avail == 0) { /* no slots available */ mutex_exit(&igb->gen_lock); @@ -687,39 +648,55 @@ igb_m_unicst_add(void *arg, mac_multi_addr_t *maddr) } /* - * Primary/default address is in slot 0. The next addresses - * are the multiple MAC addresses. So multiple MAC address 0 - * is in slot 1, 1 in slot 2, and so on. So the first multiple - * MAC address resides in slot 1. + * Slots 0 through igb->num_rx_groups - 1 are reserved and are + * mapped 1:1 to the group indices. The remaining slots are + * shared among all of the groups. When adding a MAC address, + * the group's reserved slot is tried first, then the shared slots. */ - for (slot = 1; slot < igb->unicst_total; slot++) { - if (igb->unicst_addr[slot].mac.set == 0) - break; - } + slot = -1; + if (igb->unicst_addr[rx_group->index].mac.set == 1) { + /* + * The reserved slot for the current group is in use; look + * for a free slot among the shared slots. + */ + for (i = igb->num_rx_groups; i < igb->unicst_total; i++) { + if (igb->unicst_addr[i].mac.set == 0) { + slot = i; + break; + } + } + } else + slot = rx_group->index; - ASSERT((slot > 0) && (slot < igb->unicst_total)); + if (slot == -1) { + /* no slots available in the shared slots */ + mutex_exit(&igb->gen_lock); + return (ENOSPC); + } - maddr->mma_slot = slot; + /* Set VMDq according to the mode supported by hardware.
*/ + e1000_rar_set_vmdq(hw, mac_addr, slot, igb->vmdq_mode, rx_group->index); - if ((err = igb_unicst_set(igb, maddr->mma_addr, slot)) == 0) { - igb->unicst_addr[slot].mac.set = 1; - igb->unicst_avail--; - } + bcopy(mac_addr, igb->unicst_addr[slot].mac.addr, ETHERADDRL); + igb->unicst_addr[slot].mac.group_index = rx_group->index; + igb->unicst_addr[slot].mac.set = 1; + igb->unicst_avail--; mutex_exit(&igb->gen_lock); - return (err); + return (0); } - /* - * Removes a MAC address that was added before. + * Remove a MAC address from the specified RX group. */ -int -igb_m_unicst_remove(void *arg, mac_addr_slot_t slot) +static int +igb_remmac(void *arg, const uint8_t *mac_addr) { - igb_t *igb = (igb_t *)arg; - int err; + igb_rx_group_t *rx_group = (igb_rx_group_t *)arg; + igb_t *igb = rx_group->igb; + struct e1000_hw *hw = &igb->hw; + int slot; mutex_enter(&igb->gen_lock); @@ -728,7 +705,8 @@ igb_m_unicst_remove(void *arg, mac_addr_slot_t slot) return (ECANCELED); } - if ((slot <= 0) || (slot >= igb->unicst_total)) { + slot = igb_unicst_find(igb, mac_addr); + if (slot == -1) { mutex_exit(&igb->gen_lock); return (EINVAL); } @@ -738,104 +716,189 @@ igb_m_unicst_remove(void *arg, mac_addr_slot_t slot) return (EINVAL); } - /* Copy the default address to the passed slot */ - if ((err = igb_unicst_set(igb, - igb->unicst_addr[0].mac.addr, slot)) == 0) { - igb->unicst_addr[slot].mac.set = 0; - igb->unicst_avail++; - } + /* Clear the MAC address in the slot */ + e1000_rar_clear(hw, slot); + igb->unicst_addr[slot].mac.set = 0; + igb->unicst_avail++; mutex_exit(&igb->gen_lock); - return (err); + return (0); } /* - * Modifies the value of an address that has been added before. - * The new address length and the slot number that was returned - * in the call to add should be passed in. mma_flags should be - * set to 0. - * Returns 0 on success. + * Enable interrupt on the specified rx ring. */ int -igb_m_unicst_modify(void *arg, mac_multi_addr_t *maddr) +igb_rx_ring_intr_enable(mac_intr_handle_t intrh) { - igb_t *igb = (igb_t *)arg; - mac_addr_slot_t slot; - int err; - - mutex_enter(&igb->gen_lock); + igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)intrh; + igb_t *igb = rx_ring->igb; + struct e1000_hw *hw = &igb->hw; + uint32_t index = rx_ring->index; - if (igb->igb_state & IGB_SUSPENDED) { - mutex_exit(&igb->gen_lock); - return (ECANCELED); + if (igb->intr_type == DDI_INTR_TYPE_MSIX) { + /* Interrupt enabling for MSI-X */ + igb->eims_mask |= (E1000_EICR_RX_QUEUE0 << index); + E1000_WRITE_REG(hw, E1000_EIMS, igb->eims_mask); + E1000_WRITE_REG(hw, E1000_EIAC, igb->eims_mask); + } else { + ASSERT(index == 0); + /* Interrupt enabling for MSI and legacy */ + igb->ims_mask |= E1000_IMS_RXT0; + E1000_WRITE_REG(hw, E1000_IMS, igb->ims_mask); } - if (mac_unicst_verify(igb->mac_hdl, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) { - mutex_exit(&igb->gen_lock); - return (EINVAL); - } + E1000_WRITE_FLUSH(hw); - slot = maddr->mma_slot; + return (0); +} - if ((slot <= 0) || (slot >= igb->unicst_total)) { - mutex_exit(&igb->gen_lock); - return (EINVAL); +/* + * Disable interrupt on the specified rx ring.
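igb_addmac() above is a two-tier allocator: a group first claims the RAR slot equal to its own index, and only then falls back to the shared slots at num_rx_groups and beyond. A runnable sketch of that selection, using an arbitrary geometry of 2 groups and 6 total slots in place of the driver's unicst bookkeeping:

#include <stdio.h>

#define NUM_RX_GROUPS	2	/* arbitrary example geometry */
#define UNICST_TOTAL	6	/* total RAR slots */

static int slot_set[UNICST_TOTAL];	/* mirrors unicst_addr[i].mac.set */

/*
 * Sketch of igb_addmac()'s slot choice: try the group's reserved slot
 * (slot == group index), then fall back to the shared slots in
 * [NUM_RX_GROUPS, UNICST_TOTAL).  Returns -1 when everything is full.
 */
static int
pick_slot(int group)
{
	int i;

	if (slot_set[group] == 0)
		return (group);		/* reserved slot still free */
	for (i = NUM_RX_GROUPS; i < UNICST_TOTAL; i++) {
		if (slot_set[i] == 0)
			return (i);
	}
	return (-1);
}

int
main(void)
{
	int g, s;

	/* Add three addresses to group 1: reserved slot first, then shared. */
	for (g = 0; g < 3; g++) {
		s = pick_slot(1);
		if (s >= 0)
			slot_set[s] = 1;
		(void) printf("add to group 1 -> slot %d\n", s);
	}
	return (0);
}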
*/ +int +igb_rx_ring_intr_disable(mac_intr_handle_t intrh) +{ + igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)intrh; + igb_t *igb = rx_ring->igb; + struct e1000_hw *hw = &igb->hw; + uint32_t index = rx_ring->index; + + if (igb->intr_type == DDI_INTR_TYPE_MSIX) { + /* Interrupt disabling for MSI-X */ + igb->eims_mask &= ~(E1000_EICR_RX_QUEUE0 << index); + E1000_WRITE_REG(hw, E1000_EIMC, + (E1000_EICR_RX_QUEUE0 << index)); + E1000_WRITE_REG(hw, E1000_EIAC, igb->eims_mask); + } else { + ASSERT(index == 0); + /* Interrupt disabling for MSI and legacy */ + igb->ims_mask &= ~E1000_IMS_RXT0; + E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0); } - if (igb->unicst_addr[slot].mac.set == 0) { - mutex_exit(&igb->gen_lock); - return (EINVAL); + E1000_WRITE_FLUSH(hw); + + return (0); +} + +/* + * Get the global ring index from a ring index within a group. + */ +int +igb_get_rx_ring_index(igb_t *igb, int gindex, int rindex) +{ + igb_rx_ring_t *rx_ring; + int i; + + for (i = 0; i < igb->num_rx_rings; i++) { + rx_ring = &igb->rx_rings[i]; + if (rx_ring->group_index == gindex) + rindex--; + if (rindex < 0) + return (i); } - err = igb_unicst_set(igb, maddr->mma_addr, slot); + return (-1); +} - mutex_exit(&igb->gen_lock); +static int +igb_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num) +{ + igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)rh; - return (err); + mutex_enter(&rx_ring->rx_lock); + rx_ring->ring_gen_num = mr_gen_num; + mutex_exit(&rx_ring->rx_lock); + return (0); } /* - * Get the MAC address and all other information related to - * the address slot passed in mac_multi_addr_t. - * mma_flags should be set to 0 in the call. - * On return, mma_flags can take the following values: - * 1) MMAC_SLOT_UNUSED - * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR - * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR - * 4) MMAC_SLOT_USED + * Callback function for the MAC layer to register all rings. */ -int -igb_m_unicst_get(void *arg, mac_multi_addr_t *maddr) +/* ARGSUSED */ +void +igb_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) { igb_t *igb = (igb_t *)arg; - mac_addr_slot_t slot; + mac_intr_t *mintr = &infop->mri_intr; - mutex_enter(&igb->gen_lock); + switch (rtype) { + case MAC_RING_TYPE_RX: { + igb_rx_ring_t *rx_ring; + int global_index; - if (igb->igb_state & IGB_SUSPENDED) { - mutex_exit(&igb->gen_lock); - return (ECANCELED); - } + /* + * 'index' is the ring index within the group. + * We need the global ring index, found by searching in the group.
+ */ + global_index = igb_get_rx_ring_index(igb, rg_index, index); - slot = maddr->mma_slot; + ASSERT(global_index >= 0); - if ((slot <= 0) || (slot >= igb->unicst_total)) { - mutex_exit(&igb->gen_lock); - return (EINVAL); + rx_ring = &igb->rx_rings[global_index]; + rx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)rx_ring; + infop->mri_start = igb_ring_start; + infop->mri_stop = NULL; + infop->mri_poll = (mac_ring_poll_t)igb_rx_ring_poll; + + mintr->mi_handle = (mac_intr_handle_t)rx_ring; + mintr->mi_enable = igb_rx_ring_intr_enable; + mintr->mi_disable = igb_rx_ring_intr_disable; + + break; } + case MAC_RING_TYPE_TX: { + ASSERT(index < igb->num_tx_rings); - if (igb->unicst_addr[slot].mac.set == 1) { - bcopy(igb->unicst_addr[slot].mac.addr, - maddr->mma_addr, ETHERADDRL); - maddr->mma_flags = MMAC_SLOT_USED; - } else { - maddr->mma_flags = MMAC_SLOT_UNUSED; + igb_tx_ring_t *tx_ring = &igb->tx_rings[index]; + tx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)tx_ring; + infop->mri_start = NULL; + infop->mri_stop = NULL; + infop->mri_tx = igb_tx_ring_send; + + break; } - mutex_exit(&igb->gen_lock); + default: + break; + } +} - return (0); +void +igb_fill_group(void *arg, mac_ring_type_t rtype, const int index, + mac_group_info_t *infop, mac_group_handle_t gh) +{ + igb_t *igb = (igb_t *)arg; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + igb_rx_group_t *rx_group; + + ASSERT((index >= 0) && (index < igb->num_rx_groups)); + + rx_group = &igb->rx_groups[index]; + rx_group->group_handle = gh; + + infop->mgi_driver = (mac_group_driver_t)rx_group; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = igb_addmac; + infop->mgi_remmac = igb_remmac; + infop->mgi_count = (igb->num_rx_rings / igb->num_rx_groups); + + break; + } + case MAC_RING_TYPE_TX: + break; + default: + break; + } } /* @@ -863,27 +926,34 @@ igb_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) *tx_hcksum_flags = HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM; break; } - case MAC_CAPAB_MULTIADDRESS: { - multiaddress_capab_t *mmacp = cap_data; + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; + + switch (cap_rings->mr_type) { + case MAC_RING_TYPE_RX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = igb->num_rx_rings; + cap_rings->mr_gnum = igb->num_rx_groups; + cap_rings->mr_rget = igb_fill_ring; + cap_rings->mr_gget = igb_fill_group; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. 
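igb_fill_ring() above is handed a (group index, ring-within-group) pair by the MAC layer while the driver keeps a single flat rx_rings[] array, so igb_get_rx_ring_index() counts down rindex across the rings whose group_index matches. A runnable sketch of that mapping, assuming the same i / ring_per_group group assignment that igb_setup_rx() applies:

#include <stdio.h>

#define NUM_RX_RINGS	4	/* arbitrary example geometry */
#define NUM_RX_GROUPS	2

static int group_index[NUM_RX_RINGS];

/*
 * Same walk as igb_get_rx_ring_index(): find the rindex-th ring
 * (0-based) whose group_index equals gindex.
 */
static int
get_rx_ring_index(int gindex, int rindex)
{
	int i;

	for (i = 0; i < NUM_RX_RINGS; i++) {
		if (group_index[i] == gindex)
			rindex--;
		if (rindex < 0)
			return (i);
	}
	return (-1);
}

int
main(void)
{
	int i, ring_per_group = NUM_RX_RINGS / NUM_RX_GROUPS;

	/* Mirror igb_setup_rx(): ring i belongs to group i / ring_per_group. */
	for (i = 0; i < NUM_RX_RINGS; i++)
		group_index[i] = i / ring_per_group;

	(void) printf("group 1, ring 0 -> global %d\n", get_rx_ring_index(1, 0));
	(void) printf("group 1, ring 1 -> global %d\n", get_rx_ring_index(1, 1));
	return (0);
}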
- */ - mmacp->maddr_naddr = igb->unicst_total - 1; - mmacp->maddr_naddrfree = igb->unicst_avail; - /* No multiple factory addresses, set mma_flag to 0 */ - mmacp->maddr_flag = 0; - mmacp->maddr_handle = igb; - mmacp->maddr_add = igb_m_unicst_add; - mmacp->maddr_remove = igb_m_unicst_remove; - mmacp->maddr_modify = igb_m_unicst_modify; - mmacp->maddr_get = igb_m_unicst_get; - mmacp->maddr_reserve = NULL; + break; + case MAC_RING_TYPE_TX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = igb->num_tx_rings; + cap_rings->mr_gnum = 0; + cap_rings->mr_rget = igb_fill_ring; + cap_rings->mr_gget = NULL; + + break; + default: + break; + } break; } + default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/igb/igb_hw.h b/usr/src/uts/common/io/igb/igb_hw.h index 814b0c09fb..04c410d7d1 100644 --- a/usr/src/uts/common/io/igb/igb_hw.h +++ b/usr/src/uts/common/io/igb/igb_hw.h @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,8 +20,12 @@ */ /* + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. + */ + +/* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Use is subject to license terms. 
*/ /* IntelVersion: 1.357 v2007-12-10_dragonlake5 */ @@ -31,8 +33,6 @@ #ifndef _IGB_HW_H #define _IGB_HW_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -620,6 +620,9 @@ s32 e1000_read_pcie_cap_reg(struct e1000_hw *hw, u32 reg, u16 *value); void e1000_free_dev_spec_struct(struct e1000_hw *hw); void e1000_read_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value); void e1000_write_pci_cfg(struct e1000_hw *hw, u32 reg, u16 *value); +void e1000_rar_clear(struct e1000_hw *hw, uint32_t); +void e1000_rar_set_vmdq(struct e1000_hw *hw, const uint8_t *, uint32_t, + uint32_t, uint8_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/igb/igb_main.c b/usr/src/uts/common/io/igb/igb_main.c index 18a7050e7e..ed475f0014 100644 --- a/usr/src/uts/common/io/igb/igb_main.c +++ b/usr/src/uts/common/io/igb/igb_main.c @@ -60,6 +60,8 @@ static void igb_setup_tx(igb_t *); static void igb_setup_rx_ring(igb_rx_ring_t *); static void igb_setup_tx_ring(igb_tx_ring_t *); static void igb_setup_rss(igb_t *); +static void igb_setup_mac_rss_classify(igb_t *); +static void igb_setup_mac_classify(igb_t *); static void igb_init_unicst(igb_t *); static void igb_setup_multicst(igb_t *); static void igb_get_phy_state(igb_t *); @@ -93,10 +95,11 @@ static void igb_setup_adapter_msix(igb_t *); static uint_t igb_intr_legacy(void *, void *); static uint_t igb_intr_msi(void *, void *); static uint_t igb_intr_rx(void *, void *); +static uint_t igb_intr_tx(void *, void *); static uint_t igb_intr_tx_other(void *, void *); static void igb_intr_rx_work(igb_rx_ring_t *); static void igb_intr_tx_work(igb_tx_ring_t *); -static void igb_intr_other_work(igb_t *); +static void igb_intr_link_work(igb_t *); static void igb_get_driver_control(struct e1000_hw *); static void igb_release_driver_control(struct e1000_hw *); @@ -175,14 +178,12 @@ static mac_callbacks_t igb_m_callbacks = { igb_m_stop, igb_m_promisc, igb_m_multicst, - igb_m_unicst, - igb_m_tx, + NULL, NULL, igb_m_ioctl, igb_m_getcapab }; - /* * Module Initialization Functions */ @@ -339,7 +340,7 @@ igb_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) * interrupts are allocated. 
*/ if (igb_alloc_rings(igb) != IGB_SUCCESS) { - igb_error(igb, "Failed to allocate rx and tx rings"); + igb_error(igb, "Failed to allocate rx/tx rings or groups"); goto attach_fail; } igb->attach_progress |= ATTACH_PROGRESS_ALLOC_RINGS; @@ -378,10 +379,13 @@ igb_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd) /* * Initialize chipset hardware */ + mutex_enter(&igb->gen_lock); if (igb_init(igb) != IGB_SUCCESS) { + mutex_exit(&igb->gen_lock); igb_error(igb, "Failed to initialize adapter"); goto attach_fail; } + mutex_exit(&igb->gen_lock); igb->attach_progress |= ATTACH_PROGRESS_INIT; /* @@ -710,6 +714,7 @@ igb_register_mac(igb_t *igb) mac->m_max_sdu = igb->max_frame_size - sizeof (struct ether_vlan_header) - ETHERFCSL; mac->m_margin = VLAN_TAGSZ; + mac->m_v12n = MAC_VIRT_LEVEL1; status = mac_register(mac, &igb->mac_hdl); @@ -1019,7 +1024,7 @@ igb_init(igb_t *igb) uint32_t pba; uint32_t high_water; - mutex_enter(&igb->gen_lock); + ASSERT(mutex_owned(&igb->gen_lock)); /* * Reset chipset to put the hardware in a known state @@ -1121,7 +1126,6 @@ igb_init(igb_t *igb) goto init_fail; } - mutex_exit(&igb->gen_lock); return (IGB_SUCCESS); init_fail: @@ -1131,8 +1135,6 @@ init_fail: if (e1000_check_reset_block(hw) == E1000_SUCCESS) (void) e1000_phy_hw_reset(hw); - mutex_exit(&igb->gen_lock); - ddi_fm_service_impact(igb->dip, DDI_SERVICE_LOST); return (IGB_FAILURE); @@ -1541,9 +1543,12 @@ igb_start(igb_t *igb) /* * Start the chipset hardware */ - if (igb_chip_start(igb) != IGB_SUCCESS) { - igb_fm_ereport(igb, DDI_FM_DEVICE_INVAL_STATE); - goto start_failure; + if (!(igb->attach_progress & ATTACH_PROGRESS_INIT)) { + if (igb_init(igb) != IGB_SUCCESS) { + igb_fm_ereport(igb, DDI_FM_DEVICE_INVAL_STATE); + goto start_failure; + } + igb->attach_progress |= ATTACH_PROGRESS_INIT; } /* @@ -1591,6 +1596,8 @@ igb_stop(igb_t *igb) ASSERT(mutex_owned(&igb->gen_lock)); + igb->attach_progress &= ~ ATTACH_PROGRESS_INIT; + /* * Disable the adapter interrupts */ @@ -1656,6 +1663,23 @@ igb_alloc_rings(igb_t *igb) return (IGB_FAILURE); } + /* + * Allocate memory space for rx ring groups + */ + igb->rx_groups = kmem_zalloc( + sizeof (igb_rx_group_t) * igb->num_rx_groups, + KM_NOSLEEP); + + if (igb->rx_groups == NULL) { + kmem_free(igb->rx_rings, + sizeof (igb_rx_ring_t) * igb->num_rx_rings); + kmem_free(igb->tx_rings, + sizeof (igb_tx_ring_t) * igb->num_tx_rings); + igb->rx_rings = NULL; + igb->tx_rings = NULL; + return (IGB_FAILURE); + } + return (IGB_SUCCESS); } @@ -1676,6 +1700,12 @@ igb_free_rings(igb_t *igb) sizeof (igb_tx_ring_t) * igb->num_tx_rings); igb->tx_rings = NULL; } + + if (igb->rx_groups != NULL) { + kmem_free(igb->rx_groups, + sizeof (igb_rx_group_t) * igb->num_rx_groups); + igb->rx_groups = NULL; + } } /* @@ -1782,8 +1812,10 @@ static void igb_setup_rx(igb_t *igb) { igb_rx_ring_t *rx_ring; + igb_rx_group_t *rx_group; struct e1000_hw *hw = &igb->hw; uint32_t reg_val; + uint32_t ring_per_group; int i; /* @@ -1804,12 +1836,24 @@ igb_setup_rx(igb_t *igb) E1000_WRITE_REG(hw, E1000_RCTL, reg_val); + for (i = 0; i < igb->num_rx_groups; i++) { + rx_group = &igb->rx_groups[i]; + rx_group->index = i; + rx_group->igb = igb; + } + /* * igb_setup_rx_ring must be called after configuring RCTL */ + ring_per_group = igb->num_rx_rings / igb->num_rx_groups; for (i = 0; i < igb->num_rx_rings; i++) { rx_ring = &igb->rx_rings[i]; igb_setup_rx_ring(rx_ring); + + /* + * Map a ring to a group by assigning a group index + */ + rx_ring->group_index = i / ring_per_group; } /* @@ -1829,10 +1873,32 @@ igb_setup_rx(igb_t *igb) } /* 
- * Setup RSS for multiple receive queues
+ * Setup classification and RSS for multiple receive queues
 */
- if (igb->num_rx_rings > 1)
- igb_setup_rss(igb);
+ switch (igb->vmdq_mode) {
+ case E1000_VMDQ_OFF:
+ /*
+ * One ring group; only RSS is needed when more than
+ * one ring is enabled.
+ */
+ if (igb->num_rx_rings > 1)
+ igb_setup_rss(igb);
+ break;
+ case E1000_VMDQ_MAC:
+ /*
+ * Multiple groups, each group has one ring;
+ * only the MAC classification is needed.
+ */
+ igb_setup_mac_classify(igb);
+ break;
+ case E1000_VMDQ_MAC_RSS:
+ /*
+ * Multiple groups and multiple rings; both
+ * MAC classification and RSS are needed.
+ */
+ igb_setup_mac_rss_classify(igb);
+ break;
+ }
 }

 static void
@@ -1848,6 +1914,7 @@ igb_setup_tx_ring(igb_tx_ring_t *tx_ring)
 ASSERT(mutex_owned(&tx_ring->tx_lock));
 ASSERT(mutex_owned(&igb->gen_lock));
+
 /*
 * Initialize the length register
 */
@@ -1922,6 +1989,14 @@ igb_setup_tx_ring(igb_tx_ring_t *tx_ring)
 }

 /*
+ * Enable the specific tx ring; this is required by multiple tx
+ * ring support.
+ */
+ reg_val = E1000_READ_REG(hw, E1000_TXDCTL(tx_ring->index));
+ reg_val |= E1000_TXDCTL_QUEUE_ENABLE;
+ E1000_WRITE_REG(hw, E1000_TXDCTL(tx_ring->index), reg_val);
+
+ /*
 * Initialize hardware checksum offload settings
 */
 tx_ring->hcksum_context.hcksum_flags = 0;
@@ -2036,6 +2111,117 @@ igb_setup_rss(igb_t *igb)
 }

 /*
+ * igb_setup_mac_rss_classify - Setup MAC classification and RSS
+ */
+static void
+igb_setup_mac_rss_classify(igb_t *igb)
+{
+ struct e1000_hw *hw = &igb->hw;
+ uint32_t i, mrqc, vmdctl, rxcsum;
+ uint32_t ring_per_group;
+ int shift_group0, shift_group1;
+ uint32_t random;
+ union e1000_reta {
+ uint32_t dword;
+ uint8_t bytes[4];
+ } reta;
+
+ ring_per_group = igb->num_rx_rings / igb->num_rx_groups;
+
+ /* Setup the Redirection Table; it is shared between the two groups */
+ shift_group0 = 2;
+ shift_group1 = 6;
+ for (i = 0; i < (32 * 4); i++) {
+ reta.bytes[i & 3] = ((i % ring_per_group) << shift_group0) |
+ ((ring_per_group + (i % ring_per_group)) << shift_group1);
+ if ((i & 3) == 3) {
+ E1000_WRITE_REG(hw,
+ (E1000_RETA(0) + (i & ~3)), reta.dword);
+ }
+ }
+
+ /* Fill out hash function seeds */
+ for (i = 0; i < 10; i++) {
+ (void) random_get_pseudo_bytes((uint8_t *)&random,
+ sizeof (uint32_t));
+ E1000_WRITE_REG(hw, E1000_RSSRK(i), random);
+ }
+
+ /*
+ * Setup the Multiple Receive Queue Control register;
+ * enable VMDq based on packet destination MAC address and RSS.
+ */
+ mrqc = E1000_MRQC_ENABLE_VMDQ_MAC_RSS_GROUP;
+ mrqc |= (E1000_MRQC_RSS_FIELD_IPV4 |
+ E1000_MRQC_RSS_FIELD_IPV4_TCP |
+ E1000_MRQC_RSS_FIELD_IPV6 |
+ E1000_MRQC_RSS_FIELD_IPV6_TCP |
+ E1000_MRQC_RSS_FIELD_IPV4_UDP |
+ E1000_MRQC_RSS_FIELD_IPV6_UDP |
+ E1000_MRQC_RSS_FIELD_IPV6_UDP_EX |
+ E1000_MRQC_RSS_FIELD_IPV6_TCP_EX);
+
+ E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
+
+ /* Define the default group and default queues */
+ vmdctl = E1000_VMDQ_MAC_GROUP_DEFAULT_QUEUE;
+ E1000_WRITE_REG(hw, E1000_VMD_CTL, vmdctl);
+
+ /*
+ * Disable Packet Checksum to enable RSS for multiple receive queues.
+ *
+ * The Packet Checksum is not the Ethernet CRC. It is another kind of
+ * checksum offloading provided by the 82575 chipset besides the IP
+ * header checksum offloading and the TCP/UDP checksum offloading.
+ * The Packet Checksum is by default computed over the entire packet
+ * from the first byte of the DA through the last byte of the CRC,
+ * including the Ethernet and IP headers.
+ *
+ * It is a hardware limitation that Packet Checksum is mutually
+ * exclusive with RSS.
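/*
 * Editor's sketch, not part of this changeset: a standalone model of the
 * redirection-table packing that igb_setup_mac_rss_classify() above
 * performs. The shift_group0/shift_group1 values are taken from the driver
 * code; the two-groups-by-four-rings split is an assumed example. Each
 * RETA byte carries group 0's queue at shift 2 and group 1's queue at
 * shift 6, truncated to eight bits just as the driver's reta.bytes[] is.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint8_t reta[128];
    uint32_t ring_per_group = 4;    /* assumed: 8 rx rings / 2 groups */
    int shift_group0 = 2, shift_group1 = 6;
    uint32_t i;

    for (i = 0; i < 128; i++) {
        reta[i] = (uint8_t)(((i % ring_per_group) << shift_group0) |
            ((ring_per_group + (i % ring_per_group)) << shift_group1));
    }

    /* The first four entries show one full cycle of the pattern. */
    for (i = 0; i < 4; i++)
        (void) printf("reta[%u] = 0x%02x\n", (unsigned)i,
            (unsigned)reta[i]);
    return (0);
}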
+ */
+ rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
+ rxcsum |= E1000_RXCSUM_PCSD;
+ E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
+}
+
+/*
+ * igb_setup_mac_classify - Setup MAC classification feature
+ */
+static void
+igb_setup_mac_classify(igb_t *igb)
+{
+ struct e1000_hw *hw = &igb->hw;
+ uint32_t mrqc, rxcsum;
+
+ /*
+ * Setup the Multiple Receive Queue Control register;
+ * enable VMDq based on packet destination MAC address.
+ */
+ mrqc = E1000_MRQC_ENABLE_VMDQ_MAC_GROUP;
+ E1000_WRITE_REG(hw, E1000_MRQC, mrqc);
+
+ /*
+ * Disable Packet Checksum to enable RSS for multiple receive queues.
+ *
+ * The Packet Checksum is not the Ethernet CRC. It is another kind of
+ * checksum offloading provided by the 82575 chipset besides the IP
+ * header checksum offloading and the TCP/UDP checksum offloading.
+ * The Packet Checksum is by default computed over the entire packet
+ * from the first byte of the DA through the last byte of the CRC,
+ * including the Ethernet and IP headers.
+ *
+ * It is a hardware limitation that Packet Checksum is mutually
+ * exclusive with RSS.
+ */
+ rxcsum = E1000_READ_REG(hw, E1000_RXCSUM);
+ rxcsum |= E1000_RXCSUM_PCSD;
+ E1000_WRITE_REG(hw, E1000_RXCSUM, rxcsum);
+}
+
+/*
 * igb_init_unicst - Initialize the unicast addresses
 */
static void
@@ -2049,41 +2235,39 @@ igb_init_unicst(igb_t *igb)
 *
 * 1. Chipset is initialized the first time
 * Initialize the multiple unicast addresses, and
- * save the default mac address.
+ * save the default MAC address.
 *
 * 2. Chipset is reset
 * Recover the multiple unicast addresses from the
 * software data structure to the RAR registers.
 */
- if (!igb->unicst_init) {
- /* Initialize the multiple unicast addresses */
- igb->unicst_total = MAX_NUM_UNICAST_ADDRESSES;
- igb->unicst_avail = igb->unicst_total - 1;
+ /*
+ * Clear the default MAC address in the RAR0 register,
+ * which is loaded from the EEPROM at system boot or chip reset;
+ * otherwise it conflicts with the add_mac/rem_mac entry
+ * points when VMDq is enabled. For this reason, the RAR0
+ * must be cleared for both cases mentioned above.
+ */ + e1000_rar_clear(hw, 0); - /* Store the default mac address */ - e1000_rar_set(hw, hw->mac.addr, 0); + if (!igb->unicst_init) { - bcopy(hw->mac.addr, igb->unicst_addr[0].mac.addr, - ETHERADDRL); - igb->unicst_addr[0].mac.set = 1; + /* Initialize the multiple unicast addresses */ + igb->unicst_total = MAX_NUM_UNICAST_ADDRESSES; + igb->unicst_avail = igb->unicst_total; - for (slot = 1; slot < igb->unicst_total; slot++) + for (slot = 0; slot < igb->unicst_total; slot++) igb->unicst_addr[slot].mac.set = 0; igb->unicst_init = B_TRUE; } else { - /* Recover the default mac address */ - bcopy(igb->unicst_addr[0].mac.addr, hw->mac.addr, - ETHERADDRL); - - /* Store the default mac address */ - e1000_rar_set(hw, hw->mac.addr, 0); - /* Re-configure the RAR registers */ - for (slot = 1; slot < igb->unicst_total; slot++) - e1000_rar_set(hw, - igb->unicst_addr[slot].mac.addr, slot); + for (slot = 0; slot < igb->unicst_total; slot++) { + e1000_rar_set_vmdq(hw, igb->unicst_addr[slot].mac.addr, + slot, igb->vmdq_mode, + igb->unicst_addr[slot].mac.group_index); + } } if (igb_check_acc_handle(igb->osdep.reg_handle) != DDI_FM_OK) @@ -2091,11 +2275,30 @@ igb_init_unicst(igb_t *igb) } /* + * igb_unicst_find - Find the slot for the specified unicast address + */ +int +igb_unicst_find(igb_t *igb, const uint8_t *mac_addr) +{ + int slot; + + ASSERT(mutex_owned(&igb->gen_lock)); + + for (slot = 0; slot < igb->unicst_total; slot++) { + if (bcmp(igb->unicst_addr[slot].mac.addr, + mac_addr, ETHERADDRL) == 0) + return (slot); + } + + return (-1); +} + +/* * igb_unicst_set - Set the unicast address to the specified slot */ int igb_unicst_set(igb_t *igb, const uint8_t *mac_addr, - mac_addr_slot_t slot) + int slot) { struct e1000_hw *hw = &igb->hw; @@ -2232,6 +2435,8 @@ igb_get_conf(igb_t *igb) struct e1000_hw *hw = &igb->hw; uint32_t default_mtu; uint32_t flow_control; + uint32_t ring_per_group; + int i; /* * igb driver supports the following user configurations: @@ -2299,16 +2504,66 @@ igb_get_conf(igb_t *igb) /* * Multiple rings configurations */ - igb->num_tx_rings = igb_get_prop(igb, PROP_TX_QUEUE_NUM, - MIN_TX_QUEUE_NUM, MAX_TX_QUEUE_NUM, DEFAULT_TX_QUEUE_NUM); igb->tx_ring_size = igb_get_prop(igb, PROP_TX_RING_SIZE, MIN_TX_RING_SIZE, MAX_TX_RING_SIZE, DEFAULT_TX_RING_SIZE); - - igb->num_rx_rings = igb_get_prop(igb, PROP_RX_QUEUE_NUM, - MIN_RX_QUEUE_NUM, MAX_RX_QUEUE_NUM, DEFAULT_RX_QUEUE_NUM); igb->rx_ring_size = igb_get_prop(igb, PROP_RX_RING_SIZE, MIN_RX_RING_SIZE, MAX_RX_RING_SIZE, DEFAULT_RX_RING_SIZE); + igb->mr_enable = igb_get_prop(igb, PROP_MR_ENABLE, 0, 1, 1); + igb->num_rx_groups = igb_get_prop(igb, PROP_RX_GROUP_NUM, + MIN_RX_GROUP_NUM, MAX_RX_GROUP_NUM, DEFAULT_RX_GROUP_NUM); + + if (igb->mr_enable) { + igb->num_tx_rings = DEFAULT_TX_QUEUE_NUM; + igb->num_rx_rings = DEFAULT_RX_QUEUE_NUM; + } else { + igb->num_tx_rings = 1; + igb->num_rx_rings = 1; + + if (igb->num_rx_groups > 1) { + igb_error(igb, + "Invalid rx groups number. Please enable multiple " + "rings first"); + igb->num_rx_groups = 1; + } + } + + /* + * Check the divisibility between rx rings and rx groups. + */ + for (i = igb->num_rx_groups; i > 0; i--) { + if ((igb->num_rx_rings % i) == 0) + break; + } + if (i != igb->num_rx_groups) { + igb_error(igb, + "Invalid rx groups number. Downgrade the rx group " + "number to %d.", i); + igb->num_rx_groups = i; + } + + /* + * Get the ring number per group. 
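/*
 * Editor's sketch, not driver code: the divisibility check in
 * igb_get_conf() above, restated as a standalone function. It walks down
 * from the configured group count to the largest value that evenly
 * divides the ring count, which is exactly what the driver's downgrade
 * loop does before it logs the adjustment.
 */
#include <stdio.h>

static int
downgrade_groups(int num_rx_rings, int num_rx_groups)
{
    int i;

    for (i = num_rx_groups; i > 0; i--) {
        if ((num_rx_rings % i) == 0)
            break;
    }
    return (i);
}

int
main(void)
{
    /* 4 rings cannot be split into 3 groups; 2 is the downgrade. */
    (void) printf("%d\n", downgrade_groups(4, 3));
    return (0);
}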
+ */ + ring_per_group = igb->num_rx_rings / igb->num_rx_groups; + + if (igb->num_rx_groups == 1) { + /* + * One rx ring group, the rx ring number is num_rx_rings. + */ + igb->vmdq_mode = E1000_VMDQ_OFF; + } else if (ring_per_group == 1) { + /* + * Multiple rx groups, each group has one rx ring. + */ + igb->vmdq_mode = E1000_VMDQ_MAC; + } else { + /* + * Multiple groups and multiple rings. + */ + igb->vmdq_mode = E1000_VMDQ_MAC_RSS; + } + /* * Tunable used to force an interrupt type. The only use is * for testing of the lesser interrupt types. @@ -2861,6 +3116,7 @@ igb_enable_adapter_interrupts(igb_t *igb) /* Interrupt enabling for MSI-X */ E1000_WRITE_REG(hw, E1000_EIMS, igb->eims_mask); E1000_WRITE_REG(hw, E1000_EIAC, igb->eims_mask); + igb->ims_mask = E1000_IMS_LSC; E1000_WRITE_REG(hw, E1000_IMS, E1000_IMS_LSC); /* Enable MSI-X PBA support */ @@ -2873,6 +3129,7 @@ igb_enable_adapter_interrupts(igb_t *igb) E1000_WRITE_REG(hw, E1000_CTRL_EXT, reg); } else { /* Interrupt enabling for MSI and legacy */ + igb->ims_mask = IMS_ENABLE_MASK; E1000_WRITE_REG(hw, E1000_IMS, IMS_ENABLE_MASK); } @@ -3176,11 +3433,12 @@ igb_intr_rx_work(igb_rx_ring_t *rx_ring) mblk_t *mp; mutex_enter(&rx_ring->rx_lock); - mp = igb_rx(rx_ring); + mp = igb_rx(rx_ring, IGB_NO_POLL); mutex_exit(&rx_ring->rx_lock); if (mp != NULL) - mac_rx(rx_ring->igb->mac_hdl, NULL, mp); + mac_rx_ring(rx_ring->igb->mac_hdl, rx_ring->ring_handle, mp, + rx_ring->ring_gen_num); } #pragma inline(igb_intr_tx_work) @@ -3197,17 +3455,17 @@ igb_intr_tx_work(igb_tx_ring_t *tx_ring) if (tx_ring->reschedule && (tx_ring->tbd_free >= tx_ring->resched_thresh)) { tx_ring->reschedule = B_FALSE; - mac_tx_update(tx_ring->igb->mac_hdl); + mac_tx_ring_update(tx_ring->igb->mac_hdl, tx_ring->ring_handle); IGB_DEBUG_STAT(tx_ring->stat_reschedule); } } -#pragma inline(igb_intr_other_work) +#pragma inline(igb_intr_link_work) /* - * igb_intr_other_work - other processing of ISR + * igb_intr_link_work - link-status-change processing of ISR */ static void -igb_intr_other_work(igb_t *igb) +igb_intr_link_work(igb_t *igb) { boolean_t link_changed; @@ -3273,7 +3531,7 @@ igb_intr_legacy(void *arg1, void *arg2) ASSERT(igb->num_tx_rings == 1); if (icr & E1000_ICR_RXT0) { - mp = igb_rx(&igb->rx_rings[0]); + mp = igb_rx(&igb->rx_rings[0], IGB_NO_POLL); } if (icr & E1000_ICR_TXDW) { @@ -3320,7 +3578,7 @@ igb_intr_legacy(void *arg1, void *arg2) if (tx_reschedule) { tx_ring->reschedule = B_FALSE; - mac_tx_update(igb->mac_hdl); + mac_tx_ring_update(igb->mac_hdl, tx_ring->ring_handle); IGB_DEBUG_STAT(tx_ring->stat_reschedule); } @@ -3359,7 +3617,7 @@ igb_intr_msi(void *arg1, void *arg2) } if (icr & E1000_ICR_LSC) { - igb_intr_other_work(igb); + igb_intr_link_work(igb); } return (DDI_INTR_CLAIMED); @@ -3385,10 +3643,27 @@ igb_intr_rx(void *arg1, void *arg2) } /* + * igb_intr_tx - Interrupt handler for tx + */ +static uint_t +igb_intr_tx(void *arg1, void *arg2) +{ + igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg1; + + _NOTE(ARGUNUSED(arg2)); + + /* + * Only used via MSI-X vector so don't check cause bits + * and only clean the given ring. + */ + igb_intr_tx_work(tx_ring); + + return (DDI_INTR_CLAIMED); +} + +/* * igb_intr_tx_other - Interrupt handler for both tx and other * - * Always look for Tx cleanup work. Only look for other work if the right - * bits are set in the Interrupt Cause Register. 
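/*
 * Editor's sketch: the ring/group to VMDq mode mapping chosen in
 * igb_get_conf() above, restated as a pure function. The E1000_VMDQ_*
 * values match the definitions this changeset adds to igb_osdep.h.
 */
#include <stdio.h>

#define E1000_VMDQ_OFF      0
#define E1000_VMDQ_MAC      1
#define E1000_VMDQ_MAC_RSS  2

static int
pick_vmdq_mode(int rings, int groups)
{
    int ring_per_group = rings / groups;

    if (groups == 1)
        return (E1000_VMDQ_OFF);    /* one group: RSS only */
    if (ring_per_group == 1)
        return (E1000_VMDQ_MAC);    /* MAC classification only */
    return (E1000_VMDQ_MAC_RSS);    /* both */
}

int
main(void)
{
    /* Expected output: 0 1 2 */
    (void) printf("%d %d %d\n", pick_vmdq_mode(4, 1),
        pick_vmdq_mode(4, 4), pick_vmdq_mode(8, 2));
    return (0);
}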
*/
static uint_t
igb_intr_tx_other(void *arg1, void *arg2)
@@ -3401,17 +3676,18 @@ igb_intr_tx_other(void *arg1, void *arg2)
 icr = E1000_READ_REG(&igb->hw, E1000_ICR);

 /*
- * Always look for Tx cleanup work. We don't have separate
- * transmit vectors, so we have only one tx ring enabled.
+ * Look for tx reclaiming work first. Remember that in the
+ * interrupt-sharing case only one tx ring is
+ * used.
 */
- ASSERT(igb->num_tx_rings == 1);
 igb_intr_tx_work(&igb->tx_rings[0]);

 /*
- * Check for "other" causes.
+ * Check the cause bits; only a link change is
+ * processed here.
 */
 if (icr & E1000_ICR_LSC) {
- igb_intr_other_work(igb);
+ igb_intr_link_work(igb);
 }

 return (DDI_INTR_CLAIMED);
@@ -3504,23 +3780,12 @@ static int
igb_alloc_intr_handles(igb_t *igb, int intr_type)
{
 dev_info_t *devinfo;
- int request, count, avail, actual;
- int rx_rings, minimum;
+ int orig, request, count, avail, actual;
+ int diff, minimum;
 int rc;

 devinfo = igb->dip;

- /*
- * Currently only 1 tx ring is supported. More tx rings
- * will be supported with future enhancement.
- */
- if (igb->num_tx_rings > 1) {
- igb->num_tx_rings = 1;
- igb_log(igb,
- "Use only 1 MSI-X vector for tx, "
- "force tx queue number to 1");
- }
-
 switch (intr_type) {
 case DDI_INTR_TYPE_FIXED:
 request = 1; /* Request 1 legacy interrupt handle */
@@ -3536,12 +3801,12 @@ igb_alloc_intr_handles(igb_t *igb, int intr_type)
 case DDI_INTR_TYPE_MSIX:
 /*
- * Best number of vectors for the adapter is
- * # rx rings + # tx rings + 1 for other
- * But currently we only support number of vectors of
- * # rx rings + 1 for tx & other
+ * Number of vectors for the adapter is
+ * # rx rings + # tx rings;
+ * one of the tx vectors also covers the other causes.
 */
- request = igb->num_rx_rings + 1;
+ request = igb->num_rx_rings + igb->num_tx_rings;
+ orig = request;
 minimum = 2;
 IGB_DEBUGLOG_0(igb, "interrupt type: MSI-X");
 break;
@@ -3613,15 +3878,24 @@
 }

 /*
- * For MSI-X, actual might force us to reduce number of rx rings
+ * For MSI-X, actual might force us to reduce number of tx & rx rings
 */
- if (intr_type == DDI_INTR_TYPE_MSIX) {
- rx_rings = actual - 1;
- if (rx_rings < igb->num_rx_rings) {
+ if ((intr_type == DDI_INTR_TYPE_MSIX) && (orig > actual)) {
+ diff = orig - actual;
+ if (diff < igb->num_tx_rings) {
+ igb_log(igb,
+ "MSI-X vectors force Tx queue number to %d",
+ igb->num_tx_rings - diff);
+ igb->num_tx_rings -= diff;
+ } else {
+ igb_log(igb,
+ "MSI-X vectors force Tx queue number to 1");
+ igb->num_tx_rings = 1;
+
 igb_log(igb, "MSI-X vectors force Rx queue number to %d",
- rx_rings);
- igb->num_rx_rings = rx_rings;
+ actual - 1);
+ igb->num_rx_rings = actual - 1;
 }
 }
@@ -3662,6 +3936,7 @@ static int
igb_add_intr_handlers(igb_t *igb)
{
 igb_rx_ring_t *rx_ring;
+ igb_tx_ring_t *tx_ring;
 int vector;
 int rc;
 int i;
@@ -3671,14 +3946,17 @@
 switch (igb->intr_type) {
 case DDI_INTR_TYPE_MSIX:
 /* Add interrupt handler for tx + other */
+ tx_ring = &igb->tx_rings[0];
 rc = ddi_intr_add_handler(igb->htable[vector],
 (ddi_intr_handler_t *)igb_intr_tx_other,
 (void *)igb, NULL);
+
 if (rc != DDI_SUCCESS) {
 igb_log(igb,
 "Add tx/other interrupt handler failed: %d", rc);
 return (IGB_FAILURE);
 }
+ tx_ring->intr_vector = vector;
 vector++;

 /* Add interrupt handler for each rx ring */
@@ -3704,6 +3982,31 @@
 vector++;
 }
+
+ /* Add interrupt handler for each tx ring from the 2nd ring */
+ for (i = 1; i < igb->num_tx_rings; i++) {
+ tx_ring = &igb->tx_rings[i];
+
+ 
rc = ddi_intr_add_handler(igb->htable[vector], + (ddi_intr_handler_t *)igb_intr_tx, + (void *)tx_ring, NULL); + + if (rc != DDI_SUCCESS) { + igb_log(igb, + "Add tx interrupt handler failed. " + "return: %d, tx ring: %d", rc, i); + for (vector--; vector >= 0; vector--) { + (void) ddi_intr_remove_handler( + igb->htable[vector]); + } + return (IGB_FAILURE); + } + + tx_ring->intr_vector = vector; + + vector++; + } + break; case DDI_INTR_TYPE_MSI: @@ -3764,14 +4067,14 @@ igb_setup_adapter_msix(igb_t *igb) struct e1000_hw *hw = &igb->hw; /* - * Set vector for Tx + Other causes - * NOTE assumption that there is only one of these and it is vector 0 + * Set vector for other causes, NOTE assumption that it is vector 0 */ vector = 0; + igb->eims_mask = E1000_EICR_TX_QUEUE0 | E1000_EICR_OTHER; E1000_WRITE_REG(hw, E1000_MSIXBM(vector), igb->eims_mask); - vector++; + for (i = 0; i < igb->num_rx_rings; i++) { /* * Set vector for each rx ring @@ -3787,6 +4090,21 @@ igb_setup_adapter_msix(igb_t *igb) vector++; } + for (i = 1; i < igb->num_tx_rings; i++) { + /* + * Set vector for each tx ring from 2nd tx ring + */ + eims = (E1000_EICR_TX_QUEUE0 << i); + E1000_WRITE_REG(hw, E1000_MSIXBM(vector), eims); + + /* + * Accumulate bits to enable in igb_enable_adapter_interrupts() + */ + igb->eims_mask |= eims; + + vector++; + } + ASSERT(vector == igb->intr_cnt); /* diff --git a/usr/src/uts/common/io/igb/igb_osdep.c b/usr/src/uts/common/io/igb/igb_osdep.c index 9d03c05494..f915edd5ae 100644 --- a/usr/src/uts/common/io/igb/igb_osdep.c +++ b/usr/src/uts/common/io/igb/igb_osdep.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
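/*
 * Editor's sketch: the MSI-X shortfall handling in igb_alloc_intr_handles()
 * earlier in this diff, modeled standalone. When the system grants fewer
 * vectors than the rx + tx request, tx rings absorb the deficit first;
 * only when that is not enough does tx fall back to one ring and rx take
 * whatever vectors remain.
 */
#include <stdio.h>

static void
absorb_shortfall(int *rx, int *tx, int actual)
{
    int diff = (*rx + *tx) - actual;

    if (diff <= 0)
        return;            /* request fully granted */
    if (diff < *tx) {
        *tx -= diff;       /* shrink tx rings only */
    } else {
        *tx = 1;           /* tx falls back to one ring */
        *rx = actual - 1;  /* the rest of the vectors go to rx */
    }
}

int
main(void)
{
    int rx = 4, tx = 4;

    absorb_shortfall(&rx, &tx, 6);  /* 8 requested, 6 granted */
    (void) printf("rx=%d tx=%d\n", rx, tx); /* rx=4 tx=2 */
    return (0);
}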
+ */

#include "igb_osdep.h"
#include "igb_api.h"
@@ -114,3 +114,61 @@ e1000_enable_pciex_master(struct e1000_hw *hw)
 ctrl &= ~E1000_CTRL_GIO_MASTER_DISABLE;
 E1000_WRITE_REG(hw, E1000_CTRL, ctrl);
}
+
+/*
+ * e1000_rar_clear - Clear the specified RAR register
+ */
+void
+e1000_rar_clear(struct e1000_hw *hw, uint32_t index)
+{
+ uint32_t rar_high;
+
+ /* Mark the address invalid to the hardware by clearing the AV bit */
+ rar_high = ~E1000_RAH_AV;
+
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, ((index << 1) + 1), rar_high);
+ E1000_WRITE_FLUSH(hw);
+}
+
+/*
+ * e1000_rar_set_vmdq - Set the RAR registers for VMDq
+ */
+void
+e1000_rar_set_vmdq(struct e1000_hw *hw, const uint8_t *addr, uint32_t index,
+ uint32_t vmdq_mode, uint8_t qsel)
+{
+ uint32_t rar_low, rar_high;
+
+ /*
+ * NIC expects these in little endian so reverse the byte order
+ * from network order (big endian) to little endian.
+ */
+ rar_low = ((uint32_t)addr[0] | ((uint32_t)addr[1] << 8) |
+ ((uint32_t)addr[2] << 16) | ((uint32_t)addr[3] << 24));
+
+ rar_high = ((uint32_t)addr[4] | ((uint32_t)addr[5] << 8));
+
+ /* Indicate to hardware the Address is Valid. */
+ rar_high |= E1000_RAH_AV;
+
+ /* Set the queue selector based on the vmdq mode */
+ switch (vmdq_mode) {
+ default:
+ case E1000_VMDQ_OFF:
+ break;
+ case E1000_VMDQ_MAC:
+ rar_high |= (qsel << 18);
+ break;
+ case E1000_VMDQ_MAC_RSS:
+ rar_high |= 1 << (18 + qsel);
+ break;
+ }
+
+ /* write to receive address registers */
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, (index << 1), rar_low);
+ E1000_WRITE_REG_ARRAY(hw, E1000_RA, ((index << 1) + 1), rar_high);
+ E1000_WRITE_FLUSH(hw);
+}
diff --git a/usr/src/uts/common/io/igb/igb_osdep.h b/usr/src/uts/common/io/igb/igb_osdep.h
index 42ba27a2e3..f56f320a1c 100644
--- a/usr/src/uts/common/io/igb/igb_osdep.h
+++ b/usr/src/uts/common/io/igb/igb_osdep.h
@@ -1,19 +1,17 @@
/*
 * CDDL HEADER START
 *
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
- * When using or redistributing this file, you may do so under the
- * License only. No other modification of this header is permitted.
- *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
@@ -22,15 +20,17 @@
 */

/*
+ * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
+ */
+
+/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms of the CDDL.
+ * Use is subject to license terms.
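/*
 * Editor's sketch: the RAR packing that e1000_rar_set_vmdq() above
 * performs, standalone. The six MAC bytes fold little-endian into
 * RAL/RAH, the Address Valid bit is raised, and the queue selector lands
 * in the upper RAH bits according to the VMDq mode. RAH_AV as bit 31 is
 * an assumption made here purely for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define RAH_AV  0x80000000u /* assumed Address Valid bit position */

static void
pack_rar(const uint8_t *addr, uint8_t qsel, int mac_rss,
    uint32_t *rar_low, uint32_t *rar_high)
{
    *rar_low = (uint32_t)addr[0] | ((uint32_t)addr[1] << 8) |
        ((uint32_t)addr[2] << 16) | ((uint32_t)addr[3] << 24);
    *rar_high = ((uint32_t)addr[4] | ((uint32_t)addr[5] << 8)) | RAH_AV;
    if (mac_rss)
        *rar_high |= 1u << (18 + qsel);     /* E1000_VMDQ_MAC_RSS case */
    else
        *rar_high |= (uint32_t)qsel << 18;  /* E1000_VMDQ_MAC case */
}

int
main(void)
{
    uint8_t mac[6] = { 0x00, 0x1b, 0x21, 0xaa, 0xbb, 0xcc };
    uint32_t lo, hi;

    pack_rar(mac, 1, 0, &lo, &hi);
    (void) printf("RAL=0x%08x RAH=0x%08x\n", (unsigned)lo, (unsigned)hi);
    return (0);
}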
*/ #ifndef _IGB_OSDEP_H #define _IGB_OSDEP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -96,6 +96,18 @@ extern "C" { #define IEEE_ESR_1000X_HD_CAPS 0x4000 /* 1000X HD capable */ #define IEEE_ESR_1000X_FD_CAPS 0x8000 /* 1000X FD capable */ +/* VMDq MODE supported by hardware */ +#define E1000_VMDQ_OFF 0 +#define E1000_VMDQ_MAC 1 +#define E1000_VMDQ_MAC_RSS 2 + +/* VMDq based on packet destination MAC address */ +#define E1000_MRQC_ENABLE_VMDQ_MAC_GROUP 0x00000003 +/* VMDq based on packet destination MAC address and RSS */ +#define E1000_MRQC_ENABLE_VMDQ_MAC_RSS_GROUP 0x00000005 +/* The default queue in each VMDqs */ +#define E1000_VMDQ_MAC_GROUP_DEFAULT_QUEUE 0x100 + #define E1000_WRITE_FLUSH(a) (void) E1000_READ_REG(a, E1000_STATUS) #define E1000_WRITE_REG(hw, reg, value) \ diff --git a/usr/src/uts/common/io/igb/igb_rx.c b/usr/src/uts/common/io/igb/igb_rx.c index ec04dc6b8e..acf15ed35c 100644 --- a/usr/src/uts/common/io/igb/igb_rx.c +++ b/usr/src/uts/common/io/igb/igb_rx.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "igb_sw.h" @@ -251,6 +251,24 @@ igb_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error) } } +mblk_t * +igb_rx_ring_poll(void *arg, int bytes) +{ + igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)arg; + mblk_t *mp = NULL; + + ASSERT(bytes >= 0); + + if (bytes == 0) + return (mp); + + mutex_enter(&rx_ring->rx_lock); + mp = igb_rx(rx_ring, bytes); + mutex_exit(&rx_ring->rx_lock); + + return (mp); +} + /* * igb_rx - Receive the data of one ring * @@ -260,7 +278,7 @@ igb_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error) * passed up to mac_rx(). 
*/ mblk_t * -igb_rx(igb_rx_ring_t *rx_ring) +igb_rx(igb_rx_ring_t *rx_ring, int poll_bytes) { union e1000_adv_rx_desc *current_rbd; rx_control_block_t *current_rcb; @@ -272,6 +290,7 @@ igb_rx(igb_rx_ring_t *rx_ring) uint32_t pkt_len; uint32_t status_error; uint32_t pkt_num; + uint32_t total_bytes; igb_t *igb = rx_ring->igb; mblk_head = NULL; @@ -296,6 +315,7 @@ igb_rx(igb_rx_ring_t *rx_ring) current_rbd = &rx_ring->rbd_ring[rx_next]; pkt_num = 0; + total_bytes = 0; status_error = current_rbd->wb.upper.status_error; while (status_error & E1000_RXD_STAT_DD) { /* @@ -315,6 +335,14 @@ igb_rx(igb_rx_ring_t *rx_ring) (status_error & E1000_RXDEXT_STATERR_IPE)); pkt_len = current_rbd->wb.upper.length; + + if ((poll_bytes != IGB_NO_POLL) && + ((pkt_len + total_bytes) > poll_bytes)) + break; + + IGB_DEBUG_STAT(rx_ring->stat_pkt_cnt); + total_bytes += pkt_len; + mp = NULL; /* * For packets with length more than the copy threshold, diff --git a/usr/src/uts/common/io/igb/igb_sw.h b/usr/src/uts/common/io/igb/igb_sw.h index 457c929d1a..a69ba3bb77 100644 --- a/usr/src/uts/common/io/igb/igb_sw.h +++ b/usr/src/uts/common/io/igb/igb_sw.h @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,15 +20,17 @@ */ /* + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. + */ + +/* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Use is subject to license terms. 
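/*
 * Editor's sketch: the byte-budget cut-off this changeset adds to
 * igb_rx(), modeled on a plain array of packet lengths. IGB_NO_POLL (-1)
 * means interrupt mode with no budget; otherwise the loop stops before
 * the packet that would overrun the requested byte count, leaving it for
 * the next poll cycle.
 */
#include <stdio.h>

#define IGB_NO_POLL -1

static int
take_packets(const int *pkt_len, int npkts, int poll_bytes)
{
    int i, total_bytes = 0;

    for (i = 0; i < npkts; i++) {
        if ((poll_bytes != IGB_NO_POLL) &&
            ((pkt_len[i] + total_bytes) > poll_bytes))
            break;
        total_bytes += pkt_len[i];
    }
    return (i); /* number of packets handed up the stack */
}

int
main(void)
{
    int lens[4] = { 1500, 1500, 1500, 64 };

    /* A 3200-byte budget accepts exactly two 1500-byte frames. */
    (void) printf("%d\n", take_packets(lens, 4, 3200));
    return (0);
}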
*/ #ifndef _IGB_SW_H #define _IGB_SW_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -48,7 +48,7 @@ extern "C" { #include <sys/modctl.h> #include <sys/errno.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/vlan.h> #include <sys/ddi.h> @@ -88,6 +88,9 @@ extern "C" { #define IGB_INTR_MSI 2 #define IGB_INTR_LEGACY 3 +#define IGB_NO_POLL -1 +#define IGB_NO_FREE_SLOT -1 + #define MAX_NUM_UNICAST_ADDRESSES E1000_RAR_ENTRIES #define MAX_NUM_MULTICAST_ADDRESSES 256 #define MAX_NUM_EITR 10 @@ -97,10 +100,9 @@ extern "C" { /* * Maximum values for user configurable parameters */ -#define MAX_TX_QUEUE_NUM 4 -#define MAX_RX_QUEUE_NUM 4 #define MAX_TX_RING_SIZE 4096 #define MAX_RX_RING_SIZE 4096 +#define MAX_RX_GROUP_NUM 4 #define MAX_MTU 9000 #define MAX_RX_LIMIT_PER_INTR 4096 @@ -119,10 +121,9 @@ extern "C" { /* * Minimum values for user configurable parameters */ -#define MIN_TX_QUEUE_NUM 1 -#define MIN_RX_QUEUE_NUM 1 #define MIN_TX_RING_SIZE 64 #define MIN_RX_RING_SIZE 64 +#define MIN_RX_GROUP_NUM 1 #define MIN_MTU ETHERMIN #define MIN_RX_LIMIT_PER_INTR 16 @@ -140,10 +141,11 @@ extern "C" { /* * Default values for user configurable parameters */ -#define DEFAULT_TX_QUEUE_NUM 1 -#define DEFAULT_RX_QUEUE_NUM 1 +#define DEFAULT_TX_QUEUE_NUM 4 +#define DEFAULT_RX_QUEUE_NUM 4 #define DEFAULT_TX_RING_SIZE 512 #define DEFAULT_RX_RING_SIZE 512 +#define DEFAULT_RX_GROUP_NUM 1 #define DEFAULT_MTU ETHERMTU #define DEFAULT_RX_LIMIT_PER_INTR 256 @@ -187,7 +189,6 @@ extern "C" { #define ATTACH_PROGRESS_ENABLE_INTR 0x1000 /* DDI interrupts enabled */ #define ATTACH_PROGRESS_FMINIT 0x2000 /* FMA initialized */ - #define PROP_ADV_AUTONEG_CAP "adv_autoneg_cap" #define PROP_ADV_1000FDX_CAP "adv_1000fdx_cap" #define PROP_ADV_1000HDX_CAP "adv_1000hdx_cap" @@ -197,10 +198,10 @@ extern "C" { #define PROP_ADV_10HDX_CAP "adv_10hdx_cap" #define PROP_DEFAULT_MTU "default_mtu" #define PROP_FLOW_CONTROL "flow_control" -#define PROP_TX_QUEUE_NUM "tx_queue_number" #define PROP_TX_RING_SIZE "tx_ring_size" -#define PROP_RX_QUEUE_NUM "rx_queue_number" #define PROP_RX_RING_SIZE "rx_ring_size" +#define PROP_MR_ENABLE "mr_enable" +#define PROP_RX_GROUP_NUM "rx_group_number" #define PROP_INTR_FORCE "intr_force" #define PROP_TX_HCKSUM_ENABLE "tx_hcksum_enable" @@ -410,7 +411,7 @@ typedef union igb_ether_addr { } reg; struct { uint8_t set; - uint8_t redundant; + uint8_t group_index; uint8_t addr[ETHERADDRL]; } mac; } igb_ether_addr_t; @@ -479,6 +480,7 @@ typedef struct rx_control_block { */ typedef struct igb_tx_ring { uint32_t index; /* Ring index */ + uint32_t intr_vector; /* Interrupt vector index */ /* * Mutexes @@ -538,13 +540,14 @@ typedef struct igb_tx_ring { uint32_t stat_fail_no_tcb; uint32_t stat_fail_dma_bind; uint32_t stat_reschedule; + uint32_t stat_pkt_cnt; #endif /* * Pointer to the igb struct */ struct igb *igb; - + mac_ring_handle_t ring_handle; /* call back ring handle */ } igb_tx_ring_t; /* @@ -592,12 +595,24 @@ typedef struct igb_rx_ring { uint32_t stat_frame_error; uint32_t stat_cksum_error; uint32_t stat_exceed_pkt; + uint32_t stat_pkt_cnt; #endif struct igb *igb; /* Pointer to igb struct */ - + mac_ring_handle_t ring_handle; /* call back ring handle */ + uint32_t group_index; /* group index */ + uint64_t ring_gen_num; } igb_rx_ring_t; +/* + * Software Receive Ring Group + */ +typedef struct igb_rx_group { + uint32_t index; /* Group index */ + mac_group_handle_t group_handle; /* call back group handle */ + struct 
igb *igb; /* Pointer to igb struct */ +} igb_rx_group_t; + typedef struct igb { int instance; mac_handle_t mac_hdl; @@ -616,13 +631,18 @@ typedef struct igb { uint32_t loopback_mode; uint32_t max_frame_size; + uint32_t mr_enable; /* Enable multiple rings */ + uint32_t vmdq_mode; /* Mode of VMDq */ + /* - * Receive Rings + * Receive Rings and Groups */ igb_rx_ring_t *rx_rings; /* Array of rx rings */ uint32_t num_rx_rings; /* Number of rx rings in use */ uint32_t rx_ring_size; /* Rx descriptor ring size */ uint32_t rx_buf_size; /* Rx buffer size */ + igb_rx_group_t *rx_groups; /* Array of rx groups */ + uint32_t num_rx_groups; /* Number of rx groups in use */ /* * Transmit Rings @@ -652,6 +672,7 @@ typedef struct igb { uint_t intr_pri; ddi_intr_handle_t *htable; uint32_t eims_mask; + uint32_t ims_mask; kmutex_t gen_lock; /* General lock for device access */ kmutex_t watchdog_lock; @@ -772,7 +793,8 @@ void igb_free_dma(igb_t *); int igb_start(igb_t *); void igb_stop(igb_t *); int igb_setup_link(igb_t *, boolean_t); -int igb_unicst_set(igb_t *, const uint8_t *, mac_addr_slot_t); +int igb_unicst_find(igb_t *, const uint8_t *); +int igb_unicst_set(igb_t *, const uint8_t *, int); int igb_multicst_add(igb_t *, const uint8_t *); int igb_multicst_remove(igb_t *, const uint8_t *); enum ioc_reply igb_loopback_ioctl(igb_t *, struct iocblk *, mblk_t *); @@ -795,22 +817,23 @@ int igb_m_unicst(void *, const uint8_t *); int igb_m_stat(void *, uint_t, uint64_t *); void igb_m_resources(void *); void igb_m_ioctl(void *, queue_t *, mblk_t *); -int igb_m_unicst_add(void *, mac_multi_addr_t *); -int igb_m_unicst_remove(void *, mac_addr_slot_t); -int igb_m_unicst_modify(void *, mac_multi_addr_t *); -int igb_m_unicst_get(void *, mac_multi_addr_t *); boolean_t igb_m_getcapab(void *, mac_capab_t, void *); +void igb_fill_ring(void *, mac_ring_type_t, const int, const int, + mac_ring_info_t *, mac_ring_handle_t); +void igb_fill_group(void *arg, mac_ring_type_t, const int, + mac_group_info_t *, mac_group_handle_t); +int igb_rx_ring_intr_enable(mac_intr_handle_t); +int igb_rx_ring_intr_disable(mac_intr_handle_t); /* * Function prototypes in igb_rx.c */ -mblk_t *igb_rx(igb_rx_ring_t *); +mblk_t *igb_rx(igb_rx_ring_t *, int); void igb_rx_recycle(caddr_t arg); /* * Function prototypes in igb_tx.c */ -mblk_t *igb_m_tx(void *, mblk_t *); void igb_free_tcb(tx_control_block_t *); void igb_put_free_list(igb_tx_ring_t *, link_list_t *); uint32_t igb_tx_recycle_legacy(igb_tx_ring_t *); @@ -835,6 +858,8 @@ enum ioc_reply igb_nd_ioctl(igb_t *, queue_t *, mblk_t *, struct iocblk *); */ int igb_init_stats(igb_t *); +mblk_t *igb_rx_ring_poll(void *, int); +mblk_t *igb_tx_ring_send(void *, mblk_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/igb/igb_tx.c b/usr/src/uts/common/io/igb/igb_tx.c index b3a0090ebe..7b43bbad97 100644 --- a/usr/src/uts/common/io/igb/igb_tx.c +++ b/usr/src/uts/common/io/igb/igb_tx.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
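/*
 * Editor's sketch, conceptual only: what the igb_rx_ring_intr_enable()
 * and igb_rx_ring_intr_disable() entry points declared above are assumed
 * to do, namely set or clear the ring's EIMS bit so the mac layer can
 * flip a ring between interrupt delivery and igb_rx_ring_poll(). The
 * register is modeled as a plain variable; the real bodies are not shown
 * in this hunk.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t eims;   /* stands in for the E1000_EIMS register */

static void
ring_intr_enable(int vector)
{
    eims |= 1u << vector;       /* re-arm: the ring interrupts again */
}

static void
ring_intr_disable(int vector)
{
    eims &= ~(1u << vector);    /* quiesce: the mac layer polls instead */
}

int
main(void)
{
    ring_intr_enable(2);
    (void) printf("0x%x\n", (unsigned)eims);    /* 0x4 */
    ring_intr_disable(2);
    (void) printf("0x%x\n", (unsigned)eims);    /* 0x0 */
    return (0);
}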
* - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "igb_sw.h" @@ -42,7 +42,7 @@ static tx_control_block_t *igb_get_free_list(igb_tx_ring_t *); static void igb_get_hcksum_context(mblk_t *, hcksum_context_t *); static boolean_t igb_check_hcksum_context(igb_tx_ring_t *, hcksum_context_t *); static void igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *, - hcksum_context_t *); + hcksum_context_t *, uint32_t); #ifndef IGB_DEBUG #pragma inline(igb_save_desc) @@ -51,58 +51,14 @@ static void igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *, #pragma inline(igb_fill_hcksum_context) #endif -/* - * igb_m_tx - * - * The GLDv3 interface to call driver's tx routine to transmit - * the mblks. - */ mblk_t * -igb_m_tx(void *arg, mblk_t *mp) +igb_tx_ring_send(void *arg, mblk_t *mp) { - igb_t *igb = (igb_t *)arg; - mblk_t *next; - igb_tx_ring_t *tx_ring; + igb_tx_ring_t *tx_ring = (igb_tx_ring_t *)arg; - /* - * If the adapter is suspended, or it is not started, or the link - * is not up, the mblks are simply dropped. - */ - if (((igb->igb_state & IGB_SUSPENDED) != 0) || - ((igb->igb_state & IGB_STARTED) == 0) || - (igb->link_state != LINK_STATE_UP)) { - /* Free the mblk chain */ - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - freemsg(mp); - mp = next; - } + ASSERT(tx_ring != NULL); - return (NULL); - } - - /* - * Decide which tx ring is used to transmit the packets. - * This needs to be updated later to fit the new interface - * of the multiple rings support. - */ - tx_ring = &igb->tx_rings[0]; - - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - if (!igb_tx(tx_ring, mp)) { - mp->b_next = next; - break; - } - - mp = next; - } - - return (mp); + return ((igb_tx(tx_ring, mp)) ? NULL : mp); } /* @@ -671,7 +627,7 @@ igb_check_hcksum_context(igb_tx_ring_t *tx_ring, hcksum_context_t *hcksum) */ static void igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *ctx_tbd, - hcksum_context_t *hcksum) + hcksum_context_t *hcksum, uint32_t ring_index) { /* * Fill the context descriptor with the checksum @@ -708,7 +664,7 @@ igb_fill_hcksum_context(struct e1000_adv_tx_context_desc *ctx_tbd, } ctx_tbd->seqnum_seed = 0; - ctx_tbd->mss_l4len_idx = 0; + ctx_tbd->mss_l4len_idx = ring_index << 4; } /* @@ -764,7 +720,8 @@ igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list, * hardware checksum offload informations. 
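/*
 * Editor's sketch: the per-ring send contract that igb_tx_ring_send()
 * above implements for the mac layer, with stub types standing in for
 * mblk_t and igb_tx_ring_t. Returning NULL means the frame was consumed;
 * handing the mblk back tells the caller to hold it until the reschedule
 * path (mac_tx_ring_update()) reports free descriptors.
 */
#include <stdio.h>

typedef struct mblk { int len; } mblk_t;        /* stand-in type */
typedef struct ring { int tbd_free; } ring_t;   /* stand-in type */

/* Stands in for igb_tx(): succeeds while descriptors remain. */
static int
ring_tx(ring_t *ring, mblk_t *mp)
{
    (void) mp;
    if (ring->tbd_free == 0)
        return (0);
    ring->tbd_free--;
    return (1);
}

/* Mirrors igb_tx_ring_send(): NULL = consumed, mp = retry later. */
static mblk_t *
ring_send(ring_t *ring, mblk_t *mp)
{
    return (ring_tx(ring, mp) ? NULL : mp);
}

int
main(void)
{
    ring_t ring = { 1 };
    mblk_t m = { 64 };

    (void) printf("%s\n", ring_send(&ring, &m) == NULL ? "sent" : "held");
    (void) printf("%s\n", ring_send(&ring, &m) == NULL ? "sent" : "held");
    return (0);
}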
*/ igb_fill_hcksum_context( - (struct e1000_adv_tx_context_desc *)tbd, hcksum); + (struct e1000_adv_tx_context_desc *)tbd, hcksum, + tx_ring->index); index = NEXT_INDEX(index, 1, tx_ring->ring_size); desc_num++; @@ -843,6 +800,7 @@ igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list, if (hcksum_flags & HCK_PARTIALCKSUM) first_tbd->read.olinfo_status |= E1000_TXD_POPTS_TXSM << 8; + first_tbd->read.olinfo_status |= tx_ring->index << 4; } /* @@ -853,6 +811,8 @@ igb_tx_fill_ring(igb_tx_ring_t *tx_ring, link_list_t *pending_list, tbd->read.cmd_type_len |= E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS; + IGB_DEBUG_STAT(tx_ring->stat_pkt_cnt); + /* * Sync the DMA buffer of the tx descriptor ring */ diff --git a/usr/src/uts/common/io/ipw/ipw2100.c b/usr/src/uts/common/io/ipw/ipw2100.c index 3ad59d1051..d1171b5122 100644 --- a/usr/src/uts/common/io/ipw/ipw2100.c +++ b/usr/src/uts/common/io/ipw/ipw2100.c @@ -48,7 +48,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <net/if.h> #include <sys/mac_wifi.h> #include <sys/varargs.h> @@ -177,7 +177,6 @@ mac_callbacks_t ipw2100_m_callbacks = { ipw2100_m_multicst, ipw2100_m_unicst, ipw2100_m_tx, - NULL, ipw2100_m_ioctl }; diff --git a/usr/src/uts/common/io/iwh/iwh.c b/usr/src/uts/common/io/iwh/iwh.c index cce2a98845..1865a7ee5c 100644 --- a/usr/src/uts/common/io/iwh/iwh.c +++ b/usr/src/uts/common/io/iwh/iwh.c @@ -48,7 +48,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -414,7 +414,6 @@ mac_callbacks_t iwh_m_callbacks = { iwh_m_multicst, iwh_m_unicst, iwh_m_tx, - NULL, iwh_m_ioctl }; diff --git a/usr/src/uts/common/io/iwi/ipw2200.c b/usr/src/uts/common/io/iwi/ipw2200.c index 465c3ea2a7..80633d498f 100644 --- a/usr/src/uts/common/io/iwi/ipw2200.c +++ b/usr/src/uts/common/io/iwi/ipw2200.c @@ -48,7 +48,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/varargs.h> #include <sys/pci.h> @@ -207,7 +207,6 @@ mac_callbacks_t ipw2200_m_callbacks = { ipw2200_m_multicst, ipw2200_m_unicst, ipw2200_m_tx, - NULL, ipw2200_m_ioctl }; diff --git a/usr/src/uts/common/io/iwk/iwk2.c b/usr/src/uts/common/io/iwk/iwk2.c index a0f17f2927..4ec4b774c8 100644 --- a/usr/src/uts/common/io/iwk/iwk2.c +++ b/usr/src/uts/common/io/iwk/iwk2.c @@ -48,7 +48,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -423,7 +423,6 @@ mac_callbacks_t iwk_m_callbacks = { iwk_m_multicst, iwk_m_unicst, iwk_m_tx, - NULL, iwk_m_ioctl, NULL, NULL, diff --git a/usr/src/uts/common/io/ixgbe/ixgbe.conf b/usr/src/uts/common/io/ixgbe/ixgbe.conf index 0e46fe5a0d..215d3d9516 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe.conf +++ b/usr/src/uts/common/io/ixgbe/ixgbe.conf @@ -1,19 +1,17 @@ # # CDDL HEADER START # -# Copyright(c) 2007-2008 Intel Corporation. All rights reserved. # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # -# You can obtain a copy of the license at: -# http://www.opensolaris.org/os/licensing. 
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. # See the License for the specific language governing permissions # and limitations under the License. # -# When using or redistributing this file, you may do so under the -# License only. No other modification of this header is permitted. -# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] @@ -21,11 +19,10 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms of the CDDL. +# Copyright(c) 2007-2008 Intel Corporation. All rights reserved. # -# -# ident "%Z%%M% %I% %E% SMI" +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. # # # Driver.conf file for Intel 10GbE PCIE NIC Driver (ixgbe) @@ -45,35 +42,31 @@ # 1 - Receive only # 2 - Transmit only # 3 - Receive and transmit -# default value: 3 +# default value: 0 # # flow_control = 3; # # -------------------- Transmit/Receive Queues -------------------- -# tx/rx queue. -# tx_queue_number -# The number of the transmit queues -# Allowed values: 1 - 32 -# Default value: 1 # # tx_ring_size # The number of the transmit descriptors per transmit queue # Allowed values: 64 - 4096 -# Default value: 512 -# -# rx_queue_number -# The number of the receive queues -# Allowed values: 1 - 64 -# Default value: 1 +# Default value: 1024 # # rx_ring_size # The number of the receive descriptors per receive queue # Allowed values: 64 - 4096 -# Default value: 512 +# Default value: 1024 # -# Note: The final values of tx_queue_number and rx_queue_number are decided -# by the number of interrupt vectors obtained by the driver. They could be -# less than the specified values because of limited interrupt vector number. +# mr_enable +# Enable multiple tx queues and rx queues +# Allowed values: 0 - 1 +# Default value: 1 +# +# rx_group_number +# The number of the receive groups +# Allowed values: 1 - 16 +# Default value: 1 # # -------- How to set parameters for a particular interface --------- # The example below shows how to locate the device path and set a parameter diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_common.c b/usr/src/uts/common/io/ixgbe/ixgbe_common.c index f472cbd290..76e0232ff7 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_common.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_common.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. 
- * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,14 +20,16 @@ */ /* + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. + */ + +/* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Use is subject to license terms. */ /* IntelVersion: 1.159 v2008-03-04 */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "ixgbe_common.h" #include "ixgbe_api.h" @@ -1546,27 +1546,11 @@ ixgbe_set_mta(struct ixgbe_hw *hw, u8 *mc_addr) void ixgbe_add_mc_addr(struct ixgbe_hw *hw, u8 *mc_addr) { - u32 rar_entries = hw->mac.num_rar_entries; - u32 rar; - DEBUGOUT6(" MC Addr =%.2X %.2X %.2X %.2X %.2X %.2X\n", mc_addr[0], mc_addr[1], mc_addr[2], mc_addr[3], mc_addr[4], mc_addr[5]); - /* - * Place this multicast address in the RAR if there is room, - * else put it in the MTA - */ - if (hw->addr_ctrl.rar_used_count < rar_entries) { - /* use RAR from the end up for multicast */ - rar = rar_entries - hw->addr_ctrl.mc_addr_in_rar_count - 1; - hw->mac.ops.set_rar(hw, rar, mc_addr, 0, IXGBE_RAH_AV); - DEBUGOUT1("Added a multicast address to RAR[%d]\n", rar); - hw->addr_ctrl.rar_used_count++; - hw->addr_ctrl.mc_addr_in_rar_count++; - } else { - ixgbe_set_mta(hw, mc_addr); - } + ixgbe_set_mta(hw, mc_addr); DEBUGOUT("ixgbe_add_mc_addr Complete\n"); } @@ -1588,7 +1572,6 @@ ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr next) { u32 i; - u32 rar_entries = hw->mac.num_rar_entries; u32 vmdq; /* @@ -1596,18 +1579,8 @@ ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list, * use. */ hw->addr_ctrl.num_mc_addrs = mc_addr_count; - hw->addr_ctrl.rar_used_count -= hw->addr_ctrl.mc_addr_in_rar_count; - hw->addr_ctrl.mc_addr_in_rar_count = 0; hw->addr_ctrl.mta_in_use = 0; - /* Zero out the other receive addresses. */ - DEBUGOUT2("Clearing RAR[%d-%d]\n", hw->addr_ctrl.rar_used_count, - rar_entries - 1); - for (i = hw->addr_ctrl.rar_used_count; i < rar_entries; i++) { - IXGBE_WRITE_REG(hw, IXGBE_RAL(i), 0); - IXGBE_WRITE_REG(hw, IXGBE_RAH(i), 0); - } - /* Clear the MTA */ DEBUGOUT(" Clearing MTA\n"); for (i = 0; i < hw->mac.mcft_size; i++) diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_gld.c b/usr/src/uts/common/io/ixgbe/ixgbe_gld.c index 78a96bd4ef..b4b3a966fe 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_gld.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_gld.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. 
- * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "ixgbe_sw.h" @@ -103,16 +103,24 @@ ixgbe_m_stat(void *arg, uint_t stat, uint64_t *val) break; case MAC_STAT_RBYTES: - for (i = 0; i < 16; i++) - ixgbe_ks->tor.value.ui64 += + ixgbe_ks->tor.value.ui64 = 0; + for (i = 0; i < 16; i++) { + ixgbe_ks->qbrc[i].value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBRC(i)); + ixgbe_ks->tor.value.ui64 += + ixgbe_ks->qbrc[i].value.ui64; + } *val = ixgbe_ks->tor.value.ui64; break; case MAC_STAT_OBYTES: - for (i = 0; i < 16; i++) - ixgbe_ks->tot.value.ui64 += + ixgbe_ks->tot.value.ui64 = 0; + for (i = 0; i < 16; i++) { + ixgbe_ks->qbtc[i].value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBTC(i)); + ixgbe_ks->tot.value.ui64 += + ixgbe_ks->qbtc[i].value.ui64; + } *val = ixgbe_ks->tot.value.ui64; break; @@ -412,37 +420,6 @@ ixgbe_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr) } /* - * Set a new device unicast address. - */ -int -ixgbe_m_unicst(void *arg, const uint8_t *mac_addr) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - int result; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - /* - * Store the new MAC address. - */ - bcopy(mac_addr, ixgbe->hw.mac.addr, ETHERADDRL); - - /* - * Set MAC address in address slot 0, which is the default address. - */ - result = ixgbe_unicst_set(ixgbe, mac_addr, 0); - - mutex_exit(&ixgbe->gen_lock); - - return (result); -} - -/* * Pass on M_IOCTL messages passed to the DLD, and support * private IOCTLs for debugging and ndd. */ @@ -511,191 +488,6 @@ ixgbe_m_ioctl(void *arg, queue_t *q, mblk_t *mp) } } - -/* - * Find an unused address slot, set the address to it, reserve - * this slot and enable the device to start filtering on the - * new address. - */ -int -ixgbe_m_unicst_add(void *arg, mac_multi_addr_t *maddr) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - mac_addr_slot_t slot; - int err; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - if (mac_unicst_verify(ixgbe->mac_hdl, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - - if (ixgbe->unicst_avail == 0) { - /* no slots available */ - mutex_exit(&ixgbe->gen_lock); - return (ENOSPC); - } - - /* - * Primary/default address is in slot 0. The next addresses - * are the multiple MAC addresses. So multiple MAC address 0 - * is in slot 1, 1 in slot 2, and so on. So the first multiple - * MAC address resides in slot 1. 
- */ - for (slot = 1; slot < ixgbe->unicst_total; slot++) { - if (ixgbe->unicst_addr[slot].mac.set == 0) - break; - } - - ASSERT((slot > 0) && (slot < ixgbe->unicst_total)); - - maddr->mma_slot = slot; - - if ((err = ixgbe_unicst_set(ixgbe, maddr->mma_addr, slot)) == 0) { - ixgbe->unicst_addr[slot].mac.set = 1; - ixgbe->unicst_avail--; - } - - mutex_exit(&ixgbe->gen_lock); - - return (err); -} - -/* - * Removes a MAC address that was added before. - */ -int -ixgbe_m_unicst_remove(void *arg, mac_addr_slot_t slot) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - int err; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - if ((slot <= 0) || (slot >= ixgbe->unicst_total)) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - - if (ixgbe->unicst_addr[slot].mac.set == 1) { - /* - * Copy the default address to the passed slot - */ - if ((err = ixgbe_unicst_set(ixgbe, - ixgbe->unicst_addr[0].mac.addr, slot)) == 0) { - ixgbe->unicst_addr[slot].mac.set = 0; - ixgbe->unicst_avail++; - } - - mutex_exit(&ixgbe->gen_lock); - - return (err); - } - - mutex_exit(&ixgbe->gen_lock); - - return (EINVAL); -} - -/* - * Modifies the value of an address that has been added before. - * The new address length and the slot number that was returned - * in the call to add should be passed in. mma_flags should be - * set to 0. - * Returns 0 on success. - */ -int -ixgbe_m_unicst_modify(void *arg, mac_multi_addr_t *maddr) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - mac_addr_slot_t slot; - int err; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - if (mac_unicst_verify(ixgbe->mac_hdl, - maddr->mma_addr, maddr->mma_addrlen) == B_FALSE) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - - slot = maddr->mma_slot; - - if ((slot <= 0) || (slot >= ixgbe->unicst_total)) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - - if (ixgbe->unicst_addr[slot].mac.set == 1) { - err = ixgbe_unicst_set(ixgbe, maddr->mma_addr, slot); - mutex_exit(&ixgbe->gen_lock); - return (err); - } - - mutex_exit(&ixgbe->gen_lock); - - return (EINVAL); -} - -/* - * Get the MAC address and all other information related to - * the address slot passed in mac_multi_addr_t. - * mma_flags should be set to 0 in the call. - * On return, mma_flags can take the following values: - * 1) MMAC_SLOT_UNUSED - * 2) MMAC_SLOT_USED | MMAC_VENDOR_ADDR - * 3) MMAC_SLOT_UNUSED | MMAC_VENDOR_ADDR - * 4) MMAC_SLOT_USED - */ -int -ixgbe_m_unicst_get(void *arg, mac_multi_addr_t *maddr) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - mac_addr_slot_t slot; - - mutex_enter(&ixgbe->gen_lock); - - if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { - mutex_exit(&ixgbe->gen_lock); - return (ECANCELED); - } - - slot = maddr->mma_slot; - - if ((slot <= 0) || (slot >= ixgbe->unicst_total)) { - mutex_exit(&ixgbe->gen_lock); - return (EINVAL); - } - if (ixgbe->unicst_addr[slot].mac.set == 1) { - bcopy(ixgbe->unicst_addr[slot].mac.addr, - maddr->mma_addr, ETHERADDRL); - maddr->mma_flags = MMAC_SLOT_USED; - } else { - maddr->mma_flags = MMAC_SLOT_UNUSED; - } - - mutex_exit(&ixgbe->gen_lock); - - return (0); -} - /* * Obtain the MAC's capabilities and associated data from * the driver. 
@@ -732,25 +524,29 @@ ixgbe_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); } } - case MAC_CAPAB_MULTIADDRESS: { - multiaddress_capab_t *mmacp = cap_data; - - /* - * The number of MAC addresses made available by - * this capability is one less than the total as - * the primary address in slot 0 is counted in - * the total. - */ - mmacp->maddr_naddr = ixgbe->unicst_total - 1; - mmacp->maddr_naddrfree = ixgbe->unicst_avail; - /* No multiple factory addresses, set mma_flag to 0 */ - mmacp->maddr_flag = 0; - mmacp->maddr_handle = ixgbe; - mmacp->maddr_add = ixgbe_m_unicst_add; - mmacp->maddr_remove = ixgbe_m_unicst_remove; - mmacp->maddr_modify = ixgbe_m_unicst_modify; - mmacp->maddr_get = ixgbe_m_unicst_get; - mmacp->maddr_reserve = NULL; + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; + + switch (cap_rings->mr_type) { + case MAC_RING_TYPE_RX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = ixgbe->num_rx_rings; + cap_rings->mr_gnum = ixgbe->num_rx_groups; + cap_rings->mr_rget = ixgbe_fill_ring; + cap_rings->mr_gget = ixgbe_fill_group; + cap_rings->mr_gaddring = NULL; + cap_rings->mr_gremring = NULL; + break; + case MAC_RING_TYPE_TX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = ixgbe->num_tx_rings; + cap_rings->mr_gnum = 0; + cap_rings->mr_rget = ixgbe_fill_ring; + cap_rings->mr_gget = NULL; + break; + default: + break; + } break; } default: diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c index f7bbcb1ff6..f8acd5fdd5 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c @@ -61,6 +61,8 @@ static void ixgbe_setup_rx_ring(ixgbe_rx_ring_t *); static void ixgbe_setup_tx_ring(ixgbe_tx_ring_t *); static void ixgbe_setup_rss(ixgbe_t *); static void ixgbe_init_unicst(ixgbe_t *); +static int ixgbe_unicst_set(ixgbe_t *, const uint8_t *, int); +static int ixgbe_unicst_find(ixgbe_t *, const uint8_t *); static void ixgbe_setup_multicst(ixgbe_t *); static void ixgbe_get_hw_state(ixgbe_t *); static void ixgbe_get_conf(ixgbe_t *); @@ -83,7 +85,9 @@ static int ixgbe_alloc_intr_handles(ixgbe_t *, int); static int ixgbe_add_intr_handlers(ixgbe_t *); static void ixgbe_map_rxring_to_vector(ixgbe_t *, int, int); static void ixgbe_map_txring_to_vector(ixgbe_t *, int, int); -static void ixgbe_set_ivar(ixgbe_t *, uint16_t, uint8_t); +static void ixgbe_setup_ivar(ixgbe_t *, uint16_t, uint8_t); +static void ixgbe_enable_ivar(ixgbe_t *, uint16_t); +static void ixgbe_disable_ivar(ixgbe_t *, uint16_t); static int ixgbe_map_rings_to_vectors(ixgbe_t *); static void ixgbe_setup_adapter_vector(ixgbe_t *); static void ixgbe_rem_intr_handlers(ixgbe_t *); @@ -92,12 +96,14 @@ static int ixgbe_enable_intrs(ixgbe_t *); static int ixgbe_disable_intrs(ixgbe_t *); static uint_t ixgbe_intr_legacy(void *, void *); static uint_t ixgbe_intr_msi(void *, void *); -static uint_t ixgbe_intr_rx(void *, void *); -static uint_t ixgbe_intr_tx_other(void *, void *); +static uint_t ixgbe_intr_rx_tx(void *, void *); +static uint_t ixgbe_intr_other(void *, void *); static void ixgbe_intr_rx_work(ixgbe_rx_ring_t *); static void ixgbe_intr_tx_work(ixgbe_tx_ring_t *); static void ixgbe_intr_other_work(ixgbe_t *); static void ixgbe_get_driver_control(struct ixgbe_hw *); +static int ixgbe_addmac(void *, const uint8_t *); +static int ixgbe_remmac(void *, const uint8_t *); static void ixgbe_release_driver_control(struct ixgbe_hw *); static int 
ixgbe_attach(dev_info_t *, ddi_attach_cmd_t); @@ -188,8 +194,7 @@ static mac_callbacks_t ixgbe_m_callbacks = { ixgbe_m_stop, ixgbe_m_promisc, ixgbe_m_multicst, - ixgbe_m_unicst, - ixgbe_m_tx, + NULL, NULL, ixgbe_m_ioctl, ixgbe_m_getcapab @@ -675,6 +680,7 @@ ixgbe_register_mac(ixgbe_t *ixgbe) mac->m_min_sdu = 0; mac->m_max_sdu = ixgbe->default_mtu; mac->m_margin = VLAN_TAGSZ; + mac->m_v12n = MAC_VIRT_LEVEL1; status = mac_register(mac, &ixgbe->mac_hdl); @@ -765,6 +771,7 @@ static int ixgbe_init_driver_settings(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; + dev_info_t *devinfo = ixgbe->dip; ixgbe_rx_ring_t *rx_ring; ixgbe_tx_ring_t *tx_ring; uint32_t rx_size; @@ -779,6 +786,11 @@ ixgbe_init_driver_settings(ixgbe_t *ixgbe) } /* + * Get the system page size + */ + ixgbe->sys_page_size = ddi_ptob(devinfo, (ulong_t)1); + + /* * Set rx buffer size * * The IP header alignment room is counted in the calculation. @@ -1569,6 +1581,23 @@ ixgbe_alloc_rings(ixgbe_t *ixgbe) return (IXGBE_FAILURE); } + /* + * Allocate memory space for rx ring groups + */ + ixgbe->rx_groups = kmem_zalloc( + sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups, + KM_NOSLEEP); + + if (ixgbe->rx_groups == NULL) { + kmem_free(ixgbe->rx_rings, + sizeof (ixgbe_rx_ring_t) * ixgbe->num_rx_rings); + kmem_free(ixgbe->tx_rings, + sizeof (ixgbe_tx_ring_t) * ixgbe->num_tx_rings); + ixgbe->rx_rings = NULL; + ixgbe->tx_rings = NULL; + return (IXGBE_FAILURE); + } + return (IXGBE_SUCCESS); } @@ -1589,6 +1618,12 @@ ixgbe_free_rings(ixgbe_t *ixgbe) sizeof (ixgbe_tx_ring_t) * ixgbe->num_tx_rings); ixgbe->tx_rings = NULL; } + + if (ixgbe->rx_groups != NULL) { + kmem_free(ixgbe->rx_groups, + sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups); + ixgbe->rx_groups = NULL; + } } /* @@ -1693,7 +1728,9 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) { ixgbe_rx_ring_t *rx_ring; struct ixgbe_hw *hw = &ixgbe->hw; + ixgbe_rx_group_t *rx_group; uint32_t reg_val; + uint32_t ring_mapping; int i; /* @@ -1723,6 +1760,29 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) } /* + * Setup rx groups. + */ + for (i = 0; i < ixgbe->num_rx_groups; i++) { + rx_group = &ixgbe->rx_groups[i]; + rx_group->index = i; + rx_group->ixgbe = ixgbe; + } + + /* + * Setup the per-ring statistics mapping. + */ + ring_mapping = 0; + for (i = 0; i < ixgbe->num_rx_rings; i++) { + ring_mapping |= (i & 0xF) << (8 * (i & 0x3)); + if ((i & 0x3) == 0x3) { + IXGBE_WRITE_REG(hw, IXGBE_RQSMR(i >> 2), ring_mapping); + ring_mapping = 0; + } + } + if ((i & 0x3) != 0x3) + IXGBE_WRITE_REG(hw, IXGBE_RQSMR(i >> 2), ring_mapping); + + /* * The Max Frame Size in MHADD will be internally increased by four * bytes if the packet has a VLAN field, so includes MTU, ethernet * header and frame check sequence. @@ -1858,6 +1918,7 @@ ixgbe_setup_tx(ixgbe_t *ixgbe) struct ixgbe_hw *hw = &ixgbe->hw; ixgbe_tx_ring_t *tx_ring; uint32_t reg_val; + uint32_t ring_mapping; int i; for (i = 0; i < ixgbe->num_tx_rings; i++) { @@ -1866,6 +1927,20 @@ ixgbe_setup_tx(ixgbe_t *ixgbe) } /* + * Setup the per-ring statistics mapping. 
+ */ + ring_mapping = 0; + for (i = 0; i < ixgbe->num_tx_rings; i++) { + ring_mapping |= (i & 0xF) << (8 * (i & 0x3)); + if ((i & 0x3) == 0x3) { + IXGBE_WRITE_REG(hw, IXGBE_TQSMR(i >> 2), ring_mapping); + ring_mapping = 0; + } + } + if ((i & 0x3) != 0x3) + IXGBE_WRITE_REG(hw, IXGBE_TQSMR(i >> 2), ring_mapping); + + /* * Enable CRC appending and TX padding (for short tx frames) */ reg_val = IXGBE_READ_REG(hw, IXGBE_HLREG0); @@ -1936,13 +2011,13 @@ static void ixgbe_init_unicst(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; + uint8_t *mac_addr; int slot; /* * Here we should consider two situations: * - * 1. Chipset is initialized the first time - * Initialize the multiple unicast addresses, and - * save the default mac address. + * 1. Chipset is initialized at the first time, + * Clear all the multiple unicast addresses. * * 2. Chipset is reset * Recover the multiple unicast addresses from the @@ -1953,36 +2028,36 @@ ixgbe_init_unicst(ixgbe_t *ixgbe) * Initialize the multiple unicast addresses */ ixgbe->unicst_total = MAX_NUM_UNICAST_ADDRESSES; - - ixgbe->unicst_avail = ixgbe->unicst_total - 1; - - bcopy(hw->mac.addr, ixgbe->unicst_addr[0].mac.addr, - ETHERADDRL); - ixgbe->unicst_addr[0].mac.set = 1; - - for (slot = 1; slot < ixgbe->unicst_total; slot++) + ixgbe->unicst_avail = ixgbe->unicst_total; + for (slot = 0; slot < ixgbe->unicst_total; slot++) { + mac_addr = ixgbe->unicst_addr[slot].mac.addr; + bzero(mac_addr, ETHERADDRL); + (void) ixgbe_set_rar(hw, slot, mac_addr, NULL, NULL); ixgbe->unicst_addr[slot].mac.set = 0; - + } ixgbe->unicst_init = B_TRUE; } else { - /* - * Recover the default mac address - */ - bcopy(ixgbe->unicst_addr[0].mac.addr, hw->mac.addr, - ETHERADDRL); - /* Re-configure the RAR registers */ - for (slot = 1; slot < ixgbe->unicst_total; slot++) - (void) ixgbe_set_rar(hw, slot, - ixgbe->unicst_addr[slot].mac.addr, NULL, NULL); + for (slot = 0; slot < ixgbe->unicst_total; slot++) { + mac_addr = ixgbe->unicst_addr[slot].mac.addr; + if (ixgbe->unicst_addr[slot].mac.set == 1) { + (void) ixgbe_set_rar(hw, slot, mac_addr, + NULL, IXGBE_RAH_AV); + } else { + bzero(mac_addr, ETHERADDRL); + (void) ixgbe_set_rar(hw, slot, mac_addr, + NULL, NULL); + } + } } } + /* * ixgbe_unicst_set - Set the unicast address to the specified slot. */ int ixgbe_unicst_set(ixgbe_t *ixgbe, const uint8_t *mac_addr, - mac_addr_slot_t slot) + int slot) { struct ixgbe_hw *hw = &ixgbe->hw; @@ -1996,7 +2071,7 @@ ixgbe_unicst_set(ixgbe_t *ixgbe, const uint8_t *mac_addr, /* * Set the unicast address to the RAR register */ - (void) ixgbe_set_rar(hw, slot, (uint8_t *)mac_addr, NULL, NULL); + (void) ixgbe_set_rar(hw, slot, (uint8_t *)mac_addr, NULL, IXGBE_RAH_AV); if (ixgbe_check_acc_handle(ixgbe->osdep.reg_handle) != DDI_FM_OK) { ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED); @@ -2007,6 +2082,25 @@ ixgbe_unicst_set(ixgbe_t *ixgbe, const uint8_t *mac_addr, } /* + * ixgbe_unicst_find - Find the slot for the specified unicast address + */ +int +ixgbe_unicst_find(ixgbe_t *ixgbe, const uint8_t *mac_addr) +{ + int slot; + + ASSERT(mutex_owned(&ixgbe->gen_lock)); + + for (slot = 0; slot < ixgbe->unicst_total; slot++) { + if (bcmp(ixgbe->unicst_addr[slot].mac.addr, + mac_addr, ETHERADDRL) == 0) + return (slot); + } + + return (-1); +} + +/* * ixgbe_multicst_add - Add a multicst address. 
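/*
 * Illustrative sketch, not part of this changeset: the per-ring statistics
 * mapping loops added above pack the low four bits of each queue index into
 * successive byte lanes, four queues per 32-bit RQSMR/TQSMR register, with a
 * final flush when the queue count is not a multiple of four. The packing
 * arithmetic in isolation, runnable standalone:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t regs[4] = { 0 };	/* stand-ins for RQSMR(0..3) */
	int i;

	for (i = 0; i < 16; i++) {
		int reg = i >> 2;		/* four queues per register */
		int shift = 8 * (i & 0x3);	/* byte lane in the register */

		regs[reg] |= (uint32_t)(i & 0xF) << shift;
	}
	for (i = 0; i < 4; i++)
		printf("RQSMR(%d) = 0x%08x\n", i, regs[i]);
	return (0);
}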
*/ int @@ -2153,7 +2247,7 @@ ixgbe_get_conf(ixgbe_t *ixgbe) * Ethernet flow control configuration */ flow_control = ixgbe_get_prop(ixgbe, PROP_FLOW_CONTROL, - ixgbe_fc_none, 3, ixgbe_fc_full); + ixgbe_fc_none, 3, ixgbe_fc_none); if (flow_control == 3) flow_control = ixgbe_fc_default; @@ -2173,10 +2267,25 @@ ixgbe_get_conf(ixgbe_t *ixgbe) MIN_RX_RING_SIZE, MAX_RX_RING_SIZE, DEFAULT_RX_RING_SIZE); /* + * Multiple groups configuration + */ + ixgbe->num_rx_groups = ixgbe_get_prop(ixgbe, PROP_RX_GROUP_NUM, + MIN_RX_GROUP_NUM, MAX_RX_GROUP_NUM, DEFAULT_RX_GROUP_NUM); + + ixgbe->mr_enable = ixgbe_get_prop(ixgbe, PROP_MR_ENABLE, + 0, 1, DEFAULT_MR_ENABLE); + + if (ixgbe->mr_enable == B_FALSE) { + ixgbe->num_tx_rings = 1; + ixgbe->num_rx_rings = 1; + ixgbe->num_rx_groups = 1; + } + + /* * Tunable used to force an interrupt type. The only use is * for testing of the lesser interrupt types. * 0 = don't force interrupt type - * 1 = force interrupt type MSIX + * 1 = force interrupt type MSI-X * 2 = force interrupt type MSI * 3 = force interrupt type Legacy */ @@ -2413,6 +2522,7 @@ ixgbe_stall_check(ixgbe_t *ixgbe) result = B_FALSE; for (i = 0; i < ixgbe->num_tx_rings; i++) { tx_ring = &ixgbe->tx_rings[i]; + tx_ring->tx_recycle(tx_ring); if (tx_ring->recycle_fail > 0) tx_ring->stall_watchdog++; @@ -2872,11 +2982,12 @@ ixgbe_intr_rx_work(ixgbe_rx_ring_t *rx_ring) mutex_enter(&rx_ring->rx_lock); - mp = ixgbe_rx(rx_ring); + mp = ixgbe_ring_rx(rx_ring, IXGBE_POLL_NULL); mutex_exit(&rx_ring->rx_lock); if (mp != NULL) - mac_rx(rx_ring->ixgbe->mac_hdl, NULL, mp); + mac_rx_ring(rx_ring->ixgbe->mac_hdl, rx_ring->ring_handle, mp, + rx_ring->ring_gen_num); } #pragma inline(ixgbe_intr_tx_work) @@ -2897,7 +3008,8 @@ ixgbe_intr_tx_work(ixgbe_tx_ring_t *tx_ring) if (tx_ring->reschedule && (tx_ring->tbd_free >= tx_ring->resched_thresh)) { tx_ring->reschedule = B_FALSE; - mac_tx_update(tx_ring->ixgbe->mac_hdl); + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); IXGBE_DEBUG_STAT(tx_ring->stat_reschedule); } } @@ -2943,6 +3055,7 @@ ixgbe_intr_legacy(void *arg1, void *arg2) ixgbe_t *ixgbe = (ixgbe_t *)arg1; struct ixgbe_hw *hw = &ixgbe->hw; ixgbe_tx_ring_t *tx_ring; + ixgbe_rx_ring_t *rx_ring; uint32_t eicr; mblk_t *mp; boolean_t tx_reschedule; @@ -2974,16 +3087,20 @@ ixgbe_intr_legacy(void *arg1, void *arg2) ASSERT(ixgbe->num_tx_rings == 1); /* - * For legacy interrupt, we can't differentiate - * between tx and rx, so always clean both + * For legacy interrupt, rx rings[0] will use RTxQ[0]. */ - if (eicr & IXGBE_EICR_RTX_QUEUE) { - + if (eicr & 0x1) { /* * Clean the rx descriptors */ - mp = ixgbe_rx(&ixgbe->rx_rings[0]); + rx_ring = &ixgbe->rx_rings[0]; + mp = ixgbe_ring_rx(rx_ring, IXGBE_POLL_NULL); + } + /* + * For legacy interrupt, tx rings[0] will use RTxQ[1]. + */ + if (eicr & 0x2) { /* * Recycle the tx descriptors */ @@ -3020,11 +3137,12 @@ ixgbe_intr_legacy(void *arg1, void *arg2) * Do the following work outside of the gen_lock */ if (mp != NULL) - mac_rx(ixgbe->mac_hdl, NULL, mp); + mac_rx_ring(rx_ring->ixgbe->mac_hdl, rx_ring->ring_handle, mp, + rx_ring->ring_gen_num); if (tx_reschedule) { tx_ring->reschedule = B_FALSE; - mac_tx_update(ixgbe->mac_hdl); + mac_tx_ring_update(ixgbe->mac_hdl, tx_ring->ring_handle); IXGBE_DEBUG_STAT(tx_ring->stat_reschedule); } @@ -3055,11 +3173,16 @@ ixgbe_intr_msi(void *arg1, void *arg2) ASSERT(ixgbe->num_tx_rings == 1); /* - * For MSI interrupt, we can't differentiate - * between tx and rx, so always clean both. 
+ * For MSI interrupt, rx rings[0] will use RTxQ[0]. */ - if (eicr & IXGBE_EICR_RTX_QUEUE) { + if (eicr & 0x1) { ixgbe_intr_rx_work(&ixgbe->rx_rings[0]); + } + + /* + * For MSI interrupt, tx rings[0] will use RTxQ[1]. + */ + if (eicr & 0x2) { ixgbe_intr_tx_work(&ixgbe->tx_rings[0]); } @@ -3071,38 +3194,47 @@ ixgbe_intr_msi(void *arg1, void *arg2) } /* - * ixgbe_intr_rx - Interrupt handler for rx. + * ixgbe_intr_rx_tx - Interrupt handler for rx and tx. */ static uint_t -ixgbe_intr_rx(void *arg1, void *arg2) +ixgbe_intr_rx_tx(void *arg1, void *arg2) { _NOTE(ARGUNUSED(arg2)); - ixgbe_ring_vector_t *vect = (ixgbe_ring_vector_t *)arg1; - ixgbe_t *ixgbe = vect->ixgbe; - int r_idx; + ixgbe_ring_vector_t *vect = (ixgbe_ring_vector_t *)arg1; + ixgbe_t *ixgbe = vect->ixgbe; + int r_idx = 0; /* - * clean each rx ring that has its bit set in the map + * Clean each rx ring that has its bit set in the map */ r_idx = bt_getlowbit(vect->rx_map, 0, (ixgbe->num_rx_rings - 1)); - while (r_idx >= 0) { ixgbe_intr_rx_work(&ixgbe->rx_rings[r_idx]); r_idx = bt_getlowbit(vect->rx_map, (r_idx + 1), (ixgbe->num_rx_rings - 1)); } + /* + * Clean each tx ring that has its bit set in the map + */ + r_idx = bt_getlowbit(vect->tx_map, 0, (ixgbe->num_tx_rings - 1)); + while (r_idx >= 0) { + ixgbe_intr_tx_work(&ixgbe->tx_rings[r_idx]); + r_idx = bt_getlowbit(vect->tx_map, (r_idx + 1), + (ixgbe->num_tx_rings - 1)); + } + return (DDI_INTR_CLAIMED); } /* - * ixgbe_intr_tx_other - Interrupt handler for both tx and other. + * ixgbe_intr_other - Interrupt handler for other. * - * Always look for Tx cleanup work. Only look for other work if the right - * bits are set in the Interrupt Cause Register. + * Only look for other work if the right bits are set in the + * Interrupt Cause Register. */ static uint_t -ixgbe_intr_tx_other(void *arg1, void *arg2) +ixgbe_intr_other(void *arg1, void *arg2) { _NOTE(ARGUNUSED(arg2)); ixgbe_t *ixgbe = (ixgbe_t *)arg1; @@ -3112,14 +3244,8 @@ ixgbe_intr_tx_other(void *arg1, void *arg2) eicr = IXGBE_READ_REG(hw, IXGBE_EICR); /* - * Always look for Tx cleanup work. We don't have separate - * transmit vectors, so we have only one tx ring enabled. - */ - ASSERT(ixgbe->num_tx_rings == 1); - ixgbe_intr_tx_work(&ixgbe->tx_rings[0]); - - /* - * Check for "other" causes. + * Need check cause bits and only link change will + * be processed */ if (eicr & IXGBE_EICR_LSC) { ixgbe_intr_other_work(ixgbe); @@ -3174,12 +3300,13 @@ ixgbe_alloc_intrs(ixgbe_t *ixgbe) } /* - * MSI-X not used, force rings to 1 + * MSI-X not used, force rings and groups to 1 */ ixgbe->num_rx_rings = 1; + ixgbe->num_rx_groups = 1; ixgbe->num_tx_rings = 1; ixgbe_log(ixgbe, - "MSI-X not used, force rx and tx queue number to 1"); + "MSI-X not used, force rings and groups number to 1"); /* * Install MSI interrupts @@ -3217,30 +3344,19 @@ ixgbe_alloc_intrs(ixgbe_t *ixgbe) * * For legacy and MSI, only 1 handle is needed. For MSI-X, * if fewer than 2 handles are available, return failure. - * Upon success, this sets the number of Rx rings to a number that - * matches the handles available for Rx interrupts. + * Upon success, this maps the vectors to rx and tx rings for + * interrupts. */ static int ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type) { dev_info_t *devinfo; int request, count, avail, actual; - int rx_rings, minimum; + int minimum; int rc; devinfo = ixgbe->dip; - /* - * Currently only 1 tx ring is supported. More tx rings - * will be supported with future enhancement. 
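/*
 * Illustrative sketch, not part of this changeset: the MSI-X sizing below
 * now requests one vector per rx ring, one per tx ring, plus one for
 * "other", clamped to the 82598 limit. The computation in isolation
 * (parameter names invented for the sketch):
 */
static int
msix_request(int num_rx_rings, int num_tx_rings, int max_ring_vector)
{
	int request = num_rx_rings + num_tx_rings + 1;

	/* never ask for more than the ring vectors plus the other vector */
	if (request > max_ring_vector + 1)
		request = max_ring_vector + 1;
	return (request);
}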
- */ - if (ixgbe->num_tx_rings > 1) { - ixgbe->num_tx_rings = 1; - ixgbe_log(ixgbe, - "Use only 1 MSI-X vector for tx, " - "force tx queue number to 1"); - } - switch (intr_type) { case DDI_INTR_TYPE_FIXED: request = 1; /* Request 1 legacy interrupt handle */ @@ -3257,11 +3373,11 @@ ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type) case DDI_INTR_TYPE_MSIX: /* * Best number of vectors for the adapter is - * # rx rings + # tx rings + 1 for other - * But currently we only support number of vectors of - * # rx rings + 1 for tx & other + * # rx rings + # tx rings + 1 for other. */ - request = ixgbe->num_rx_rings + 1; + request = ixgbe->num_rx_rings + ixgbe->num_tx_rings + 1; + if (request > (IXGBE_MAX_RING_VECTOR + 1)) + request = IXGBE_MAX_RING_VECTOR + 1; minimum = 2; IXGBE_DEBUGLOG_0(ixgbe, "interrupt type: MSI-X"); break; @@ -3327,9 +3443,8 @@ ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type) ixgbe->intr_cnt = actual; /* - * Now we know the actual number of vectors. Here we assume that - * tx and other will share 1 vector and all remaining (must be at - * least 1 remaining) will be used for rx. + * Now we know the actual number of vectors. Here we map the vector + * to other, rx rings and tx ring. */ if (actual < minimum) { ixgbe_log(ixgbe, "Insufficient interrupt handles available: %d", @@ -3338,19 +3453,6 @@ ixgbe_alloc_intr_handles(ixgbe_t *ixgbe, int intr_type) } /* - * For MSI-X, actual might force us to reduce number of rx rings - */ - if (intr_type == DDI_INTR_TYPE_MSIX) { - rx_rings = actual - 1; - if (rx_rings < ixgbe->num_rx_rings) { - ixgbe_log(ixgbe, - "MSI-X vectors force Rx queue number to %d", - rx_rings); - ixgbe->num_rx_rings = rx_rings; - } - } - - /* * Get priority for first vector, assume remaining are all the same */ rc = ddi_intr_get_pri(ixgbe->htable[0], &ixgbe->intr_pri); @@ -3386,56 +3488,47 @@ alloc_handle_fail: static int ixgbe_add_intr_handlers(ixgbe_t *ixgbe) { - ixgbe_rx_ring_t *rx_ring; - int vector; + int vector = 0; int rc; - int i; - - vector = 0; switch (ixgbe->intr_type) { case DDI_INTR_TYPE_MSIX: /* - * Add interrupt handler for tx + other - */ - rc = ddi_intr_add_handler(ixgbe->htable[vector], - (ddi_intr_handler_t *)ixgbe_intr_tx_other, - (void *)ixgbe, NULL); - if (rc != DDI_SUCCESS) { - ixgbe_log(ixgbe, - "Add tx/other interrupt handler failed: %d", rc); - return (IXGBE_FAILURE); - } - vector++; - - /* - * Add interrupt handler for each rx ring + * Add interrupt handler for rx and tx rings: vector[0 - + * (ixgbe->intr_cnt -1)]. */ - for (i = 0; i < ixgbe->num_rx_rings; i++) { - rx_ring = &ixgbe->rx_rings[i]; - + for (vector = 0; vector < (ixgbe->intr_cnt -1); vector++) { /* * install pointer to vect_map[vector] */ rc = ddi_intr_add_handler(ixgbe->htable[vector], - (ddi_intr_handler_t *)ixgbe_intr_rx, + (ddi_intr_handler_t *)ixgbe_intr_rx_tx, (void *)&ixgbe->vect_map[vector], NULL); if (rc != DDI_SUCCESS) { ixgbe_log(ixgbe, "Add rx interrupt handler failed. 
" - "return: %d, rx ring: %d", rc, i); + "return: %d, vector: %d", rc, vector); for (vector--; vector >= 0; vector--) { (void) ddi_intr_remove_handler( ixgbe->htable[vector]); } return (IXGBE_FAILURE); } + } - rx_ring->intr_vector = vector; - - vector++; + /* + * Add interrupt handler for other: vector[ixgbe->intr_cnt -1] + */ + rc = ddi_intr_add_handler(ixgbe->htable[vector], + (ddi_intr_handler_t *)ixgbe_intr_other, + (void *)ixgbe, NULL); + if (rc != DDI_SUCCESS) { + ixgbe_log(ixgbe, + "Add other interrupt handler failed: %d", rc); + return (IXGBE_FAILURE); } + break; case DDI_INTR_TYPE_MSI: @@ -3452,10 +3545,6 @@ ixgbe_add_intr_handlers(ixgbe_t *ixgbe) return (IXGBE_FAILURE); } - rx_ring = &ixgbe->rx_rings[0]; - rx_ring->intr_vector = vector; - - vector++; break; case DDI_INTR_TYPE_FIXED: @@ -3472,17 +3561,13 @@ ixgbe_add_intr_handlers(ixgbe_t *ixgbe) return (IXGBE_FAILURE); } - rx_ring = &ixgbe->rx_rings[0]; - rx_ring->intr_vector = vector; - - vector++; break; default: return (IXGBE_FAILURE); } - ASSERT(vector == ixgbe->intr_cnt); + ASSERT(vector == (ixgbe->intr_cnt -1)); return (IXGBE_SUCCESS); } @@ -3509,6 +3594,7 @@ ixgbe_map_rxring_to_vector(ixgbe_t *ixgbe, int r_idx, int v_idx) /* * Remember bit position */ + ixgbe->rx_rings[r_idx].intr_vector = v_idx; ixgbe->rx_rings[r_idx].vect_bit = 1 << v_idx; } @@ -3534,48 +3620,81 @@ ixgbe_map_txring_to_vector(ixgbe_t *ixgbe, int t_idx, int v_idx) /* * Remember bit position */ + ixgbe->tx_rings[t_idx].intr_vector = v_idx; ixgbe->tx_rings[t_idx].vect_bit = 1 << v_idx; } /* - * ixgbe_set_ivar - Set the given entry in the given interrupt vector + * ixgbe_setup_ivar - Set the given entry in the given interrupt vector * allocation register (IVAR). */ static void -ixgbe_set_ivar(ixgbe_t *ixgbe, uint16_t int_alloc_entry, uint8_t msix_vector) +ixgbe_setup_ivar(ixgbe_t *ixgbe, uint16_t intr_alloc_entry, uint8_t msix_vector) { struct ixgbe_hw *hw = &ixgbe->hw; u32 ivar, index; msix_vector |= IXGBE_IVAR_ALLOC_VAL; - index = (int_alloc_entry >> 2) & 0x1F; + index = (intr_alloc_entry >> 2) & 0x1F; + ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index)); + ivar &= ~(0xFF << (8 * (intr_alloc_entry & 0x3))); + ivar |= (msix_vector << (8 * (intr_alloc_entry & 0x3))); + IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar); +} + +/* + * ixgbe_enable_ivar - Enable the given entry by setting the VAL bit of + * given interrupt vector allocation register (IVAR). + */ +static void +ixgbe_enable_ivar(ixgbe_t *ixgbe, uint16_t intr_alloc_entry) +{ + struct ixgbe_hw *hw = &ixgbe->hw; + u32 ivar, index; + + index = (intr_alloc_entry >> 2) & 0x1F; + ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index)); + ivar |= (IXGBE_IVAR_ALLOC_VAL << (8 * (intr_alloc_entry & 0x3))); + IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar); +} + +/* + * ixgbe_enable_ivar - Disble the given entry by clearing the VAL bit of + * given interrupt vector allocation register (IVAR). + */ +static void +ixgbe_disable_ivar(ixgbe_t *ixgbe, uint16_t intr_alloc_entry) +{ + struct ixgbe_hw *hw = &ixgbe->hw; + u32 ivar, index; + + index = (intr_alloc_entry >> 2) & 0x1F; ivar = IXGBE_READ_REG(hw, IXGBE_IVAR(index)); - ivar &= ~(0xFF << (8 * (int_alloc_entry & 0x3))); - ivar |= (msix_vector << (8 * (int_alloc_entry & 0x3))); + ivar &= ~(IXGBE_IVAR_ALLOC_VAL << (8 * (intr_alloc_entry & 0x3))); IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar); } /* * ixgbe_map_rings_to_vectors - Map descriptor rings to interrupt vectors. 
* - * For msi-x, this currently implements only the scheme which is - * 1 vector for tx + other, 1 vector for each rx ring. + * For MSI-X, here will map rx and tx ring to vector[0 - (vectors -1)]. + * The last vector will be used for other interrupt. */ static int ixgbe_map_rings_to_vectors(ixgbe_t *ixgbe) { int i, vector = 0; - int vect_remain = ixgbe->intr_cnt; /* initialize vector map */ bzero(&ixgbe->vect_map, sizeof (ixgbe->vect_map)); /* - * non-MSI-X case is very simple: all interrupts on vector 0 + * non-MSI-X case is very simple: rx rings[0] on RTxQ[0], + * tx rings[0] on RTxQ[1]. */ if (ixgbe->intr_type != DDI_INTR_TYPE_MSIX) { ixgbe_map_rxring_to_vector(ixgbe, 0, 0); - ixgbe_map_txring_to_vector(ixgbe, 0, 0); + ixgbe_map_txring_to_vector(ixgbe, 0, 1); return (IXGBE_SUCCESS); } @@ -3584,16 +3703,19 @@ ixgbe_map_rings_to_vectors(ixgbe_t *ixgbe) */ /* - * Map vector 0 to tx + * Map vectors to rx rings */ - ixgbe_map_txring_to_vector(ixgbe, 0, vector++); - vect_remain--; + for (i = 0; i < ixgbe->num_rx_rings; i++) { + ixgbe_map_rxring_to_vector(ixgbe, i, vector); + vector = (vector +1) % (ixgbe->intr_cnt -1); + } /* - * Map remaining vectors to rx rings + * Map vectors to tx rings */ - for (i = 0; i < vect_remain; i++) { - ixgbe_map_rxring_to_vector(ixgbe, i, vector++); + for (i = 0; i < ixgbe->num_tx_rings; i++) { + ixgbe_map_txring_to_vector(ixgbe, i, vector); + vector = (vector +1) % (ixgbe->intr_cnt -1); } return (IXGBE_SUCCESS); @@ -3602,16 +3724,16 @@ ixgbe_map_rings_to_vectors(ixgbe_t *ixgbe) /* * ixgbe_setup_adapter_vector - Setup the adapter interrupt vector(s). * - * This relies on queue/vector mapping already set up in the + * This relies on ring/vector mapping already set up in the * vect_map[] structures */ static void ixgbe_setup_adapter_vector(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; - ixgbe_ring_vector_t *vect; /* vector bitmap */ - int r_idx; /* ring index */ - int v_idx; /* vector index */ + ixgbe_ring_vector_t *vect; /* vector bitmap */ + int r_idx; /* ring index */ + int v_idx; /* vector index */ /* * Clear any previous entries @@ -3620,9 +3742,20 @@ ixgbe_setup_adapter_vector(ixgbe_t *ixgbe) IXGBE_WRITE_REG(hw, IXGBE_IVAR(v_idx), 0); /* - * "Other" is always on vector 0 + * For non MSI-X interrupt, rx rings[0] will use RTxQ[0], and + * tx rings[0] will use RTxQ[1]. + */ + if (ixgbe->intr_type != DDI_INTR_TYPE_MSIX) { + ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(0), 0); + ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_TX_QUEUE(0), 1); + return; + } + + /* + * For MSI-X interrupt, "Other" is always on last vector. 
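/*
 * Illustrative sketch, not part of this changeset: the new mapping deals
 * rx rings and then tx rings round-robin over the vectors left after the
 * last one is reserved for "other". A standalone model with assumed example
 * counts, runnable as-is:
 */
#include <stdio.h>

int
main(void)
{
	int num_rx = 8, num_tx = 8, intr_cnt = 5;	/* assumed sizes */
	int ring_vectors = intr_cnt - 1;	/* last vector is "other" */
	int vector = 0, i;

	for (i = 0; i < num_rx; i++) {
		printf("rx ring %d -> vector %d\n", i, vector);
		vector = (vector + 1) % ring_vectors;
	}
	/* tx rings continue from wherever the rx assignment stopped */
	for (i = 0; i < num_tx; i++) {
		printf("tx ring %d -> vector %d\n", i, vector);
		vector = (vector + 1) % ring_vectors;
	}
	return (0);
}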
 	 */
-	ixgbe_set_ivar(ixgbe, IXGBE_IVAR_OTHER_CAUSES_INDEX, 0);
+	ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_OTHER_CAUSES_INDEX,
+	    (ixgbe->intr_cnt - 1));
 
 	/*
 	 * For each interrupt vector, populate the IVAR table
@@ -3637,7 +3770,7 @@ ixgbe_setup_adapter_vector(ixgbe_t *ixgbe)
 		    (ixgbe->num_rx_rings - 1));
 
 		while (r_idx >= 0) {
-			ixgbe_set_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx),
+			ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx),
 			    v_idx);
 			r_idx = bt_getlowbit(vect->rx_map, (r_idx + 1),
 			    (ixgbe->num_rx_rings - 1));
@@ -3650,7 +3783,7 @@ ixgbe_setup_adapter_vector(ixgbe_t *ixgbe)
 		    (ixgbe->num_tx_rings - 1));
 
 		while (r_idx >= 0) {
-			ixgbe_set_ivar(ixgbe, IXGBE_IVAR_TX_QUEUE(r_idx),
+			ixgbe_setup_ivar(ixgbe, IXGBE_IVAR_TX_QUEUE(r_idx),
 			    v_idx);
 			r_idx = bt_getlowbit(vect->tx_map, (r_idx + 1),
 			    (ixgbe->num_tx_rings - 1));
@@ -3996,3 +4129,231 @@ ixgbe_fm_ereport(ixgbe_t *ixgbe, char *detail)
 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, NULL);
 	}
 }
+
+static int
+ixgbe_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
+{
+	ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)rh;
+
+	mutex_enter(&rx_ring->rx_lock);
+	rx_ring->ring_gen_num = mr_gen_num;
+	mutex_exit(&rx_ring->rx_lock);
+	return (0);
+}
+
+/*
+ * Callback function for the MAC layer to register all rings.
+ */
+/* ARGSUSED */
+void
+ixgbe_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
+    const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
+{
+	ixgbe_t *ixgbe = (ixgbe_t *)arg;
+	mac_intr_t *mintr = &infop->mri_intr;
+
+	switch (rtype) {
+	case MAC_RING_TYPE_RX: {
+		ASSERT(rg_index == 0);
+		ASSERT(ring_index < ixgbe->num_rx_rings);
+
+		ixgbe_rx_ring_t *rx_ring = &ixgbe->rx_rings[ring_index];
+		rx_ring->ring_handle = rh;
+
+		infop->mri_driver = (mac_ring_driver_t)rx_ring;
+		infop->mri_start = ixgbe_ring_start;
+		infop->mri_stop = NULL;
+		infop->mri_poll = ixgbe_ring_rx_poll;
+
+		mintr->mi_handle = (mac_intr_handle_t)rx_ring;
+		mintr->mi_enable = ixgbe_rx_ring_intr_enable;
+		mintr->mi_disable = ixgbe_rx_ring_intr_disable;
+
+		break;
+	}
+	case MAC_RING_TYPE_TX: {
+		ASSERT(rg_index == -1);
+		ASSERT(ring_index < ixgbe->num_tx_rings);
+
+		ixgbe_tx_ring_t *tx_ring = &ixgbe->tx_rings[ring_index];
+		tx_ring->ring_handle = rh;
+
+		infop->mri_driver = (mac_ring_driver_t)tx_ring;
+		infop->mri_start = NULL;
+		infop->mri_stop = NULL;
+		infop->mri_tx = ixgbe_ring_tx;
+
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+/*
+ * Callback function for the MAC layer to register all groups.
+ */
+void
+ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index,
+    mac_group_info_t *infop, mac_group_handle_t gh)
+{
+	ixgbe_t *ixgbe = (ixgbe_t *)arg;
+
+	switch (rtype) {
+	case MAC_RING_TYPE_RX: {
+		ixgbe_rx_group_t *rx_group;
+
+		rx_group = &ixgbe->rx_groups[index];
+		rx_group->group_handle = gh;
+
+		infop->mgi_driver = (mac_group_driver_t)rx_group;
+		infop->mgi_start = NULL;
+		infop->mgi_stop = NULL;
+		infop->mgi_addmac = ixgbe_addmac;
+		infop->mgi_remmac = ixgbe_remmac;
+		infop->mgi_count = (ixgbe->num_rx_rings / ixgbe->num_rx_groups);
+
+		break;
+	}
+	case MAC_RING_TYPE_TX:
+		break;
+	default:
+		break;
+	}
+}
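/*
 * Illustrative sketch, not part of this changeset: mri_start hands the
 * driver a generation number, stored above under rx_lock; every
 * mac_rx_ring() and poll delivery is stamped with it, presumably so the
 * framework can discard packets belonging to a ring that was quiesced and
 * restarted in between. A minimal model of that guard, with invented names:
 */
#include <stdint.h>

typedef struct ring_state {
	uint64_t gen;	/* generation from the latest ring start */
} ring_state_t;

/* Accept a delivery only if it was produced under the current generation. */
static int
delivery_current(const ring_state_t *rs, uint64_t stamped_gen)
{
	return (stamped_gen == rs->gen);
}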
+
+/*
+ * Enable interrupt on the specified rx ring.
+ */
+int
+ixgbe_rx_ring_intr_enable(mac_intr_handle_t intrh)
+{
+	ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)intrh;
+	ixgbe_t *ixgbe = rx_ring->ixgbe;
+	int r_idx = rx_ring->index;
+	int v_idx = rx_ring->intr_vector;
+
+	mutex_enter(&ixgbe->gen_lock);
+	ASSERT(BT_TEST(ixgbe->vect_map[v_idx].rx_map, r_idx) == 0);
+
+	/*
+	 * Enable the interrupt by setting the VAL bit of the given
+	 * interrupt vector allocation register (IVAR).
+	 */
+	ixgbe_enable_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx));
+
+	BT_SET(ixgbe->vect_map[v_idx].rx_map, r_idx);
+	mutex_exit(&ixgbe->gen_lock);
+
+	return (0);
+}
+
+/*
+ * Disable interrupt on the specified rx ring.
+ */
+int
+ixgbe_rx_ring_intr_disable(mac_intr_handle_t intrh)
+{
+	ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)intrh;
+	ixgbe_t *ixgbe = rx_ring->ixgbe;
+	int r_idx = rx_ring->index;
+	int v_idx = rx_ring->intr_vector;
+
+	mutex_enter(&ixgbe->gen_lock);
+
+	ASSERT(BT_TEST(ixgbe->vect_map[v_idx].rx_map, r_idx) == 1);
+
+	/*
+	 * Disable the interrupt by clearing the VAL bit of the given
+	 * interrupt vector allocation register (IVAR).
+	 */
+	ixgbe_disable_ivar(ixgbe, IXGBE_IVAR_RX_QUEUE(r_idx));
+
+	BT_CLEAR(ixgbe->vect_map[v_idx].rx_map, r_idx);
+
+	mutex_exit(&ixgbe->gen_lock);
+
+	return (0);
+}
+
+/*
+ * Add a mac address.
+ */
+static int
+ixgbe_addmac(void *arg, const uint8_t *mac_addr)
+{
+	ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)arg;
+	ixgbe_t *ixgbe = rx_group->ixgbe;
+	int slot;
+	int err;
+
+	mutex_enter(&ixgbe->gen_lock);
+
+	if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
+		mutex_exit(&ixgbe->gen_lock);
+		return (ECANCELED);
+	}
+
+	if (ixgbe->unicst_avail == 0) {
+		/* no slots available */
+		mutex_exit(&ixgbe->gen_lock);
+		return (ENOSPC);
+	}
+
+	for (slot = 0; slot < ixgbe->unicst_total; slot++) {
+		if (ixgbe->unicst_addr[slot].mac.set == 0)
+			break;
+	}
+
+	ASSERT((slot >= 0) && (slot < ixgbe->unicst_total));
+
+	if ((err = ixgbe_unicst_set(ixgbe, mac_addr, slot)) == 0) {
+		ixgbe->unicst_addr[slot].mac.set = 1;
+		ixgbe->unicst_avail--;
+	}
+
+	mutex_exit(&ixgbe->gen_lock);
+
+	return (err);
+}
+
+/*
+ * Remove a mac address.
+ */
+static int
+ixgbe_remmac(void *arg, const uint8_t *mac_addr)
+{
+	ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)arg;
+	ixgbe_t *ixgbe = rx_group->ixgbe;
+	int slot;
+	int err;
+
+	mutex_enter(&ixgbe->gen_lock);
+
+	if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) {
+		mutex_exit(&ixgbe->gen_lock);
+		return (ECANCELED);
+	}
+
+	slot = ixgbe_unicst_find(ixgbe, mac_addr);
+	if (slot == -1) {
+		mutex_exit(&ixgbe->gen_lock);
+		return (EINVAL);
+	}
+
+	if (ixgbe->unicst_addr[slot].mac.set == 0) {
+		mutex_exit(&ixgbe->gen_lock);
+		return (EINVAL);
+	}
+
+	bzero(ixgbe->unicst_addr[slot].mac.addr, ETHERADDRL);
+	if ((err = ixgbe_unicst_set(ixgbe,
+	    ixgbe->unicst_addr[slot].mac.addr, slot)) == 0) {
+		ixgbe->unicst_addr[slot].mac.set = 0;
+		ixgbe->unicst_avail++;
+	}
+
+	mutex_exit(&ixgbe->gen_lock);
+
+	return (err);
+}
diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_rx.c b/usr/src/uts/common/io/ixgbe/ixgbe_rx.c
index 3f09a4215d..63e42cede2 100644
--- a/usr/src/uts/common/io/ixgbe/ixgbe_rx.c
+++ b/usr/src/uts/common/io/ixgbe/ixgbe_rx.c
@@ -1,19 +1,17 @@
 /*
  * CDDL HEADER START
  *
- * Copyright(c) 2007-2008 Intel Corporation. All rights reserved.
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
- * You can obtain a copy of the license at:
- * http://www.opensolaris.org/os/licensing.
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "ixgbe_sw.h" @@ -176,7 +176,10 @@ ixgbe_rx_bind(ixgbe_rx_ring_t *rx_ring, uint32_t index, uint32_t pkt_len) * DMA buffer, we have to return and use bcopy to * process the packet. */ - if (current_rcb->mp == NULL) { + if (current_rcb->mp != NULL) { + current_rcb->mp->b_rptr += IPHDR_ALIGN_ROOM; + current_rcb->mp->b_wptr += IPHDR_ALIGN_ROOM; + } else { atomic_inc_32(&rx_ring->rcb_free); return (NULL); } @@ -246,7 +249,7 @@ ixgbe_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error) } /* - * ixgbe_rx - Receive the data of one ring. + * ixgbe_ring_rx - Receive the data of one ring. * * This function goes throught h/w descriptor in one specified rx ring, * receives the data if the descriptor status shows the data is ready. @@ -254,7 +257,7 @@ ixgbe_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error) * passed up to mac_rx(). 
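/*
 * Illustrative sketch, not part of this changeset: ixgbe_ring_rx() now
 * takes a byte budget; interrupt delivery passes IXGBE_POLL_NULL (-1,
 * meaning unlimited) while the poll path passes the caller's n_bytes. The
 * stop condition in isolation, with a local POLL_NULL stand-in:
 */
#include <stdint.h>

#define	POLL_NULL	(-1)

/* Return 1 if accepting a pkt_len-byte frame would exceed the budget. */
static int
budget_exceeded(int poll_bytes, uint32_t received, uint32_t pkt_len)
{
	/* poll_bytes is asserted non-negative on the poll path */
	return (poll_bytes != POLL_NULL &&
	    (received + pkt_len) > (uint32_t)poll_bytes);
}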
*/ mblk_t * -ixgbe_rx(ixgbe_rx_ring_t *rx_ring) +ixgbe_ring_rx(ixgbe_rx_ring_t *rx_ring, int poll_bytes) { union ixgbe_adv_rx_desc *current_rbd; rx_control_block_t *current_rcb; @@ -266,6 +269,7 @@ ixgbe_rx(ixgbe_rx_ring_t *rx_ring) uint32_t pkt_len; uint32_t status_error; uint32_t pkt_num; + uint32_t received_bytes; ixgbe_t *ixgbe = rx_ring->ixgbe; struct ixgbe_hw *hw = &ixgbe->hw; @@ -289,6 +293,7 @@ ixgbe_rx(ixgbe_rx_ring_t *rx_ring) rx_next = rx_ring->rbd_next; current_rbd = &rx_ring->rbd_ring[rx_next]; + received_bytes = 0; pkt_num = 0; status_error = current_rbd->wb.upper.status_error; while (status_error & IXGBE_RXD_STAT_DD) { @@ -309,6 +314,13 @@ ixgbe_rx(ixgbe_rx_ring_t *rx_ring) (status_error & IXGBE_RXDADV_ERR_IPE)); pkt_len = current_rbd->wb.upper.length; + + if ((poll_bytes != IXGBE_POLL_NULL) && + ((received_bytes + pkt_len) > poll_bytes)) + break; + + received_bytes += pkt_len; + mp = NULL; /* * For packets with length more than the copy threshold, @@ -378,3 +390,21 @@ rx_discard: return (mblk_head); } + +mblk_t * +ixgbe_ring_rx_poll(void *arg, int n_bytes) +{ + ixgbe_rx_ring_t *rx_ring = (ixgbe_rx_ring_t *)arg; + mblk_t *mp = NULL; + + ASSERT(n_bytes >= 0); + + if (n_bytes == 0) + return (mp); + + mutex_enter(&rx_ring->rx_lock); + mp = ixgbe_ring_rx(rx_ring, n_bytes); + mutex_exit(&rx_ring->rx_lock); + + return (mp); +} diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_stat.c b/usr/src/uts/common/io/ixgbe/ixgbe_stat.c index 776af1fba4..00eccf23a2 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_stat.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_stat.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,11 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ #include "ixgbe_sw.h" @@ -87,17 +87,29 @@ ixgbe_update_stats(kstat_t *ks, int rw) ixgbe_ks->tx_reschedule.value.ui64 += ixgbe->tx_rings[i].stat_reschedule; } +#endif /* * Hardware calculated statistics. 
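/*
 * Illustrative sketch, not part of this changeset: the QPRC/QPTC/QBRC/QBTC
 * loop that follows keeps a running software total per queue (the +=
 * accumulation suggests these hardware counters clear on read) and derives
 * the global gprc/gptc/tor/tot figures by summing the per-queue
 * accumulators. The same shape, standalone:
 */
#include <stdint.h>

#define	NQ	16

/* qprc[] accumulates clear-on-read hardware samples; returns the total. */
static uint64_t
accumulate(uint64_t qprc[NQ], const uint32_t hw_read[NQ])
{
	uint64_t total = 0;
	int i;

	for (i = 0; i < NQ; i++) {
		qprc[i] += hw_read[i];	/* add this interval's sample */
		total += qprc[i];	/* global counter = per-queue sum */
	}
	return (total);
}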
*/ + ixgbe_ks->gprc.value.ui64 = 0; + ixgbe_ks->gptc.value.ui64 = 0; + ixgbe_ks->tor.value.ui64 = 0; + ixgbe_ks->tot.value.ui64 = 0; for (i = 0; i < 16; i++) { - ixgbe_ks->gprc.value.ul += IXGBE_READ_REG(hw, IXGBE_QPRC(i)); - ixgbe_ks->gptc.value.ul += IXGBE_READ_REG(hw, IXGBE_QPTC(i)); - ixgbe_ks->tor.value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBRC(i)); - ixgbe_ks->tot.value.ui64 += IXGBE_READ_REG(hw, IXGBE_QBTC(i)); + ixgbe_ks->qprc[i].value.ui64 += + IXGBE_READ_REG(hw, IXGBE_QPRC(i)); + ixgbe_ks->gprc.value.ui64 += ixgbe_ks->qprc[i].value.ui64; + ixgbe_ks->qptc[i].value.ui64 += + IXGBE_READ_REG(hw, IXGBE_QPTC(i)); + ixgbe_ks->gptc.value.ui64 += ixgbe_ks->qptc[i].value.ui64; + ixgbe_ks->qbrc[i].value.ui64 += + IXGBE_READ_REG(hw, IXGBE_QBRC(i)); + ixgbe_ks->tor.value.ui64 += ixgbe_ks->qbrc[i].value.ui64; + ixgbe_ks->qbtc[i].value.ui64 += + IXGBE_READ_REG(hw, IXGBE_QBTC(i)); + ixgbe_ks->tot.value.ui64 += ixgbe_ks->qbtc[i].value.ui64; } - /* * This is a Workaround: * Currently h/w GORCH, GOTCH, TORH registers are not @@ -124,7 +136,6 @@ ixgbe_update_stats(kstat_t *ks, int rw) ixgbe_ks->ptc511.value.ul += IXGBE_READ_REG(hw, IXGBE_PTC511); ixgbe_ks->ptc1023.value.ul += IXGBE_READ_REG(hw, IXGBE_PTC1023); ixgbe_ks->ptc1522.value.ul += IXGBE_READ_REG(hw, IXGBE_PTC1522); -#endif ixgbe_ks->mspdc.value.ui64 += IXGBE_READ_REG(hw, IXGBE_MSPDC); for (i = 0; i < 8; i++) @@ -200,6 +211,7 @@ ixgbe_init_stats(ixgbe_t *ixgbe) KSTAT_DATA_UINT64); kstat_named_init(&ixgbe_ks->tx_reschedule, "tx_reschedule", KSTAT_DATA_UINT64); +#endif kstat_named_init(&ixgbe_ks->gprc, "good_pkts_recvd", KSTAT_DATA_UINT64); @@ -233,7 +245,138 @@ ixgbe_init_stats(ixgbe_t *ixgbe) KSTAT_DATA_UINT64); kstat_named_init(&ixgbe_ks->ptc1522, "pkts_xmitd_(1024-1522b)", KSTAT_DATA_UINT64); -#endif + + kstat_named_init(&ixgbe_ks->qprc[0], "queue_pkts_recvd [ 0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[1], "queue_pkts_recvd [ 1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[2], "queue_pkts_recvd [ 2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[3], "queue_pkts_recvd [ 3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[4], "queue_pkts_recvd [ 4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[5], "queue_pkts_recvd [ 5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[6], "queue_pkts_recvd [ 6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[7], "queue_pkts_recvd [ 7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[8], "queue_pkts_recvd [ 8]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[9], "queue_pkts_recvd [ 9]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[10], "queue_pkts_recvd [10]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[11], "queue_pkts_recvd [11]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[12], "queue_pkts_recvd [12]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[13], "queue_pkts_recvd [13]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[14], "queue_pkts_recvd [14]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qprc[15], "queue_pkts_recvd [15]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ixgbe_ks->qptc[0], "queue_pkts_xmitd [ 0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[1], "queue_pkts_xmitd [ 1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[2], "queue_pkts_xmitd [ 2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[3], "queue_pkts_xmitd [ 3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[4], 
"queue_pkts_xmitd [ 4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[5], "queue_pkts_xmitd [ 5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[6], "queue_pkts_xmitd [ 6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[7], "queue_pkts_xmitd [ 7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[8], "queue_pkts_xmitd [ 8]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[9], "queue_pkts_xmitd [ 9]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[10], "queue_pkts_xmitd [10]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[11], "queue_pkts_xmitd [11]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[12], "queue_pkts_xmitd [12]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[13], "queue_pkts_xmitd [13]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[14], "queue_pkts_xmitd [14]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qptc[15], "queue_pkts_xmitd [15]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ixgbe_ks->qbrc[0], "queue_bytes_recvd [ 0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[1], "queue_bytes_recvd [ 1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[2], "queue_bytes_recvd [ 2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[3], "queue_bytes_recvd [ 3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[4], "queue_bytes_recvd [ 4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[5], "queue_bytes_recvd [ 5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[6], "queue_bytes_recvd [ 6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[7], "queue_bytes_recvd [ 7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[8], "queue_bytes_recvd [ 8]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[9], "queue_bytes_recvd [ 9]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[10], "queue_bytes_recvd [10]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[11], "queue_bytes_recvd [11]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[12], "queue_bytes_recvd [12]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[13], "queue_bytes_recvd [13]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[14], "queue_bytes_recvd [14]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbrc[15], "queue_bytes_recvd [15]", + KSTAT_DATA_UINT64); + + kstat_named_init(&ixgbe_ks->qbtc[0], "queue_bytes_xmitd [ 0]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[1], "queue_bytes_xmitd [ 1]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[2], "queue_bytes_xmitd [ 2]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[3], "queue_bytes_xmitd [ 3]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[4], "queue_bytes_xmitd [ 4]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[5], "queue_bytes_xmitd [ 5]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[6], "queue_bytes_xmitd [ 6]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[7], "queue_bytes_xmitd [ 7]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[8], "queue_bytes_xmitd [ 8]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[9], "queue_bytes_xmitd [ 9]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[10], "queue_bytes_xmitd [10]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[11], "queue_bytes_xmitd [11]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[12], "queue_bytes_xmitd [12]", + KSTAT_DATA_UINT64); + 
kstat_named_init(&ixgbe_ks->qbtc[13], "queue_bytes_xmitd [13]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[14], "queue_bytes_xmitd [14]", + KSTAT_DATA_UINT64); + kstat_named_init(&ixgbe_ks->qbtc[15], "queue_bytes_xmitd [15]", + KSTAT_DATA_UINT64); kstat_named_init(&ixgbe_ks->mspdc, "mac_short_packet_discard", KSTAT_DATA_UINT64); diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h index 390233fff5..f648c57a18 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h +++ b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,15 +20,17 @@ */ /* + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. + */ + +/* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Use is subject to license terms. 
*/ #ifndef _IXGBE_SW_H #define _IXGBE_SW_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -48,7 +48,7 @@ extern "C" { #include <sys/modctl.h> #include <sys/errno.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/vlan.h> #include <sys/ddi.h> @@ -89,6 +89,8 @@ extern "C" { #define IXGBE_INTR_MSI 2 #define IXGBE_INTR_LEGACY 3 +#define IXGBE_POLL_NULL -1 + #define MAX_COOKIE 18 #define MIN_NUM_TX_DESC 2 @@ -102,6 +104,7 @@ extern "C" { */ #define MAX_TX_QUEUE_NUM 32 #define MAX_RX_QUEUE_NUM 64 +#define MAX_RX_GROUP_NUM 1 #define MAX_TX_RING_SIZE 4096 #define MAX_RX_RING_SIZE 4096 @@ -121,6 +124,7 @@ extern "C" { */ #define MIN_TX_QUEUE_NUM 1 #define MIN_RX_QUEUE_NUM 1 +#define MIN_RX_GROUP_NUM 1 #define MIN_TX_RING_SIZE 64 #define MIN_RX_RING_SIZE 64 @@ -136,17 +140,18 @@ extern "C" { /* * Default values for user configurable parameters */ -#define DEFAULT_TX_QUEUE_NUM 1 -#define DEFAULT_RX_QUEUE_NUM 1 -#define DEFAULT_TX_RING_SIZE 512 -#define DEFAULT_RX_RING_SIZE 512 +#define DEFAULT_TX_QUEUE_NUM 8 +#define DEFAULT_RX_QUEUE_NUM 8 +#define DEFAULT_RX_GROUP_NUM 1 +#define DEFAULT_TX_RING_SIZE 1024 +#define DEFAULT_RX_RING_SIZE 1024 #define DEFAULT_MTU ETHERMTU #define DEFAULT_RX_LIMIT_PER_INTR 256 #define DEFAULT_INTR_THROTTLING 200 /* In unit of 256 nsec */ #define DEFAULT_RX_COPY_THRESHOLD 128 #define DEFAULT_TX_COPY_THRESHOLD 512 -#define DEFAULT_TX_RECYCLE_THRESHOLD MAX_COOKIE +#define DEFAULT_TX_RECYCLE_THRESHOLD (MAX_COOKIE + 1) #define DEFAULT_TX_OVERLOAD_THRESHOLD MIN_NUM_TX_DESC #define DEFAULT_TX_RESCHED_THRESHOLD 128 #define DEFAULT_FCRTH 0x20000 @@ -156,6 +161,14 @@ extern "C" { #define DEFAULT_TX_HCKSUM_ENABLE B_TRUE #define DEFAULT_RX_HCKSUM_ENABLE B_TRUE #define DEFAULT_LSO_ENABLE B_TRUE +#define DEFAULT_MR_ENABLE B_TRUE +#define DEFAULT_TX_HEAD_WB_ENABLE B_TRUE + +#define IXGBE_LSO_MAXLEN 65535 + +#define DEFAULT_TX_HCKSUM_ENABLE B_TRUE +#define DEFAULT_RX_HCKSUM_ENABLE B_TRUE +#define DEFAULT_LSO_ENABLE B_TRUE #define DEFAULT_TX_HEAD_WB_ENABLE B_TRUE #define IXGBE_LSO_MAXLEN 65535 @@ -167,11 +180,12 @@ extern "C" { #define MAX_LINK_DOWN_TIMEOUT 8 /* 8 seconds */ /* - * limits on msi-x vectors for 82598 + * Limits on msi-x vectors for 82598 */ -#define IXGBE_MAX_INTR_VECTOR 18 -#define IXGBE_MAX_OTHER_VECTOR 2 -#define IXGBE_MAX_RING_VECTOR (IXGBE_MAX_INTR_VECTOR - IXGBE_MAX_OTHER_VECTOR) +#define IXGBE_MAX_INTR_VECTOR 18 +#define IXGBE_MAX_OTHER_VECTOR 1 +#define IXGBE_MAX_TCP_TIMER_VECTOR 1 +#define IXGBE_MAX_RING_VECTOR 16 /* * Extra register bit masks for 82598 @@ -209,11 +223,13 @@ extern "C" { #define PROP_TX_RING_SIZE "tx_ring_size" #define PROP_RX_QUEUE_NUM "rx_queue_number" #define PROP_RX_RING_SIZE "rx_ring_size" +#define PROP_RX_GROUP_NUM "rx_group_number" #define PROP_INTR_FORCE "intr_force" #define PROP_TX_HCKSUM_ENABLE "tx_hcksum_enable" #define PROP_RX_HCKSUM_ENABLE "rx_hcksum_enable" #define PROP_LSO_ENABLE "lso_enable" +#define PROP_MR_ENABLE "mr_enable" #define PROP_TX_HEAD_WB_ENABLE "tx_head_wb_enable" #define PROP_TX_COPY_THRESHOLD "tx_copy_threshold" #define PROP_TX_RECYCLE_THRESHOLD "tx_recycle_threshold" @@ -264,9 +280,6 @@ enum ioc_reply { IOC_REPLY /* OK, just send reply */ }; -#define MBLK_LEN(mp) ((uintptr_t)(mp)->b_wptr - \ - (uintptr_t)(mp)->b_rptr) - #define DMA_SYNC(area, flag) ((void) ddi_dma_sync((area)->dma_handle, \ 0, 0, (flag))) @@ -533,13 +546,15 @@ typedef struct ixgbe_tx_ring { uint32_t stat_fail_no_tcb; uint32_t stat_fail_dma_bind; 
uint32_t stat_reschedule; + uint32_t stat_lso_header_fail; #endif + mac_ring_handle_t ring_handle; + /* * Pointer to the ixgbe struct */ struct ixgbe *ixgbe; - } ixgbe_tx_ring_t; /* @@ -590,11 +605,22 @@ typedef struct ixgbe_rx_ring { uint32_t stat_exceed_pkt; #endif - struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ + mac_ring_handle_t ring_handle; + uint64_t ring_gen_num; + struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ } ixgbe_rx_ring_t; /* + * Software Receive Ring Group + */ +typedef struct ixgbe_rx_group { + uint32_t index; /* Group index */ + mac_group_handle_t group_handle; /* call back group handle */ + struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ +} ixgbe_rx_group_t; + +/* * structure to map ring cleanup to msi-x vector */ typedef struct ixgbe_ring_vector { @@ -641,6 +667,12 @@ typedef struct ixgbe { uint32_t rx_buf_size; /* Rx buffer size */ /* + * Receive Groups + */ + ixgbe_rx_group_t *rx_groups; /* Array of rx groups */ + uint32_t num_rx_groups; /* Number of rx groups in use */ + + /* * Transmit Rings */ ixgbe_tx_ring_t *tx_rings; /* Array of tx rings */ @@ -651,6 +683,7 @@ typedef struct ixgbe { boolean_t tx_head_wb_enable; /* Tx head wrtie-back */ boolean_t tx_hcksum_enable; /* Tx h/w cksum offload */ boolean_t lso_enable; /* Large Segment Offload */ + boolean_t mr_enable; /* Multiple Tx and Rx Ring */ uint32_t tx_copy_thresh; /* Tx copy threshold */ uint32_t tx_recycle_thresh; /* Tx recycle threshold */ uint32_t tx_overload_thresh; /* Tx overload threshold */ @@ -684,6 +717,8 @@ typedef struct ixgbe { uint32_t mcast_count; struct ether_addr mcast_table[MAX_NUM_MULTICAST_ADDRESSES]; + ulong_t sys_page_size; + /* * Kstat definitions */ @@ -694,13 +729,11 @@ typedef struct ixgbe { */ caddr_t nd_data; nd_param_t nd_params[PARAM_COUNT]; - } ixgbe_t; typedef struct ixgbe_stat { - kstat_named_t link_speed; /* Link Speed */ -#ifdef IXGBE_DEBUG + kstat_named_t reset_count; /* Reset Count */ kstat_named_t rx_frame_error; /* Rx Error in Packet */ @@ -729,7 +762,11 @@ typedef struct ixgbe_stat { kstat_named_t ptc511; /* Packets Xmitted (255-511b) */ kstat_named_t ptc1023; /* Packets Xmitted (512-1023b) */ kstat_named_t ptc1522; /* Packets Xmitted (1024-1522b */ -#endif + kstat_named_t qprc[16]; /* Queue Packets Received Count */ + kstat_named_t qptc[16]; /* Queue Packets Transmitted Count */ + kstat_named_t qbrc[16]; /* Queue Bytes Received Count */ + kstat_named_t qbtc[16]; /* Queue Bytes Transmitted Count */ + kstat_named_t crcerrs; /* CRC Error Count */ kstat_named_t illerrc; /* Illegal Byte Error Count */ kstat_named_t errbc; /* Error Byte Count */ @@ -770,7 +807,6 @@ void ixgbe_set_fma_flags(int, int); int ixgbe_start(ixgbe_t *); void ixgbe_stop(ixgbe_t *); int ixgbe_driver_setup_link(ixgbe_t *, boolean_t); -int ixgbe_unicst_set(ixgbe_t *, const uint8_t *, mac_addr_slot_t); int ixgbe_multicst_add(ixgbe_t *, const uint8_t *); int ixgbe_multicst_remove(ixgbe_t *, const uint8_t *); enum ioc_reply ixgbe_loopback_ioctl(ixgbe_t *, struct iocblk *, mblk_t *); @@ -783,6 +819,13 @@ int ixgbe_check_acc_handle(ddi_acc_handle_t handle); int ixgbe_check_dma_handle(ddi_dma_handle_t handle); void ixgbe_fm_ereport(ixgbe_t *, char *); +void ixgbe_fill_ring(void *, mac_ring_type_t, const int, const int, + mac_ring_info_t *, mac_ring_handle_t); +void ixgbe_fill_group(void *arg, mac_ring_type_t, const int, + mac_group_info_t *, mac_group_handle_t); +int ixgbe_rx_ring_intr_enable(mac_intr_handle_t); +int ixgbe_rx_ring_intr_disable(mac_intr_handle_t); + /* * Function prototypes in 
ixgbe_gld.c */ @@ -790,26 +833,22 @@ int ixgbe_m_start(void *); void ixgbe_m_stop(void *); int ixgbe_m_promisc(void *, boolean_t); int ixgbe_m_multicst(void *, boolean_t, const uint8_t *); -int ixgbe_m_unicst(void *, const uint8_t *); int ixgbe_m_stat(void *, uint_t, uint64_t *); void ixgbe_m_resources(void *); void ixgbe_m_ioctl(void *, queue_t *, mblk_t *); -int ixgbe_m_unicst_add(void *, mac_multi_addr_t *); -int ixgbe_m_unicst_remove(void *, mac_addr_slot_t); -int ixgbe_m_unicst_modify(void *, mac_multi_addr_t *); -int ixgbe_m_unicst_get(void *, mac_multi_addr_t *); boolean_t ixgbe_m_getcapab(void *, mac_capab_t, void *); /* * Function prototypes in ixgbe_rx.c */ -mblk_t *ixgbe_rx(ixgbe_rx_ring_t *); +mblk_t *ixgbe_ring_rx(ixgbe_rx_ring_t *, int); void ixgbe_rx_recycle(caddr_t arg); +mblk_t *ixgbe_ring_rx_poll(void *, int); /* * Function prototypes in ixgbe_tx.c */ -mblk_t *ixgbe_m_tx(void *, mblk_t *); +mblk_t *ixgbe_ring_tx(void *, mblk_t *); void ixgbe_free_tcb(tx_control_block_t *); void ixgbe_put_free_list(ixgbe_tx_ring_t *, link_list_t *); uint32_t ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *); @@ -834,7 +873,6 @@ enum ioc_reply ixgbe_nd_ioctl(ixgbe_t *, queue_t *, mblk_t *, struct iocblk *); */ int ixgbe_init_stats(ixgbe_t *); - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_tx.c b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c index f2a5d8fa0c..721353c756 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_tx.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_tx.c @@ -1,19 +1,17 @@ /* * CDDL HEADER START * - * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * - * You can obtain a copy of the license at: - * http://www.opensolaris.org/os/licensing. + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * - * When using or redistributing this file, you may do so under the - * License only. No other modification of this header is permitted. - * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] @@ -22,15 +20,16 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms of the CDDL. + * Copyright(c) 2007-2008 Intel Corporation. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ #include "ixgbe_sw.h" -static boolean_t ixgbe_tx(ixgbe_tx_ring_t *, mblk_t *); static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *, uint32_t, boolean_t); static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *, @@ -44,7 +43,7 @@ static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *); static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *, ixgbe_tx_context_t *); static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *, - ixgbe_tx_context_t *); + ixgbe_tx_context_t *, int); #ifndef IXGBE_DEBUG #pragma inline(ixgbe_save_desc) @@ -54,65 +53,9 @@ static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *, #endif /* - * ixgbe_m_tx - * - * The GLDv3 interface to call driver's tx routine to transmit - * the mblks. - */ -mblk_t * -ixgbe_m_tx(void *arg, mblk_t *mp) -{ - ixgbe_t *ixgbe = (ixgbe_t *)arg; - mblk_t *next; - ixgbe_tx_ring_t *tx_ring; - - /* - * If the adapter is suspended, or it is not started, or the link - * is not up, the mblks are simply dropped. - */ - if (((ixgbe->ixgbe_state & IXGBE_SUSPENDED) != 0) || - ((ixgbe->ixgbe_state & IXGBE_STARTED) == 0) || - (ixgbe->link_state != LINK_STATE_UP)) { - /* Free the mblk chain */ - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - freemsg(mp); - mp = next; - } - - return (NULL); - } - - /* - * Decide which tx ring is used to transmit the packets. - * This needs to be updated later to fit the new interface - * of the multiple rings support. - */ - tx_ring = &ixgbe->tx_rings[0]; - - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - if (!ixgbe_tx(tx_ring, mp)) { - mp->b_next = next; - break; - } - - mp = next; - } - - return (mp); -} - -/* - * ixgbe_tx - Main transmit processing + * ixgbe_ring_tx * - * Called from ixgbe_m_tx with an mblk ready to transmit. this - * routine sets up the transmit descriptors and sends data to - * the wire. + * To transmit one mblk through one specified ring. * * One mblk can consist of several fragments, each fragment * will be processed with different methods based on the size. @@ -136,9 +79,10 @@ ixgbe_m_tx(void *arg, mblk_t *mp) * be used. After the processing, those tx control blocks will * be put to the work list. */ -static boolean_t -ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) +mblk_t * +ixgbe_ring_tx(void *arg, mblk_t *mp) { + ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg; ixgbe_t *ixgbe = tx_ring->ixgbe; tx_type_t current_flag, next_flag; uint32_t current_len, next_len; @@ -150,11 +94,19 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) tx_control_block_t *tcb; ixgbe_tx_context_t tx_context, *ctx; link_list_t pending_list; + uint32_t len, hdr_frag_len, hdr_len; + uint32_t copy_thresh; + mblk_t *new_mp; + mblk_t *pre_mp; + + ASSERT(mp->b_next == NULL); + + copy_thresh = tx_ring->copy_thresh; /* Get the mblk size */ mbsize = 0; for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - mbsize += MBLK_LEN(nmp); + mbsize += MBLKL(nmp); } if (ixgbe->tx_hcksum_enable) { @@ -166,25 +118,24 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) ctx = &tx_context; if (ixgbe_get_context(mp, ctx) < 0) { freemsg(mp); - return (B_TRUE); + return (NULL); } /* * If the mblk size exceeds the max size ixgbe could - * process, then discard this mblk, and return B_TRUE + * process, then discard this mblk, and return NULL. 
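/*
 * Illustrative sketch, not part of this changeset: ixgbe_ring_tx() follows
 * the per-ring transmit convention visible in the surrounding hunks; it
 * returns NULL when the mblk was consumed (sent or deliberately dropped)
 * and returns the mblk itself when descriptors ran short, so the caller can
 * requeue it and retry after mac_tx_ring_update(). A hypothetical caller,
 * with a simplified mblk type:
 */
#include <stddef.h>

typedef struct mblk {
	struct mblk *b_next;	/* chain linkage, as in the DDI mblk */
} mblk_t;

typedef mblk_t *(*ring_tx_t)(void *ring, mblk_t *mp);

/* Send queued packets until the ring pushes back. */
static void
drain(ring_tx_t tx, void *ring, mblk_t **head)
{
	mblk_t *mp;

	while ((mp = *head) != NULL) {
		*head = mp->b_next;
		mp->b_next = NULL;
		if ((mp = tx(ring, mp)) != NULL) {
			mp->b_next = *head;	/* put it back and stop */
			*head = mp;
			break;
		}
	}
}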
*/ if ((ctx->lso_flag && ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) || (!ctx->lso_flag && (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) { freemsg(mp); IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize"); - return (B_TRUE); + return (NULL); } } else { ctx = NULL; } - /* * Check and recycle tx descriptors. * The recycle threshold here should be selected carefully @@ -194,13 +145,13 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) /* * After the recycling, if the tbd_free is less than the - * overload_threshold, assert overload, return B_FALSE; + * overload_threshold, assert overload, return mp; * and we need to re-schedule the tx again. */ if (tx_ring->tbd_free < tx_ring->overload_thresh) { tx_ring->reschedule = B_TRUE; IXGBE_DEBUG_STAT(tx_ring->stat_overload); - return (B_FALSE); + return (mp); } /* @@ -213,12 +164,77 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) desc_num = 0; desc_total = 0; + /* + * The software should guarantee LSO packet header(MAC+IP+TCP) + * to be within one descriptor. Here we reallocate and refill the + * the header if it's physical memory non-contiguous. + */ + if ((ctx != NULL) && ctx->lso_flag) { + /* find the last fragment of the header */ + len = MBLKL(mp); + ASSERT(len > 0); + nmp = mp; + pre_mp = NULL; + hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len; + while (len < hdr_len) { + pre_mp = nmp; + nmp = nmp->b_cont; + len += MBLKL(nmp); + } + /* + * If the header and the payload are in different mblks, + * we simply force the header to be copied into pre-allocated + * page-aligned buffer. + */ + if (len == hdr_len) + goto adjust_threshold; + + hdr_frag_len = hdr_len - (len - MBLKL(nmp)); + /* + * There are two cases we need to reallocate a mblk for the + * last header fragment: + * 1. the header is in multiple mblks and the last fragment + * share the same mblk with the payload + * 2. the header is in a single mblk shared with the payload + * and the header is physical memory non-contiguous + */ + if ((nmp != mp) || + (P2NPHASE((uintptr_t)nmp->b_rptr, ixgbe->sys_page_size) + < len)) { + IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail); + /* + * reallocate the mblk for the last header fragment, + * expect to bcopy into pre-allocated page-aligned + * buffer + */ + new_mp = allocb(hdr_frag_len, NULL); + if (!new_mp) + return (B_FALSE); + bcopy(nmp->b_rptr, new_mp->b_rptr, hdr_frag_len); + /* link the new header fragment with the other parts */ + new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len; + new_mp->b_cont = nmp; + if (pre_mp) + pre_mp->b_cont = new_mp; + nmp->b_rptr += hdr_frag_len; + if (hdr_frag_len == hdr_len) + mp = new_mp; + } +adjust_threshold: + /* + * adjust the bcopy threshhold to guarantee + * the header to use bcopy way + */ + if (copy_thresh < hdr_len) + copy_thresh = hdr_len; + } + current_mp = mp; - current_len = MBLK_LEN(current_mp); + current_len = MBLKL(current_mp); /* * Decide which method to use for the first fragment */ - current_flag = (current_len <= tx_ring->copy_thresh) ? + current_flag = (current_len <= copy_thresh) ? USE_COPY : USE_DMA; /* * If the mblk includes several contiguous small fragments, @@ -238,7 +254,7 @@ ixgbe_tx(ixgbe_tx_ring_t *tx_ring, mblk_t *mp) while (current_mp) { next_mp = current_mp->b_cont; eop = (next_mp == NULL); /* Last fragment of the packet? */ - next_len = eop ? 0: MBLK_LEN(next_mp); + next_len = eop ? 
 /*
@@ -536,7 +552,9 @@ static int
 ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
 {
	uint32_t start;
-	uint32_t flags;
+	uint32_t hckflags;
+	uint32_t lsoflags;
+	uint32_t mss;
	uint32_t len;
	uint32_t size;
	uint32_t offset;
@@ -548,16 +566,16 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)

	ASSERT(mp != NULL);

-	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &flags);
+	hcksum_retrieve(mp, NULL, NULL, &start, NULL, NULL, NULL, &hckflags);
	bzero(ctx, sizeof (ixgbe_tx_context_t));

-	ctx->hcksum_flags = flags;
-	if (flags == 0)
+	if (hckflags == 0)
		return (0);
+	ctx->hcksum_flags = hckflags;

-	ctx->mss = DB_LSOMSS(mp);
-	ctx->lso_flag = (ctx->hcksum_flags & HW_LSO) &&
-	    (ctx->mss != 0);
+	lso_info_get(mp, &mss, &lsoflags);
+	ctx->mss = mss;
+	ctx->lso_flag = (lsoflags == HW_LSO);

	/*
	 * LSO relies on tx h/w checksum, so we drop the packet here
@@ -582,12 +600,12 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
	 * in one mblk fragment, so we go through the fragments to parse
	 * the ether type.
	 */
-	size = len = MBLK_LEN(mp);
+	size = len = MBLKL(mp);
	offset = offsetof(struct ether_header, ether_type);
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
-		len = MBLK_LEN(mp);
+		len = MBLKL(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;
@@ -601,7 +619,7 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
	while (size <= offset) {
		mp = mp->b_cont;
		ASSERT(mp != NULL);
-		len = MBLK_LEN(mp);
+		len = MBLKL(mp);
		size += len;
	}
	pos = mp->b_rptr + offset + len - size;
@@ -613,25 +631,32 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
	}

	/*
-	 * Here we assume the IP(V6) header is fully included in
+	 * Here we don't assume the IP(V6) header is fully included in
	 * one mblk fragment.
	 */
	switch (etype) {
	case ETHERTYPE_IP:
-		offset = mac_hdr_len;
-		while (size <= offset) {
-			mp = mp->b_cont;
-			ASSERT(mp != NULL);
-			len = MBLK_LEN(mp);
-			size += len;
-		}
-		pos = mp->b_rptr + offset + len - size;
-
-		if (ctx->lso_flag) {
-			*((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
-			    ipha_length))) = 0;
-			*((uint16_t *)(uintptr_t)(pos + offsetof(ipha_t,
-			    ipha_hdr_checksum))) = 0;
+		if (ctx->lso_flag) {
+			offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
+			while (size <= offset) {
+				mp = mp->b_cont;
+				ASSERT(mp != NULL);
+				len = MBLKL(mp);
+				size += len;
+			}
+			pos = mp->b_rptr + offset + len - size;
+			*((uint16_t *)(uintptr_t)(pos)) = 0;
+
+			offset = offsetof(ipha_t, ipha_hdr_checksum) +
+			    mac_hdr_len;
+			while (size <= offset) {
+				mp = mp->b_cont;
+				ASSERT(mp != NULL);
+				len = MBLKL(mp);
+				size += len;
+			}
+			pos = mp->b_rptr + offset + len - size;
+			*((uint16_t *)(uintptr_t)(pos)) = 0;

			/*
			 * To perform ixgbe LSO, here also need to fill
@@ -642,14 +667,23 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
			 */
		}

-		l4_proto = *(uint8_t *)(pos + offsetof(ipha_t, ipha_protocol));
+		offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
+		while (size <= offset) {
+			mp = mp->b_cont;
+			ASSERT(mp != NULL);
+			len = MBLKL(mp);
+			size += len;
+		}
+		pos = mp->b_rptr + offset + len - size;
+
+		l4_proto = *(uint8_t *)pos;
		break;
	case ETHERTYPE_IPV6:
		offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
-			len = MBLK_LEN(mp);
+			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;
@@ -667,7 +701,7 @@ ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
		while (size <= offset) {
			mp = mp->b_cont;
			ASSERT(mp != NULL);
-			len = MBLK_LEN(mp);
+			len = MBLKL(mp);
			size += len;
		}
		pos = mp->b_rptr + offset + len - size;
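ixgbe_get_context() repeats the same idiom for every header field it touches: walk the b_cont chain until the cumulative size covers the field's offset, then compute a pointer into the fragment that holds it. Here is the pattern as a standalone helper sketch (the name msg_byte_at is hypothetical; like the driver code, it assumes the field itself does not straddle two fragments):

    /*
     * Return a pointer to the byte at 'offset' from the start of the
     * message, walking b_cont fragments as needed. The offset must lie
     * within the message.
     */
    static uint8_t *
    msg_byte_at(mblk_t *mp, uint32_t offset)
    {
        uint32_t len = MBLKL(mp);
        uint32_t size = len;

        while (size <= offset) {
            mp = mp->b_cont;
            ASSERT(mp != NULL);
            len = MBLKL(mp);
            size += len;
        }
        return (mp->b_rptr + offset + len - size);
    }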
@@ -702,13 +736,14 @@ ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
		return (B_FALSE);

	/*
-	 * Compare the checksum data retrieved from the mblk and the
-	 * stored checksum data of the last context descriptor. The data
-	 * need to be checked are:
+	 * Compare the context data retrieved from the mblk with the
+	 * stored data of the last context descriptor. The data to be
+	 * checked are:
	 *	hcksum_flags
	 *	l4_proto
	 *	mac_hdr_len
	 *	ip_hdr_len
+	 *	lso_flag
	 *	mss (only checked for LSO)
	 *	l4_hr_len (only checked for LSO)
	 * If any one of the above changes, a new context descriptor
@@ -716,16 +751,14 @@
	 */
	last = &tx_ring->tx_context;

-	if (ctx->hcksum_flags != 0) {
-		if ((ctx->hcksum_flags != last->hcksum_flags) ||
-		    (ctx->l4_proto != last->l4_proto) ||
-		    (ctx->mac_hdr_len != last->mac_hdr_len) ||
-		    (ctx->ip_hdr_len != last->ip_hdr_len) ||
-		    (ctx->lso_flag && ((ctx->mss != last->mss) ||
-		    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
-
-			return (B_TRUE);
-		}
+	if ((ctx->hcksum_flags != last->hcksum_flags) ||
+	    (ctx->l4_proto != last->l4_proto) ||
+	    (ctx->mac_hdr_len != last->mac_hdr_len) ||
+	    (ctx->ip_hdr_len != last->ip_hdr_len) ||
+	    (ctx->lso_flag != last->lso_flag) ||
+	    (ctx->lso_flag && ((ctx->mss != last->mss) ||
+	    (ctx->l4_hdr_len != last->l4_hdr_len)))) {
+		return (B_TRUE);
	}

	return (B_FALSE);
@@ -738,11 +771,11 @@
 */
static void
ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
-    ixgbe_tx_context_t *ctx)
+    ixgbe_tx_context_t *ctx, int ring_index)
{
	/*
	 * Fill the context descriptor with the checksum
-	 * context information we've got
+	 * context information we've got.
	 */
	ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
	ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
@@ -775,12 +808,12 @@ ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
	}

	ctx_tbd->seqnum_seed = 0;
+	ctx_tbd->mss_l4len_idx = ring_index << 4;
+
	if (ctx->lso_flag) {
-		ctx_tbd->mss_l4len_idx =
+		ctx_tbd->mss_l4len_idx |=
		    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
		    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
-	} else {
-		ctx_tbd->mss_l4len_idx = 0;
	}
}

@@ -838,7 +871,7 @@ ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
			 */
			ixgbe_fill_context(
			    (struct ixgbe_adv_tx_context_desc *)tbd,
-			    ctx);
+			    ctx, tx_ring->index);

			index = NEXT_INDEX(index, 1, tx_ring->ring_size);
			desc_num++;
@@ -908,6 +941,14 @@ ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
	 */
	ASSERT(first_tbd != NULL);
	first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;
+	first_tbd->read.olinfo_status |= (tx_ring->index << 4);
+
+	if (ctx != NULL && ctx->lso_flag) {
+		first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
+		first_tbd->read.olinfo_status |=
+		    (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
+		    - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
+	}

	if (ctx != NULL && ctx->lso_flag) {
		first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
@@ -1017,14 +1058,18 @@ ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
	 * The mutex_tryenter() is used to avoid unnecessary
	 * lock contention.
*/ - if (mutex_tryenter(&tx_ring->recycle_lock) == 0) - return (0); + mutex_enter(&tx_ring->recycle_lock); ASSERT(tx_ring->tbd_free <= tx_ring->ring_size); if (tx_ring->tbd_free == tx_ring->ring_size) { tx_ring->recycle_fail = 0; tx_ring->stall_watchdog = 0; + if (tx_ring->reschedule) { + tx_ring->reschedule = B_FALSE; + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); + } mutex_exit(&tx_ring->recycle_lock); return (0); } @@ -1108,6 +1153,12 @@ ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring) */ atomic_add_32(&tx_ring->tbd_free, desc_num); + if ((tx_ring->tbd_free >= tx_ring->resched_thresh) && + (tx_ring->reschedule)) { + tx_ring->reschedule = B_FALSE; + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); + } mutex_exit(&tx_ring->recycle_lock); /* @@ -1152,14 +1203,18 @@ ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring) * The mutex_tryenter() is used to avoid unnecessary * lock contention. */ - if (mutex_tryenter(&tx_ring->recycle_lock) == 0) - return (0); + mutex_enter(&tx_ring->recycle_lock); ASSERT(tx_ring->tbd_free <= tx_ring->ring_size); if (tx_ring->tbd_free == tx_ring->ring_size) { tx_ring->recycle_fail = 0; tx_ring->stall_watchdog = 0; + if (tx_ring->reschedule) { + tx_ring->reschedule = B_FALSE; + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); + } mutex_exit(&tx_ring->recycle_lock); return (0); } @@ -1245,6 +1300,12 @@ ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring) */ atomic_add_32(&tx_ring->tbd_free, desc_num); + if ((tx_ring->tbd_free >= tx_ring->resched_thresh) && + (tx_ring->reschedule)) { + tx_ring->reschedule = B_FALSE; + mac_tx_ring_update(tx_ring->ixgbe->mac_hdl, + tx_ring->ring_handle); + } mutex_exit(&tx_ring->recycle_lock); /* diff --git a/usr/src/uts/common/io/mac/README b/usr/src/uts/common/io/mac/README new file mode 100644 index 0000000000..744c9842c3 --- /dev/null +++ b/usr/src/uts/common/io/mac/README @@ -0,0 +1,80 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# + +This README describes the organization of the files and subdirectories +that make up the misc/mac module. + +Changes to the sources should follow the layout and naming conventions +adopted herein. + +Each functional component of the mac module is implemented in a separate +source file. The external interfaces are declared in header files delivered +under <sys>. The internal data structures and definitions are declared +in header files internal to this directory. + +. Client Interface + This is the kernel programming interface for accessing L2 services as + a consumer. + . mac_client.c + . sys/mac_client.h: APIs intended for external MAC consumers + . 
sys/mac_client_priv.h: APIs for GLDv3 components only (dld, + dls, aggr, vnic, etc). + . mac_client_impl.h Internals. + +. Provider Interface + This is the GLDv3 kernel driver interface. Functions and data structures + are used by L2 drivers to provide services to MAC consumers. + . mac_provider.c + . sys/mac_provider.h + +. MAC Type Plugins + The GLDv3 L2 supports multiple types of media control. Each type is + implemented as a plugin delivered in a separate file under the + plugin/ directory. + Add a new file to the plugin/ directory for introducing a new MAC type. + +. Core Component. + - Scheduling Engine: + . mac_datapath_setup.c: Control path for the scheduler. + . mac_soft_ring.c, + mac_soft_ring.h: Fanout Soft Rings. + . mac_sched.c: Data path + . mac_bcast.c Data path and switching for broadcast and + multicast packets. + . mac_stat.c: Statistics + + - Classification Engine + mac_flow.c: Flows and software classification: + + - NICs Resources Management + . mac.c (this file also has other miscelanea) + +. Misc + . mac.c + . mac_util.c + . mac_ndd.c + diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index a7c472bfb2..1ee6d36cd6 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -24,9 +24,246 @@ * Use is subject to license terms. */ - /* * MAC Services Module + * + * The GLDv3 framework locking - The MAC layer + * -------------------------------------------- + * + * The MAC layer is central to the GLD framework and can provide the locking + * framework needed for itself and for the use of MAC clients. MAC end points + * are fairly disjoint and don't share a lot of state. So a coarse grained + * multi-threading scheme is to single thread all create/modify/delete or set + * type of control operations on a per mac end point while allowing data threads + * concurrently. + * + * Control operations (set) that modify a mac end point are always serialized on + * a per mac end point basis, We have at most 1 such thread per mac end point + * at a time. + * + * All other operations that are not serialized are essentially multi-threaded. + * For example a control operation (get) like getting statistics which may not + * care about reading values atomically or data threads sending or receiving + * data. Mostly these type of operations don't modify the control state. Any + * state these operations care about are protected using traditional locks. + * + * The perimeter only serializes serial operations. It does not imply there + * aren't any other concurrent operations. However a serialized operation may + * sometimes need to make sure it is the only thread. In this case it needs + * to use reference counting mechanisms to cv_wait until any current data + * threads are done. + * + * The mac layer itself does not hold any locks across a call to another layer. + * The perimeter is however held across a down call to the driver to make the + * whole control operation atomic with respect to other control operations. + * Also the data path and get type control operations may proceed concurrently. + * These operations synchronize with the single serial operation on a given mac + * end point using regular locks. 
+ * The perimeter ensures that conflicting operations, such as a
+ * mac_multicast_add and a mac_multicast_remove on the same mac end point,
+ * don't interfere with each other. It also ensures that the changes in the
+ * mac layer and the call to the underlying driver to, say, add a multicast
+ * address are done atomically, without interference from a thread trying to
+ * delete the same address.
+ *
+ * For example, consider
+ * mac_multicst_add()
+ * {
+ *	mac_perimeter_enter();		serialize all control operations
+ *
+ *	grab list lock			protect against access by data threads
+ *	add to list
+ *	drop list lock
+ *
+ *	call driver's mi_multicst
+ *
+ *	mac_perimeter_exit();
+ * }
+ *
+ * To lessen the number of serialization locks and simplify the lock
+ * hierarchy, we serialize all the control operations on a per mac end point
+ * basis, using a single serialization lock called the perimeter. We allow
+ * recursive entry into the perimeter to facilitate use of this mechanism by
+ * both the mac client and the MAC layer itself.
+ *
+ * MAC client means an entity that does an operation on a mac handle
+ * obtained from a mac_open/mac_client_open. Similarly MAC driver means
+ * an entity that does an operation on a mac handle obtained from a
+ * mac_register. An entity could be both client and driver but on different
+ * handles (e.g. aggr) and should only make the corresponding mac interface
+ * calls, i.e. mac driver interface or mac client interface, as appropriate
+ * for that mac handle.
+ *
+ * General rules.
+ * -------------
+ *
+ * R1. The lock order of upcall threads is naturally opposite to downcall
+ * threads. Hence upcalls must not hold any locks across layers for fear of
+ * recursive lock enter and lock order violation. This applies to all layers.
+ *
+ * R2. The perimeter is just another lock. Since it is held in the down
+ * direction, acquiring the perimeter in an upcall is prohibited as it would
+ * cause a deadlock. This applies to all layers.
+ *
+ * Note that upcalls that need to grab the mac perimeter (for example
+ * mac_notify upcalls) can still achieve that by posting the request to a
+ * thread, which can then grab all the required perimeters and locks in the
+ * right global order. Note that in the above example the mac layer itself
+ * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
+ * to the client must do that. Please see the aggr code for an example.
+ *
+ * MAC client rules
+ * ----------------
+ *
+ * R3. A MAC client may use the MAC provided perimeter facility to serialize
+ * control operations on a per mac end point basis. It does this by acquiring
+ * and holding the perimeter across a sequence of calls to the mac layer.
+ * This ensures atomicity across the entire block of mac calls. In this
+ * model the MAC client must not hold any client locks across the calls to
+ * the mac layer. This model is the preferred solution.
+ *
+ * R4. However if a MAC client has a lot of global state across all mac end
+ * points the per mac end point serialization may not be sufficient. In this
+ * case the client may choose to use global locks or use its own
+ * serialization. To avoid deadlocks, these client layer locks held across
+ * the mac calls in the control path must never be acquired by the data path
+ * for the reason mentioned below.
+ *
+ * (Assume that a control operation that holds a client lock blocks in the
+ * mac layer waiting for upcall reference counts to drop to zero. If an
+ * upcall data thread that holds this reference count subsequently tries to
+ * acquire the same client lock, it will deadlock.)
+ *
+ * A MAC client may follow either the R3 model or the R4 model, but can't
+ * mix both. In the former, the hierarchy is Perim -> client locks, but in
+ * the latter it is client locks -> Perim.
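+ *
+ * As a sketch of the R3 model (illustrative only; arguments and error
+ * handling are elided), a client could bracket a sequence of control
+ * operations with the perimeter interfaces provided below:
+ *
+ *	mac_perim_handle_t mph;
+ *
+ *	mac_perim_enter_by_mh(mh, &mph);   serialize with other control ops
+ *	(void) mac_unicast_remove(...);	   no client locks held across
+ *	(void) mac_unicast_add(...);	   these mac calls
+ *	mac_perim_exit(mph);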
+ *
+ * R5. MAC clients must make MAC calls (excluding data calls) in a
+ * cv_wait'able context since they may block while trying to acquire the
+ * perimeter. In addition some calls may block waiting for upcall refcnts
+ * to come down to zero.
+ *
+ * R6. MAC clients must make sure that they are single threaded and all
+ * threads from the top (in particular data threads) have finished before
+ * calling mac_client_close. The MAC framework does not track the number of
+ * client threads using the mac client handle. Also mac clients must make
+ * sure they have undone all the control operations before calling
+ * mac_client_close. For example mac_unicast_remove/mac_multicast_remove to
+ * undo the corresponding mac_unicast_add/mac_multicast_add.
+ *
+ * MAC framework rules
+ * -------------------
+ *
+ * R7. The mac layer itself must not hold any mac layer locks (except the
+ * mac perimeter) across a call to any other layer from the mac layer. The
+ * call to any other layer could be via mi_* entry points, classifier entry
+ * points into the driver or via upcall pointers into layers above. The mac
+ * perimeter may be acquired or held only in the down direction, e.g. when
+ * calling into an mi_* driver entry point to provide atomicity of the
+ * operation.
+ *
+ * R8. Since it is not guaranteed (see R14) that drivers won't hold locks
+ * across mac driver interfaces, the MAC layer must provide a cut out for
+ * control interfaces like upcall notifications and start them in a separate
+ * thread.
+ *
+ * R9. Note that locking order also implies a plumbing order. For example
+ * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
+ * to plumb in any other order must be failed at mac_open time, otherwise it
+ * could lead to deadlocks due to inverse locking order.
+ *
+ * R10. MAC driver interfaces must not block since the driver could call
+ * them in interrupt context.
+ *
+ * R11. Walkers must preferably not hold any locks while calling walker
+ * callbacks. Instead these can operate on reference counts. In simple
+ * callbacks it may be ok to hold a lock and call the callbacks, but this is
+ * harder to maintain in the general case of arbitrary callbacks.
+ *
+ * R12. The MAC layer must protect upcall notification callbacks using
+ * reference counts rather than holding locks across the callbacks.
+ *
+ * R13. Given the variety of drivers, it is preferable if the MAC layer can
+ * make sure that any pointers (such as mac ring pointers) it passes to the
+ * driver remain valid until mac unregister time. Currently the mac layer
+ * achieves this by using generation numbers for rings and freeing the mac
+ * rings only at unregister time. The MAC layer must provide a layer of
+ * indirection and must not expose underlying driver rings or driver data
+ * structures/pointers directly to MAC clients.
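+ *
+ * As an illustration of the cut out required by R8, the notification path
+ * merely records the event and wakes a dedicated thread (a simplified
+ * sketch of what i_mac_notify() below does):
+ *
+ *	i_mac_notify(mip, type)
+ *	{
+ *		mutex_enter(notify lock);
+ *		mi_notify_bits |= (1 << type);	record the event
+ *		cv_broadcast(notify cv);	wake the notify thread
+ *		mutex_exit(notify lock);
+ *	}
+ *
+ * The notify thread then walks the callback list and makes the upcalls with
+ * no mac locks held, so a blocking callback cannot stall or deadlock a
+ * driver downcall.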
+ *
+ * MAC driver rules
+ * ----------------
+ *
+ * R14. It would be preferable if MAC drivers don't hold any locks across
+ * any mac call. However at a minimum they must not hold any locks across
+ * data upcalls. They must also make sure that all references to mac data
+ * structures are cleaned up and that the driver is single threaded at
+ * mac_unregister time.
+ *
+ * R15. MAC driver interfaces don't block and so the action may be done
+ * asynchronously in a separate thread as for example handling notifications.
+ * The driver must not assume that the action is complete when the call
+ * returns.
+ *
+ * R16. Drivers must maintain a generation number per Rx ring, and pass it
+ * back to mac_rx_ring(); They are expected to increment the generation
+ * number whenever the ring's stop routine is invoked.
+ * See comments in mac_rx_ring();
+ *
+ * R17. Similarly mi_stop is another synchronization point and the driver
+ * must ensure that all upcalls are done and there won't be any future
+ * upcall before returning from mi_stop.
+ *
+ * R18. The driver may assume that all set/modify control operations via
+ * the mi_* entry points are single threaded on a per mac end point basis.
+ *
+ * Lock and Perimeter hierarchy scenarios
+ * ---------------------------------------
+ *
+ * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
+ *
+ * ft_lock -> fe_lock [mac_flow_lookup]
+ *
+ * mi_rw_lock -> fe_lock [mac_bcast_send]
+ *
+ * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
+ *
+ * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
+ *
+ * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
+ *
+ * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
+ * client to driver. In the case of clients that explicitly use the mac
+ * provided perimeter mechanism for their serialization, the hierarchy is
+ * Perimeter -> mac layer locks, since the client never holds any locks
+ * across the mac calls. In the case of clients that use their own locks the
+ * hierarchy is Client locks -> Mac Perim -> Mac layer locks. The client
+ * never explicitly calls mac_perim_enter/exit in this case.
+ *
+ * Subflow creation rules
+ * ---------------------------
+ * o If a user-specified cpulist is present on both the underlying link and
+ *   its flows, each flow's cpulist must be a subset of the underlying
+ *   link's.
+ * o If a user-specified fanout mode is present on both link and flow, the
+ *   subflow fanout count has to be less than or equal to that of the
+ *   underlying link. The cpu-bindings for the subflows will be a subset of
+ *   the underlying link's.
+ * o If no cpulist is specified on either the underlying link or the flow,
+ *   the underlying link relies on a MAC tunable to provide out of the box
+ *   fanout. The subflow will have no cpulist (the subflow will be unbound).
+ * o If no cpulist is specified on the underlying link, a subflow can carry
+ *   either a user-specified cpulist or a fanout count. The cpu-bindings for
+ *   the subflow need not then be a subset of the underlying link's.
+ * o If the underlying link carries either a user-specified cpulist or
+ *   fanout mode and the subflow specifies neither, the subflow will be
+ *   created unbound.
+ * o While creating unbound subflows, bandwidth mode changes attempt to
+ *   figure out a right fanout count. In such cases the fanout count will
+ *   override the unbound cpu-binding behavior.
+ * o In addition to this, while cycling between flow and link properties, we
+ *   impose a restriction that if a link property has a subflow with
+ *   user-specified attributes, we will not allow changing the link property.
+ * The administrator needs to reset all the user specified properties for the + * subflows before attempting a link property change. + * Some of the above rules can be overridden by specifying additional command + * line options while creating or modifying link or subflow properties. */ #include <sys/types.h> @@ -39,11 +276,13 @@ #include <sys/strsun.h> #include <sys/strsubr.h> #include <sys/dlpi.h> -#include <sys/dls.h> #include <sys/modhash.h> -#include <sys/vlan.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_soft_ring.h> #include <sys/mac_impl.h> +#include <sys/mac.h> +#include <sys/dls.h> #include <sys/dld.h> #include <sys/modctl.h> #include <sys/fs/dv_node.h> @@ -52,20 +291,45 @@ #include <sys/callb.h> #include <sys/cpuvar.h> #include <sys/atomic.h> +#include <sys/bitmap.h> +#include <sys/sdt.h> +#include <sys/mac_flow.h> +#include <sys/ddi_intr_impl.h> +#include <sys/disp.h> #include <sys/sdt.h> +#include <sys/vnic.h> +#include <sys/vnic_impl.h> +#include <sys/vlan.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <sys/exacct.h> +#include <sys/exacct_impl.h> #include <inet/nd.h> #include <sys/ethernet.h> #define IMPL_HASHSZ 67 /* prime */ -static kmem_cache_t *i_mac_impl_cachep; -static mod_hash_t *i_mac_impl_hash; +kmem_cache_t *i_mac_impl_cachep; +mod_hash_t *i_mac_impl_hash; krwlock_t i_mac_impl_lock; uint_t i_mac_impl_count; -static kmem_cache_t *mac_vnic_tx_cache; +static kmem_cache_t *mac_ring_cache; static id_space_t *minor_ids; static uint32_t minor_count; +/* + * Logging stuff. Perhaps mac_logging_interval could be broken into + * mac_flow_log_interval and mac_link_log_interval if we want to be + * able to schedule them differently. + */ +uint_t mac_logging_interval; +boolean_t mac_flow_log_enable; +boolean_t mac_link_log_enable; +timeout_id_t mac_logging_timer; + +/* for debugging, see MAC_DBG_PRT() in mac_impl.h */ +int mac_dbg = 0; + #define MACTYPE_KMODDIR "mac" #define MACTYPE_HASHSZ 67 static mod_hash_t *i_mactype_hash; @@ -75,295 +339,75 @@ static mod_hash_t *i_mactype_hash; */ static kmutex_t i_mactype_lock; -static void i_mac_notify_thread(void *); -static mblk_t *mac_vnic_tx(void *, mblk_t *); -static mblk_t *mac_vnic_txloop(void *, mblk_t *); -static void mac_register_priv_prop(mac_impl_t *, mac_priv_prop_t *, uint_t); -static void mac_unregister_priv_prop(mac_impl_t *); - /* - * Private functions. + * mac_tx_percpu_cnt + * + * Number of per cpu locks per mac_client_impl_t. Used by the transmit side + * in mac_tx to reduce lock contention. This is sized at boot time in mac_init. + * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2. + * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1. 
*/ - -/*ARGSUSED*/ -static int -i_mac_constructor(void *buf, void *arg, int kmflag) -{ - mac_impl_t *mip = buf; - - bzero(buf, sizeof (mac_impl_t)); - - mip->mi_linkstate = LINK_STATE_UNKNOWN; - - rw_init(&mip->mi_state_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_gen_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_data_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_notify_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_rx_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_tx_lock, NULL, RW_DRIVER, NULL); - rw_init(&mip->mi_resource_lock, NULL, RW_DRIVER, NULL); - mutex_init(&mip->mi_activelink_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&mip->mi_notify_bits_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&mip->mi_notify_cv, NULL, CV_DRIVER, NULL); - mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&mip->mi_rx_cv, NULL, CV_DRIVER, NULL); - return (0); -} - -/*ARGSUSED*/ -static void -i_mac_destructor(void *buf, void *arg) -{ - mac_impl_t *mip = buf; - - ASSERT(mip->mi_ref == 0); - ASSERT(!mip->mi_exclusive); - ASSERT(mip->mi_active == 0); - ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN); - ASSERT(mip->mi_devpromisc == 0); - ASSERT(mip->mi_promisc == 0); - ASSERT(mip->mi_mmap == NULL); - ASSERT(mip->mi_mmrp == NULL); - ASSERT(mip->mi_mnfp == NULL); - ASSERT(mip->mi_resource_add == NULL); - ASSERT(mip->mi_ksp == NULL); - ASSERT(mip->mi_kstat_count == 0); - ASSERT(mip->mi_notify_bits == 0); - ASSERT(mip->mi_notify_thread == NULL); - - rw_destroy(&mip->mi_gen_lock); - rw_destroy(&mip->mi_state_lock); - rw_destroy(&mip->mi_data_lock); - rw_destroy(&mip->mi_notify_lock); - rw_destroy(&mip->mi_rx_lock); - rw_destroy(&mip->mi_tx_lock); - rw_destroy(&mip->mi_resource_lock); - mutex_destroy(&mip->mi_activelink_lock); - mutex_destroy(&mip->mi_notify_bits_lock); - cv_destroy(&mip->mi_notify_cv); - mutex_destroy(&mip->mi_lock); - cv_destroy(&mip->mi_rx_cv); -} +int mac_tx_percpu_cnt; +int mac_tx_percpu_cnt_max = 128; + +static int i_mac_constructor(void *, void *, int); +static void i_mac_destructor(void *, void *); +static int i_mac_ring_ctor(void *, void *, int); +static void i_mac_ring_dtor(void *, void *); +static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *); +void mac_tx_client_flush(mac_client_impl_t *); +void mac_tx_client_block(mac_client_impl_t *); +static void mac_rx_ring_quiesce(mac_ring_t *, uint_t); +static int mac_start_group_and_rings(mac_group_t *); +static void mac_stop_group_and_rings(mac_group_t *); /* - * mac_vnic_tx_t kmem cache support functions. + * Module initialization functions. */ -/* ARGSUSED */ -static int -i_mac_vnic_tx_ctor(void *buf, void *arg, int mkflag) -{ - mac_vnic_tx_t *vnic_tx = buf; - - bzero(buf, sizeof (mac_vnic_tx_t)); - mutex_init(&vnic_tx->mv_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&vnic_tx->mv_cv, NULL, CV_DRIVER, NULL); - return (0); -} - -/* ARGSUSED */ -static void -i_mac_vnic_tx_dtor(void *buf, void *arg) -{ - mac_vnic_tx_t *vnic_tx = buf; - - ASSERT(vnic_tx->mv_refs == 0); - mutex_destroy(&vnic_tx->mv_lock); - cv_destroy(&vnic_tx->mv_cv); -} - -static void -i_mac_notify(mac_impl_t *mip, mac_notify_type_t type) +void +mac_init(void) { - rw_enter(&i_mac_impl_lock, RW_READER); - if (mip->mi_disabled) - goto exit; - - /* - * Guard against incorrect notifications. (Running a newer - * mac client against an older implementation?) - */ - if (type >= MAC_NNOTE) - goto exit; + mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? 
max_ncpus : + boot_max_ncpus); - mutex_enter(&mip->mi_notify_bits_lock); - mip->mi_notify_bits |= (1 << type); - cv_broadcast(&mip->mi_notify_cv); - mutex_exit(&mip->mi_notify_bits_lock); + /* Upper bound is mac_tx_percpu_cnt_max */ + if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max) + mac_tx_percpu_cnt = mac_tx_percpu_cnt_max; -exit: - rw_exit(&i_mac_impl_lock); -} + if (mac_tx_percpu_cnt < 1) { + /* Someone set max_tx_percpu_cnt_max to 0 or less */ + mac_tx_percpu_cnt = 1; + } -static void -i_mac_log_link_state(mac_impl_t *mip) -{ + ASSERT(mac_tx_percpu_cnt >= 1); + mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1)); /* - * If no change, then it is not interesting. + * Make it of the form 2**N - 1 in the range + * [0 .. mac_tx_percpu_cnt_max - 1] */ - if (mip->mi_lastlinkstate == mip->mi_linkstate) - return; - - switch (mip->mi_linkstate) { - case LINK_STATE_UP: - if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) { - char det[200]; - - mip->mi_type->mt_ops.mtops_link_details(det, - sizeof (det), (mac_handle_t)mip, mip->mi_pdata); - - cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det); - } else { - cmn_err(CE_NOTE, "!%s link up", mip->mi_name); - } - break; - - case LINK_STATE_DOWN: - /* - * Only transitions from UP to DOWN are interesting - */ - if (mip->mi_lastlinkstate != LINK_STATE_UNKNOWN) - cmn_err(CE_NOTE, "!%s link down", mip->mi_name); - break; - - case LINK_STATE_UNKNOWN: - /* - * This case is normally not interesting. - */ - break; - } - mip->mi_lastlinkstate = mip->mi_linkstate; -} - -static void -i_mac_notify_thread(void *arg) -{ - mac_impl_t *mip = arg; - callb_cpr_t cprinfo; - - CALLB_CPR_INIT(&cprinfo, &mip->mi_notify_bits_lock, callb_generic_cpr, - "i_mac_notify_thread"); - - mutex_enter(&mip->mi_notify_bits_lock); - for (;;) { - uint32_t bits; - uint32_t type; - - bits = mip->mi_notify_bits; - if (bits == 0) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(&mip->mi_notify_cv, &mip->mi_notify_bits_lock); - CALLB_CPR_SAFE_END(&cprinfo, &mip->mi_notify_bits_lock); - continue; - } - mip->mi_notify_bits = 0; - - if ((bits & (1 << MAC_NNOTE)) != 0) { - /* request to quit */ - ASSERT(mip->mi_disabled); - break; - } - - mutex_exit(&mip->mi_notify_bits_lock); - - /* - * Log link changes. - */ - if ((bits & (1 << MAC_NOTE_LINK)) != 0) - i_mac_log_link_state(mip); - - /* - * Do notification callbacks for each notification type. - */ - for (type = 0; type < MAC_NNOTE; type++) { - mac_notify_fn_t *mnfp; - - if ((bits & (1 << type)) == 0) { - continue; - } - - /* - * Walk the list of notifications. - */ - rw_enter(&mip->mi_notify_lock, RW_READER); - for (mnfp = mip->mi_mnfp; mnfp != NULL; - mnfp = mnfp->mnf_nextp) { - - mnfp->mnf_fn(mnfp->mnf_arg, type); - } - rw_exit(&mip->mi_notify_lock); - } - - mutex_enter(&mip->mi_notify_bits_lock); - } - - mip->mi_notify_thread = NULL; - cv_broadcast(&mip->mi_notify_cv); - - CALLB_CPR_EXIT(&cprinfo); - - thread_exit(); -} - -static mactype_t * -i_mactype_getplugin(const char *pname) -{ - mactype_t *mtype = NULL; - boolean_t tried_modload = B_FALSE; - - mutex_enter(&i_mactype_lock); + mac_tx_percpu_cnt--; -find_registered_mactype: - if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname, - (mod_hash_val_t *)&mtype) != 0) { - if (!tried_modload) { - /* - * If the plugin has not yet been loaded, then - * attempt to load it now. If modload() succeeds, - * the plugin should have registered using - * mactype_register(), in which case we can go back - * and attempt to find it again. 
- */ - if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) { - tried_modload = B_TRUE; - goto find_registered_mactype; - } - } - } else { - /* - * Note that there's no danger that the plugin we've loaded - * could be unloaded between the modload() step and the - * reference count bump here, as we're holding - * i_mactype_lock, which mactype_unregister() also holds. - */ - atomic_inc_32(&mtype->mt_ref); - } - - mutex_exit(&i_mactype_lock); - return (mtype); -} - -/* - * Module initialization functions. - */ - -void -mac_init(void) -{ i_mac_impl_cachep = kmem_cache_create("mac_impl_cache", sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor, NULL, NULL, NULL, 0); ASSERT(i_mac_impl_cachep != NULL); - mac_vnic_tx_cache = kmem_cache_create("mac_vnic_tx_cache", - sizeof (mac_vnic_tx_t), 0, i_mac_vnic_tx_ctor, i_mac_vnic_tx_dtor, - NULL, NULL, NULL, 0); - ASSERT(mac_vnic_tx_cache != NULL); + mac_ring_cache = kmem_cache_create("mac_ring_cache", + sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL, + NULL, NULL, 0); + ASSERT(mac_ring_cache != NULL); i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash", IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL); + + mac_flow_init(); + mac_soft_ring_init(); + mac_bcast_init(); + mac_client_init(); + i_mac_impl_count = 0; i_mactype_hash = mod_hash_create_extended("mactype_hash", @@ -380,6 +424,12 @@ mac_init(void) minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1, MAXMIN32); ASSERT(minor_ids != NULL); minor_count = 0; + + /* Let's default to 20 seconds */ + mac_logging_interval = 20; + mac_flow_log_enable = B_FALSE; + mac_link_log_enable = B_FALSE; + mac_logging_timer = 0; } int @@ -389,567 +439,701 @@ mac_fini(void) return (EBUSY); id_space_destroy(minor_ids); + mac_flow_fini(); mod_hash_destroy_hash(i_mac_impl_hash); rw_destroy(&i_mac_impl_lock); - kmem_cache_destroy(i_mac_impl_cachep); - kmem_cache_destroy(mac_vnic_tx_cache); + mac_client_fini(); + kmem_cache_destroy(mac_ring_cache); mod_hash_destroy_hash(i_mactype_hash); + mac_soft_ring_finish(); return (0); } -/* - * Client functions. - */ - -static int -mac_hold(const char *macname, mac_impl_t **pmip) +void +mac_init_ops(struct dev_ops *ops, const char *name) { - mac_impl_t *mip; - int err; - - /* - * Check the device name length to make sure it won't overflow our - * buffer. - */ - if (strlen(macname) >= MAXNAMELEN) - return (EINVAL); - - /* - * Look up its entry in the global hash table. 
- */ - rw_enter(&i_mac_impl_lock, RW_WRITER); - err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname, - (mod_hash_val_t *)&mip); + dld_init_ops(ops, name); +} - if (err != 0) { - rw_exit(&i_mac_impl_lock); - return (ENOENT); - } +void +mac_fini_ops(struct dev_ops *ops) +{ + dld_fini_ops(ops); +} - if (mip->mi_disabled) { - rw_exit(&i_mac_impl_lock); - return (ENOENT); - } +/*ARGSUSED*/ +static int +i_mac_constructor(void *buf, void *arg, int kmflag) +{ + mac_impl_t *mip = buf; - if (mip->mi_exclusive) { - rw_exit(&i_mac_impl_lock); - return (EBUSY); - } + bzero(buf, sizeof (mac_impl_t)); - mip->mi_ref++; - rw_exit(&i_mac_impl_lock); + mip->mi_linkstate = LINK_STATE_UNKNOWN; + mip->mi_nclients = 0; - *pmip = mip; + mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL); + rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL); + mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL); + + mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock; + cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL); + mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock; + cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL); return (0); } +/*ARGSUSED*/ static void -mac_rele(mac_impl_t *mip) +i_mac_destructor(void *buf, void *arg) { - rw_enter(&i_mac_impl_lock, RW_WRITER); - ASSERT(mip->mi_ref != 0); - if (--mip->mi_ref == 0) - ASSERT(!mip->mi_activelink); - rw_exit(&i_mac_impl_lock); -} + mac_impl_t *mip = buf; + mac_cb_info_t *mcbi; -int -mac_hold_exclusive(mac_handle_t mh) -{ - mac_impl_t *mip = (mac_impl_t *)mh; + ASSERT(mip->mi_ref == 0); + ASSERT(mip->mi_active == 0); + ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN); + ASSERT(mip->mi_devpromisc == 0); + ASSERT(mip->mi_promisc == 0); + ASSERT(mip->mi_ksp == NULL); + ASSERT(mip->mi_kstat_count == 0); + ASSERT(mip->mi_nclients == 0); + ASSERT(mip->mi_nactiveclients == 0); + ASSERT(mip->mi_state_flags == 0); + ASSERT(mip->mi_factory_addr == NULL); + ASSERT(mip->mi_factory_addr_num == 0); + ASSERT(mip->mi_default_tx_ring == NULL); + + mcbi = &mip->mi_notify_cb_info; + ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0); + ASSERT(mip->mi_notify_bits == 0); + ASSERT(mip->mi_notify_thread == NULL); + ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock); + mcbi->mcbi_lockp = NULL; - /* - * Look up its entry in the global hash table. 
- */ - rw_enter(&i_mac_impl_lock, RW_WRITER); - if (mip->mi_disabled) { - rw_exit(&i_mac_impl_lock); - return (ENOENT); - } + mcbi = &mip->mi_promisc_cb_info; + ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL); + ASSERT(mip->mi_promisc_list == NULL); + ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock); + mcbi->mcbi_lockp = NULL; - if (mip->mi_ref != 0) { - rw_exit(&i_mac_impl_lock); - return (EBUSY); - } + ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL); + ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0); - ASSERT(!mip->mi_exclusive); + mutex_destroy(&mip->mi_lock); + rw_destroy(&mip->mi_rw_lock); - mip->mi_ref++; - mip->mi_exclusive = B_TRUE; - rw_exit(&i_mac_impl_lock); + mutex_destroy(&mip->mi_promisc_lock); + cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv); + mutex_destroy(&mip->mi_notify_lock); + cv_destroy(&mip->mi_notify_cb_info.mcbi_cv); + mutex_destroy(&mip->mi_ring_lock); +} + +/* ARGSUSED */ +static int +i_mac_ring_ctor(void *buf, void *arg, int kmflag) +{ + mac_ring_t *ring = (mac_ring_t *)buf; + + bzero(ring, sizeof (mac_ring_t)); + cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL); + ring->mr_state = MR_FREE; return (0); } +/* ARGSUSED */ +static void +i_mac_ring_dtor(void *buf, void *arg) +{ + mac_ring_t *ring = (mac_ring_t *)buf; + + cv_destroy(&ring->mr_cv); + mutex_destroy(&ring->mr_lock); +} + +/* + * Common functions to do mac callback addition and deletion. Currently this is + * used by promisc callbacks and notify callbacks. List addition and deletion + * need to take care of list walkers. List walkers in general, can't hold list + * locks and make upcall callbacks due to potential lock order and recursive + * reentry issues. Instead list walkers increment the list walker count to mark + * the presence of a walker thread. Addition can be carefully done to ensure + * that the list walker always sees either the old list or the new list. + * However the deletion can't be done while the walker is active, instead the + * deleting thread simply marks the entry as logically deleted. The last walker + * physically deletes and frees up the logically deleted entries when the walk + * is complete. + */ void -mac_rele_exclusive(mac_handle_t mh) +mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head, + mac_cb_t *mcb_elem) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_cb_t *p; + mac_cb_t **pp; + + /* Verify it is not already in the list */ + for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) { + if (p == mcb_elem) + break; + } + VERIFY(p == NULL); /* - * Look up its entry in the global hash table. + * Add it to the head of the callback list. The membar ensures that + * the following list pointer manipulations reach global visibility + * in exactly the program order below. */ - rw_enter(&i_mac_impl_lock, RW_WRITER); - ASSERT(mip->mi_ref == 1 && mip->mi_exclusive); - mip->mi_ref--; - mip->mi_exclusive = B_FALSE; - rw_exit(&i_mac_impl_lock); + ASSERT(MUTEX_HELD(mcbi->mcbi_lockp)); + + mcb_elem->mcb_nextp = *mcb_head; + membar_producer(); + *mcb_head = mcb_elem; } -int -mac_open(const char *macname, mac_handle_t *mhp) +/* + * Mark the entry as logically deleted. If there aren't any walkers unlink + * from the list. In either case return the corresponding status. 
+ */
+boolean_t
+mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
+    mac_cb_t *mcb_elem)
+{
+	mac_cb_t	*p;
+	mac_cb_t	**pp;
+
+	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+	/*
+	 * Search the callback list for the entry to be removed
+	 */
+	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
+		if (p == mcb_elem)
+			break;
+	}
+	VERIFY(p != NULL);
+
+	/*
+	 * If there are walkers just mark it as deleted and the last walker
+	 * will remove it from the list and free it.
+	 */
+	if (mcbi->mcbi_walker_cnt != 0) {
+		p->mcb_flags |= MCB_CONDEMNED;
+		mcbi->mcbi_del_cnt++;
+		return (B_FALSE);
+	}
+
+	ASSERT(mcbi->mcbi_del_cnt == 0);
+	*pp = p->mcb_nextp;
+	p->mcb_nextp = NULL;
+	return (B_TRUE);
+}
+
+/*
+ * Wait for all pending callback removals to be completed
+ */
+void
+mac_callback_remove_wait(mac_cb_info_t *mcbi)
+{
+	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+	while (mcbi->mcbi_del_cnt != 0) {
+		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
+		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
+	}
+}
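The cooperation between removers and walkers may be easier to see end to end. The following is an illustrative sketch (the function name example_walk is hypothetical) of how a walker is expected to use the pieces defined here: the walker count pins entries in place during the walk, and the last walker out performs the physical cleanup of condemned entries.

    /*
     * Sketch of a callback-list walker. Addition is ordered with
     * membar_producer(), and entries are never unlinked while a walker
     * is present, so the traversal itself needs no list lock.
     */
    static void
    example_walk(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
    {
        mac_cb_t *mcb;

        mutex_enter(mcbi->mcbi_lockp);
        mcbi->mcbi_walker_cnt++;            /* pin the list entries */
        mutex_exit(mcbi->mcbi_lockp);

        for (mcb = *mcb_head; mcb != NULL; mcb = mcb->mcb_nextp) {
            if (!(mcb->mcb_flags & MCB_CONDEMNED)) {
                /* make the upcall here, with no list lock held */
            }
        }

        mutex_enter(mcbi->mcbi_lockp);
        if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) {
            /* last walker: unlink and free the condemned entries */
            mac_callback_free(mac_callback_walker_cleanup(mcbi, mcb_head));
            cv_broadcast(&mcbi->mcbi_cv);   /* wake remove_wait()ers */
        }
        mutex_exit(mcbi->mcbi_lockp);
    }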
+/*
+ * The last mac callback walker does the cleanup. Walk the list and unlink
+ * all the logically deleted entries and construct a temporary list of
+ * removed entries. Return the list of removed entries to the caller.
+ */
+mac_cb_t *
+mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
+{
+	mac_cb_t	*p;
+	mac_cb_t	**pp;
+	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
+	int	cnt = 0;
+
+	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
+	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
+
+	pp = mcb_head;
+	while (*pp != NULL) {
+		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
+			p = *pp;
+			*pp = p->mcb_nextp;
+			p->mcb_nextp = rmlist;
+			rmlist = p;
+			cnt++;
+			continue;
+		}
+		pp = &(*pp)->mcb_nextp;
+	}
+
+	ASSERT(mcbi->mcbi_del_cnt == cnt);
+	mcbi->mcbi_del_cnt = 0;
+	return (rmlist);
+}
+
+boolean_t
+mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
+{
+	mac_cb_t	*mcb;
+
+	/* Check whether the element is in the list */
+	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
+		if (mcb == mcb_elem)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+boolean_t
+mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
+{
+	boolean_t	found;
+
+	mutex_enter(mcbi->mcbi_lockp);
+	found = mac_callback_lookup(mcb_headp, mcb_elem);
+	mutex_exit(mcbi->mcbi_lockp);
+
+	return (found);
+}
+
+/* Free the list of removed callbacks */
+void
+mac_callback_free(mac_cb_t *rmlist)
+{
+	mac_cb_t	*mcb;
+	mac_cb_t	*mcb_next;
+
+	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
+		mcb_next = mcb->mcb_nextp;
+		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
+	}
+}
+
+/*
+ * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
+ * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
+ * is only a single shared total walker count, and an entry can't be physically
+ * unlinked if a walker is active on either list. The last walker does this
+ * cleanup of logically deleted entries.
+ */
+void
+i_mac_promisc_walker_cleanup(mac_impl_t *mip)
+{
+	mac_cb_t	*rmlist;
+	mac_cb_t	*mcb;
+	mac_cb_t	*mcb_next;
+	mac_promisc_impl_t	*mpip;
+
+	/*
+	 * Construct a temporary list of deleted callbacks by walking the
+	 * mi_promisc_list. Then for each entry in the temporary list,
+	 * remove it from the mci_promisc_list and free the entry.
+ */ + rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info, + &mip->mi_promisc_list); + + for (mcb = rmlist; mcb != NULL; mcb = mcb_next) { + mcb_next = mcb->mcb_nextp; + mpip = (mac_promisc_impl_t *)mcb->mcb_objp; + VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info, + &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link)); + mcb->mcb_flags = 0; + mcb->mcb_nextp = NULL; + kmem_cache_free(mac_promisc_impl_cache, mpip); + } } -const char * -mac_name(mac_handle_t mh) +void +i_mac_notify(mac_impl_t *mip, mac_notify_type_t type) { - return (((mac_impl_t *)mh)->mi_name); -} + mac_cb_info_t *mcbi; -minor_t -mac_minor(mac_handle_t mh) -{ - return (((mac_impl_t *)mh)->mi_minor); + /* + * Signal the notify thread even after mi_ref has become zero and + * mi_disabled is set. The synchronization with the notify thread + * happens in mac_unregister and that implies the driver must make + * sure it is single-threaded (with respect to mac calls) and that + * all pending mac calls have returned before it calls mac_unregister + */ + rw_enter(&i_mac_impl_lock, RW_READER); + if (mip->mi_state_flags & MIS_DISABLED) + goto exit; + + /* + * Guard against incorrect notifications. (Running a newer + * mac client against an older implementation?) + */ + if (type >= MAC_NNOTE) + goto exit; + + mcbi = &mip->mi_notify_cb_info; + mutex_enter(mcbi->mcbi_lockp); + mip->mi_notify_bits |= (1 << type); + cv_broadcast(&mcbi->mcbi_cv); + mutex_exit(mcbi->mcbi_lockp); + +exit: + rw_exit(&i_mac_impl_lock); } -uint64_t -mac_stat_get(mac_handle_t mh, uint_t stat) +/* + * Mac serialization primitives. Please see the block comment at the + * top of the file. + */ +void +i_mac_perim_enter(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - uint64_t val; - int ret; - - /* - * The range of stat determines where it is maintained. Stat - * values from 0 up to (but not including) MAC_STAT_MIN are - * mainteined by the mac module itself. Everything else is - * maintained by the driver. - */ - if (stat < MAC_STAT_MIN) { - /* These stats are maintained by the mac module itself. */ - switch (stat) { - case MAC_STAT_LINK_STATE: - return (mip->mi_linkstate); - case MAC_STAT_LINK_UP: - return (mip->mi_linkstate == LINK_STATE_UP); - case MAC_STAT_PROMISC: - return (mip->mi_devpromisc != 0); - default: - ASSERT(B_FALSE); - } - } + mac_client_impl_t *mcip; - /* - * Call the driver to get the given statistic. - */ - ret = mip->mi_getstat(mip->mi_driver, stat, &val); - if (ret != 0) { + if (mip->mi_state_flags & MIS_IS_VNIC) { /* - * The driver doesn't support this statistic. Get the - * statistic's default value. + * This is a VNIC. Return the lower mac since that is what + * we want to serialize on. */ - val = mac_stat_default(mip, stat); + mcip = mac_vnic_lower(mip); + mip = mcip->mci_mip; + } + + mutex_enter(&mip->mi_perim_lock); + if (mip->mi_perim_owner == curthread) { + mip->mi_perim_ocnt++; + mutex_exit(&mip->mi_perim_lock); + return; } - return (val); + + while (mip->mi_perim_owner != NULL) + cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock); + + mip->mi_perim_owner = curthread; + ASSERT(mip->mi_perim_ocnt == 0); + mip->mi_perim_ocnt++; +#ifdef DEBUG + mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack, + MAC_PERIM_STACK_DEPTH); +#endif + mutex_exit(&mip->mi_perim_lock); } int -mac_start(mac_handle_t mh) +i_mac_perim_enter_nowait(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - int err; + /* + * The vnic is a special case, since the serialization is done based + * on the lower mac. 
If the lower mac is busy, it does not imply the + * vnic can't be unregistered. But in the case of other drivers, + * a busy perimeter or open mac handles implies that the mac is busy + * and can't be unregistered. + */ + if (mip->mi_state_flags & MIS_IS_VNIC) { + i_mac_perim_enter(mip); + return (0); + } - ASSERT(mip->mi_start != NULL); + mutex_enter(&mip->mi_perim_lock); + if (mip->mi_perim_owner != NULL) { + mutex_exit(&mip->mi_perim_lock); + return (EBUSY); + } + ASSERT(mip->mi_perim_ocnt == 0); + mip->mi_perim_owner = curthread; + mip->mi_perim_ocnt++; + mutex_exit(&mip->mi_perim_lock); - rw_enter(&(mip->mi_state_lock), RW_WRITER); + return (0); +} - /* - * Check whether the device is already started. - */ - if (mip->mi_active++ != 0) { +void +i_mac_perim_exit(mac_impl_t *mip) +{ + mac_client_impl_t *mcip; + + if (mip->mi_state_flags & MIS_IS_VNIC) { /* - * It's already started so there's nothing more to do. + * This is a VNIC. Return the lower mac since that is what + * we want to serialize on. */ - err = 0; - goto done; + mcip = mac_vnic_lower(mip); + mip = mcip->mci_mip; } - /* - * Start the device. - */ - if ((err = mip->mi_start(mip->mi_driver)) != 0) - --mip->mi_active; + ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0); -done: - rw_exit(&(mip->mi_state_lock)); - return (err); + mutex_enter(&mip->mi_perim_lock); + if (--mip->mi_perim_ocnt == 0) { + mip->mi_perim_owner = NULL; + cv_signal(&mip->mi_perim_cv); + } + mutex_exit(&mip->mi_perim_lock); } -void -mac_stop(mac_handle_t mh) +/* + * Returns whether the current thread holds the mac perimeter. Used in making + * assertions. + */ +boolean_t +mac_perim_held(mac_handle_t mh) { mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip; - ASSERT(mip->mi_stop != NULL); - - rw_enter(&(mip->mi_state_lock), RW_WRITER); - - /* - * Check whether the device is still needed. - */ - ASSERT(mip->mi_active != 0); - if (--mip->mi_active != 0) { + if (mip->mi_state_flags & MIS_IS_VNIC) { /* - * It's still needed so there's nothing more to do. + * This is a VNIC. Return the lower mac since that is what + * we want to serialize on. */ - goto done; + mcip = mac_vnic_lower(mip); + mip = mcip->mci_mip; } + return (mip->mi_perim_owner == curthread); +} +/* + * mac client interfaces to enter the mac perimeter of a mac end point, given + * its mac handle, or macname or linkid. + */ +void +mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + i_mac_perim_enter(mip); /* - * Stop the device. + * The mac_perim_handle_t returned encodes the 'mip' and whether a + * mac_open has been done internally while entering the perimeter. + * This information is used in mac_perim_exit */ - mip->mi_stop(mip->mi_driver); - -done: - rw_exit(&(mip->mi_state_lock)); + MAC_ENCODE_MPH(*mphp, mip, 0); } int -mac_multicst_add(mac_handle_t mh, const uint8_t *addr) +mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_multicst_addr_t **pp; - mac_multicst_addr_t *p; - int err; - - ASSERT(mip->mi_multicst != NULL); + int err; + mac_handle_t mh; - /* - * Verify the address. - */ - if ((err = mip->mi_type->mt_ops.mtops_multicst_verify(addr, - mip->mi_pdata)) != 0) { + if ((err = mac_open(name, &mh)) != 0) return (err); - } - /* - * Check whether the given address is already enabled. 
- */ - rw_enter(&(mip->mi_data_lock), RW_WRITER); - for (pp = &(mip->mi_mmap); (p = *pp) != NULL; pp = &(p->mma_nextp)) { - if (bcmp(p->mma_addr, addr, mip->mi_type->mt_addr_length) == - 0) { - /* - * The address is already enabled so just bump the - * reference count. - */ - p->mma_ref++; - err = 0; - goto done; - } - } + mac_perim_enter_by_mh(mh, mphp); + MAC_ENCODE_MPH(*mphp, mh, 1); + return (0); +} - /* - * Allocate a new list entry. - */ - if ((p = kmem_zalloc(sizeof (mac_multicst_addr_t), - KM_NOSLEEP)) == NULL) { - err = ENOMEM; - goto done; - } +int +mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp) +{ + int err; + mac_handle_t mh; - /* - * Enable a new multicast address. - */ - if ((err = mip->mi_multicst(mip->mi_driver, B_TRUE, addr)) != 0) { - kmem_free(p, sizeof (mac_multicst_addr_t)); - goto done; - } + if ((err = mac_open_by_linkid(linkid, &mh)) != 0) + return (err); - /* - * Add the address to the list of enabled addresses. - */ - bcopy(addr, p->mma_addr, mip->mi_type->mt_addr_length); - p->mma_ref++; - *pp = p; + mac_perim_enter_by_mh(mh, mphp); + MAC_ENCODE_MPH(*mphp, mh, 1); + return (0); +} -done: - rw_exit(&(mip->mi_data_lock)); - return (err); +void +mac_perim_exit(mac_perim_handle_t mph) +{ + mac_impl_t *mip; + boolean_t need_close; + + MAC_DECODE_MPH(mph, mip, need_close); + i_mac_perim_exit(mip); + if (need_close) + mac_close((mac_handle_t)mip); } int -mac_multicst_remove(mac_handle_t mh, const uint8_t *addr) +mac_hold(const char *macname, mac_impl_t **pmip) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_multicst_addr_t **pp; - mac_multicst_addr_t *p; - int err; + mac_impl_t *mip; + int err; - ASSERT(mip->mi_multicst != NULL); + /* + * Check the device name length to make sure it won't overflow our + * buffer. + */ + if (strlen(macname) >= MAXNAMELEN) + return (EINVAL); /* - * Find the entry in the list for the given address. + * Look up its entry in the global hash table. */ - rw_enter(&(mip->mi_data_lock), RW_WRITER); - for (pp = &(mip->mi_mmap); (p = *pp) != NULL; pp = &(p->mma_nextp)) { - if (bcmp(p->mma_addr, addr, mip->mi_type->mt_addr_length) == - 0) { - if (--p->mma_ref == 0) - break; + rw_enter(&i_mac_impl_lock, RW_WRITER); + err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname, + (mod_hash_val_t *)&mip); - /* - * There is still a reference to this address so - * there's nothing more to do. - */ - err = 0; - goto done; - } + if (err != 0) { + rw_exit(&i_mac_impl_lock); + return (ENOENT); } - /* - * We did not find an entry for the given address so it is not - * currently enabled. - */ - if (p == NULL) { - err = ENOENT; - goto done; + if (mip->mi_state_flags & MIS_DISABLED) { + rw_exit(&i_mac_impl_lock); + return (ENOENT); } - ASSERT(p->mma_ref == 0); - /* - * Disable the multicast address. - */ - if ((err = mip->mi_multicst(mip->mi_driver, B_FALSE, addr)) != 0) { - p->mma_ref++; - goto done; + if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) { + rw_exit(&i_mac_impl_lock); + return (EBUSY); } - /* - * Remove it from the list. - */ - *pp = p->mma_nextp; - kmem_free(p, sizeof (mac_multicst_addr_t)); + mip->mi_ref++; + rw_exit(&i_mac_impl_lock); -done: - rw_exit(&(mip->mi_data_lock)); - return (err); + *pmip = mip; + return (0); } -/* - * mac_unicst_verify: Verifies the passed address. It fails - * if the passed address is a group address or has incorrect length. 
- */ -boolean_t -mac_unicst_verify(mac_handle_t mh, const uint8_t *addr, uint_t len) +void +mac_rele(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - - /* - * Verify the address. - */ - if ((len != mip->mi_type->mt_addr_length) || - (mip->mi_type->mt_ops.mtops_unicst_verify(addr, - mip->mi_pdata)) != 0) { - return (B_FALSE); - } else { - return (B_TRUE); + rw_enter(&i_mac_impl_lock, RW_WRITER); + ASSERT(mip->mi_ref != 0); + if (--mip->mi_ref == 0) { + ASSERT(mip->mi_nactiveclients == 0 && + !(mip->mi_state_flags & MIS_EXCLUSIVE)); } + rw_exit(&i_mac_impl_lock); } +/* + * This function is called only by mac_client_open. + */ int -mac_unicst_set(mac_handle_t mh, const uint8_t *addr) +mac_start(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - int err; - boolean_t notify = B_FALSE; - - ASSERT(mip->mi_unicst != NULL); + int err = 0; - /* - * Verify the address. - */ - if ((err = mip->mi_type->mt_ops.mtops_unicst_verify(addr, - mip->mi_pdata)) != 0) { - return (err); - } + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + ASSERT(mip->mi_start != NULL); /* - * Program the new unicast address. + * Check whether the device is already started. */ - rw_enter(&(mip->mi_data_lock), RW_WRITER); + if (mip->mi_active++ == 0) { + mac_ring_t *ring = NULL; - /* - * If address doesn't change, do nothing. - * This check is necessary otherwise it may call into mac_unicst_set - * recursively. - */ - if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) - goto done; + /* + * Start the device. + */ + err = mip->mi_start(mip->mi_driver); + if (err != 0) { + mip->mi_active--; + return (err); + } - if ((err = mip->mi_unicst(mip->mi_driver, addr)) != 0) - goto done; + /* + * Start the default tx ring. + */ + if (mip->mi_default_tx_ring != NULL) { - /* - * Save the address and flag that we need to send a notification. - */ - bcopy(addr, mip->mi_addr, mip->mi_type->mt_addr_length); - notify = B_TRUE; + ring = (mac_ring_t *)mip->mi_default_tx_ring; + err = mac_start_ring(ring); + if (err != 0) { + mip->mi_active--; + return (err); + } + ring->mr_state = MR_INUSE; + } -done: - rw_exit(&(mip->mi_data_lock)); + if (mip->mi_rx_groups != NULL) { + /* + * Start the default ring, since it will be needed + * to receive broadcast and multicast traffic for + * both primary and non-primary MAC clients. + */ + mac_group_t *grp = &mip->mi_rx_groups[0]; - if (notify) - i_mac_notify(mip, MAC_NOTE_UNICST); + ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED); + err = mac_start_group_and_rings(grp); + if (err != 0) { + mip->mi_active--; + if (ring != NULL) { + mac_stop_ring(ring); + ring->mr_state = MR_FREE; + } + return (err); + } + mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED); + } + } return (err); } +/* + * This function is called only by mac_client_close. + */ void -mac_unicst_get(mac_handle_t mh, uint8_t *addr) +mac_stop(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; + ASSERT(mip->mi_stop != NULL); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); /* - * Copy out the current unicast source address. + * Check whether the device is still needed. */ - rw_enter(&(mip->mi_data_lock), RW_READER); - bcopy(mip->mi_addr, addr, mip->mi_type->mt_addr_length); - rw_exit(&(mip->mi_data_lock)); -} + ASSERT(mip->mi_active != 0); + if (--mip->mi_active == 0) { + if (mip->mi_rx_groups != NULL) { + /* + * There should be no more active clients since the + * MAC is being stopped. Stop the default RX group + * and transition it back to registered state. 
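+ *
+ * Roughly, the default group is expected to move
+ * REGISTERED -> SHARED on the first mac_start() and
+ * back to REGISTERED on the last mac_stop(); RESERVED
+ * is entered only while an exclusive client owns the
+ * group (see mac_set_rx_group_state() below).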
+ */
+ mac_group_t *grp = &mip->mi_rx_groups[0];

-void
-mac_dest_get(mac_handle_t mh, uint8_t *addr)
-{
- mac_impl_t *mip = (mac_impl_t *)mh;
+ /*
+ * When clients are torn down, the groups
+ * are released via mac_release_rx_group which
+ * knows that the default group is always in
+ * started mode since broadcast uses it. So
+ * we can assert that there are no clients
+ * (since mac_bcast_add doesn't register itself
+ * as a client) and the group is in SHARED state.
+ */
+ ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
+ ASSERT(MAC_RX_GROUP_NO_CLIENT(grp) &&
+ mip->mi_nactiveclients == 0);
+ mac_stop_group_and_rings(grp);
+ mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
+ }

- /*
- * Copy out the current destination address.
- */
- rw_enter(&(mip->mi_data_lock), RW_READER);
- bcopy(mip->mi_dstaddr, addr, mip->mi_type->mt_addr_length);
- rw_exit(&(mip->mi_data_lock));
+ if (mip->mi_default_tx_ring != NULL) {
+ mac_ring_t *ring;
+
+ ring = (mac_ring_t *)mip->mi_default_tx_ring;
+ mac_stop_ring(ring);
+ ring->mr_state = MR_FREE;
+ }
+
+ /*
+ * Stop the device.
+ */
+ mip->mi_stop(mip->mi_driver);
+ }
}

int
-mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
+i_mac_promisc_set(mac_impl_t *mip, boolean_t on, mac_promisc_type_t ptype)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
 int err = 0;

+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
 ASSERT(mip->mi_setpromisc != NULL);
 ASSERT(ptype == MAC_DEVPROMISC || ptype == MAC_PROMISC);
@@ -958,7 +1142,6 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 * For details on the distinction between "device promiscuous mode"
 * and "MAC promiscuous mode", see PSARC/2005/289.
 */
- rw_enter(&(mip->mi_data_lock), RW_WRITER);
 if (on) {
 /*
 * Enable promiscuous mode on the device if not yet enabled.
@@ -967,7 +1150,7 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
 if (err != 0) {
 mip->mi_devpromisc--;
- goto done;
+ return (err);
 }
 i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
 }
@@ -978,10 +1161,9 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 if (ptype == MAC_PROMISC && mip->mi_promisc++ == 0)
 i_mac_notify(mip, MAC_NOTE_PROMISC);
 } else {
- if (mip->mi_devpromisc == 0) {
- err = EPROTO;
- goto done;
- }
+ if (mip->mi_devpromisc == 0)
+ return (EPROTO);
+
 /*
 * Disable promiscuous mode on the device if this is the last
 * enabling.
@@ -990,7 +1172,7 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
 if (err != 0) {
 mip->mi_devpromisc++;
- goto done;
+ return (err);
 }
 i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
 }
@@ -1003,11 +1185,27 @@ mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
 i_mac_notify(mip, MAC_NOTE_PROMISC);
 }

-done:
- rw_exit(&(mip->mi_data_lock));
- return (err);
+ return (0);
}

+int
+mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ int rv;
+
+ i_mac_perim_enter(mip);
+ rv = i_mac_promisc_set(mip, on, ptype);
+ i_mac_perim_exit(mip);
+
+ return (rv);
+}
+
+/*
+ * The promiscuity state can change any time.
If the caller needs to take + * actions that are atomic with the promiscuity state, then the caller needs + * to bracket the entire sequence with mac_perim_enter/exit + */ boolean_t mac_promisc_get(mac_handle_t mh, mac_promisc_type_t ptype) { @@ -1024,1296 +1222,1162 @@ mac_promisc_get(mac_handle_t mh, mac_promisc_type_t ptype) return (mip->mi_promisc != 0); } +/* + * Invoked at MAC instance attach time to initialize the list + * of factory MAC addresses supported by a MAC instance. This function + * builds a local cache in the mac_impl_t for the MAC addresses + * supported by the underlying hardware. The MAC clients themselves + * use the mac_addr_factory*() functions to query and reserve + * factory MAC addresses. + */ void -mac_sdu_get(mac_handle_t mh, uint_t *min_sdu, uint_t *max_sdu) +mac_addr_factory_init(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_capab_multifactaddr_t capab; + uint8_t *addr; + int i; - if (min_sdu != NULL) - *min_sdu = mip->mi_sdu_min; - if (max_sdu != NULL) - *max_sdu = mip->mi_sdu_max; -} - -void -mac_resources(mac_handle_t mh) -{ - mac_impl_t *mip = (mac_impl_t *)mh; + /* + * First round to see how many factory MAC addresses are available. + */ + bzero(&capab, sizeof (capab)); + if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR, + &capab) || (capab.mcm_naddr == 0)) { + /* + * The MAC instance doesn't support multiple factory + * MAC addresses, we're done here. + */ + return; + } /* - * If the driver supports resource registration, call the driver to - * ask it to register its resources. + * Allocate the space and get all the factory addresses. */ - if (mip->mi_callbacks->mc_callbacks & MC_RESOURCES) - mip->mi_resources(mip->mi_driver); + addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP); + capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr); + + mip->mi_factory_addr_num = capab.mcm_naddr; + mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num * + sizeof (mac_factory_addr_t), KM_SLEEP); + + for (i = 0; i < capab.mcm_naddr; i++) { + bcopy(addr + i * MAXMACADDRLEN, + mip->mi_factory_addr[i].mfa_addr, + mip->mi_type->mt_addr_length); + mip->mi_factory_addr[i].mfa_in_use = B_FALSE; + } + + kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN); } void -mac_ioctl(mac_handle_t mh, queue_t *wq, mblk_t *bp) +mac_addr_factory_fini(mac_impl_t *mip) { - mac_impl_t *mip = (mac_impl_t *)mh; - int cmd = ((struct iocblk *)bp->b_rptr)->ioc_cmd; - - if ((cmd == ND_GET && (mip->mi_callbacks->mc_callbacks & MC_GETPROP)) || - (cmd == ND_SET && (mip->mi_callbacks->mc_callbacks & MC_SETPROP))) { - /* - * If ndd props were registered, call them. - * Note that ndd ioctls are Obsolete - */ - mac_ndd_ioctl(mip, wq, bp); + if (mip->mi_factory_addr == NULL) { + ASSERT(mip->mi_factory_addr_num == 0); return; } - /* - * Call the driver to handle the ioctl. The driver may not support - * any ioctls, in which case we reply with a NAK on its behalf. - */ - if (mip->mi_callbacks->mc_callbacks & MC_IOCTL) - mip->mi_ioctl(mip->mi_driver, wq, bp); - else - miocnak(wq, bp, 0, EINVAL); + kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num * + sizeof (mac_factory_addr_t)); + + mip->mi_factory_addr = NULL; + mip->mi_factory_addr_num = 0; } -const mac_txinfo_t * -mac_do_tx_get(mac_handle_t mh, boolean_t is_vnic) +/* + * Reserve a factory MAC address. If *slot is set to -1, the function + * attempts to reserve any of the available factory MAC addresses and + * returns the reserved slot id. If no slots are available, the function + * returns ENOSPC. 
If *slot is not set to -1, the function reserves
+ * the specified slot if it is available, or returns EBUSY if the slot
+ * is already used. Returns ENOTSUP if the underlying MAC does not
+ * support multiple factory addresses. If the slot number is not -1 but
+ * is invalid, returns EINVAL.
+ */
+int
+mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_txinfo_t *mtp;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;
+ int i, ret = 0;

+ i_mac_perim_enter(mip);
 /*
- * Grab the lock to prevent us from racing with MAC_PROMISC being
- * changed. This is sufficient since MAC clients are careful to always
- * call mac_txloop_add() prior to enabling MAC_PROMISC, and to disable
- * MAC_PROMISC prior to calling mac_txloop_remove().
+ * Protect against concurrent readers that may need a self-consistent
+ * view of the factory addresses
 */
- rw_enter(&mip->mi_tx_lock, RW_READER);
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);

- if (mac_promisc_get(mh, MAC_PROMISC)) {
- ASSERT(mip->mi_mtfp != NULL);
- if (mip->mi_vnic_present && !is_vnic) {
- mtp = &mip->mi_vnic_txloopinfo;
- } else {
- mtp = &mip->mi_txloopinfo;
+ if (mip->mi_factory_addr_num == 0) {
+ ret = ENOTSUP;
+ goto bail;
+ }
+
+ if (*slot != -1) {
+ /* check the specified slot */
+ if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
+ ret = EINVAL;
+ goto bail;
+ }
+ if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
+ ret = EBUSY;
+ goto bail;
 }
 } else {
- if (mip->mi_vnic_present && !is_vnic) {
- mtp = &mip->mi_vnic_txinfo;
- } else {
- /*
- * Note that we cannot ASSERT() that mip->mi_mtfp is
- * NULL, because to satisfy the above ASSERT(), we
- * have to disable MAC_PROMISC prior to calling
- * mac_txloop_remove().
- */
- mtp = &mip->mi_txinfo;
+ /* pick the next available slot */
+ for (i = 0; i < mip->mi_factory_addr_num; i++) {
+ if (!mip->mi_factory_addr[i].mfa_in_use)
+ break;
+ }
+
+ if (i == mip->mi_factory_addr_num) {
+ ret = ENOSPC;
+ goto bail;
 }
+ *slot = i+1;
 }

- rw_exit(&mip->mi_tx_lock);
- return (mtp);
-}
+ mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
+ mip->mi_factory_addr[*slot-1].mfa_client = mcip;

-/*
- * Invoked by VNIC to obtain the transmit entry point.
- */
-const mac_txinfo_t *
-mac_vnic_tx_get(mac_handle_t mh)
-{
- return (mac_do_tx_get(mh, B_TRUE));
+bail:
+ rw_exit(&mip->mi_rw_lock);
+ i_mac_perim_exit(mip);
+ return (ret);
}

/*
- * Invoked by any non-VNIC client to obtain the transmit entry point.
- * If a VNIC is present, the VNIC transmit function provided by the VNIC
- * will be returned to the MAC client.
+ * Release the specified factory MAC address slot.
 */
-const mac_txinfo_t *
-mac_tx_get(mac_handle_t mh)
-{
- return (mac_do_tx_get(mh, B_FALSE));
-}
-
-link_state_t
-mac_link_get(mac_handle_t mh)
-{
- return (((mac_impl_t *)mh)->mi_linkstate);
-}
-
-mac_notify_handle_t
-mac_notify_add(mac_handle_t mh, mac_notify_t notify, void *arg)
+void
+mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_notify_fn_t *mnfp;
-
- mnfp = kmem_zalloc(sizeof (mac_notify_fn_t), KM_SLEEP);
- mnfp->mnf_fn = notify;
- mnfp->mnf_arg = arg;
+ mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = mcip->mci_mip;

+ i_mac_perim_enter(mip);
 /*
- * Add it to the head of the 'notify' callback list.
+ * Protect against concurrent readers that may need a self-consistent
+ * view of the factory addresses
 */
- rw_enter(&mip->mi_notify_lock, RW_WRITER);
- mnfp->mnf_nextp = mip->mi_mnfp;
- mip->mi_mnfp = mnfp;
- rw_exit(&mip->mi_notify_lock);
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);

- return ((mac_notify_handle_t)mnfp);
+ ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
+ ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
+
+ mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
+
+ rw_exit(&mip->mi_rw_lock);
+ i_mac_perim_exit(mip);
}

+/*
+ * Stores in mac_addr the value of the specified factory MAC address
+ * slot, and in addr_len its length. If the slot is in use and
+ * client_name is non-NULL, the name of the client owning the address
+ * is also copied out; the client_name buffer must be at least
+ * MAXNAMELEN bytes. The slot number must be valid for the MAC.
+ */
 void
-mac_notify_remove(mac_handle_t mh, mac_notify_handle_t mnh)
+mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
+ uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_notify_fn_t *mnfp = (mac_notify_fn_t *)mnh;
- mac_notify_fn_t **pp;
- mac_notify_fn_t *p;
+ mac_impl_t *mip = (mac_impl_t *)mh;
+ boolean_t in_use;
+
+ ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);

 /*
- * Search the 'notify' callback list for the function closure.
+ * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
+ * and mi_rw_lock
 */
- rw_enter(&mip->mi_notify_lock, RW_WRITER);
- for (pp = &(mip->mi_mnfp); (p = *pp) != NULL;
- pp = &(p->mnf_nextp)) {
- if (p == mnfp)
- break;
+ rw_enter(&mip->mi_rw_lock, RW_READER);
+ bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
+ *addr_len = mip->mi_type->mt_addr_length;
+ in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
+ if (in_use && client_name != NULL) {
+ bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
+ client_name, MAXNAMELEN);
 }
- ASSERT(p != NULL);
+ if (in_use_arg != NULL)
+ *in_use_arg = in_use;
+ rw_exit(&mip->mi_rw_lock);
+}

- /*
- * Remove it from the list.
- */
- *pp = p->mnf_nextp;
- rw_exit(&mip->mi_notify_lock);
+/*
+ * Returns the number of factory MAC addresses (in addition to the
+ * primary MAC address), 0 if the underlying MAC doesn't support
+ * that feature.
+ */
+uint_t
+mac_addr_factory_num(mac_handle_t mh)
+{
+ mac_impl_t *mip = (mac_impl_t *)mh;

- /*
- * Free it.
- */
- kmem_free(mnfp, sizeof (mac_notify_fn_t));
+ return (mip->mi_factory_addr_num);
}

+
 void
-mac_notify(mac_handle_t mh)
+mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_notify_type_t type;
+ mac_ring_t *ring;

- for (type = 0; type < MAC_NNOTE; type++)
- i_mac_notify(mip, type);
+ for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
+ ring->mr_flag &= ~flag;
}

/*
- * Register a receive function for this mac.
- * More information on this function's interaction with mac_rx()
- * can be found atop mac_rx().
+ * The following mac_hwrings_xxx() functions are private mac client functions
+ * used by the aggr driver to access and control the underlying HW Rx group
+ * and rings. In this case, the aggr driver has exclusive control of the
+ * underlying HW Rx group/rings; it calls the following functions to
+ * start/stop the HW Rx rings, disable/enable polling, add/remove MAC
+ * addresses, or set up the Rx callback.
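+ *
+ * A rough usage sketch for such an exclusive client, with error
+ * handling omitted and prh standing for the client's receive
+ * resource handle (mac_resource_handle_t):
+ *
+ *	mac_group_handle_t hwgh;
+ *	mac_ring_handle_t hwrh[MAX_RINGS_PER_GROUP];
+ *	int i, cnt;
+ *
+ *	cnt = mac_hwrings_get(mch, &hwgh, hwrh);
+ *	for (i = 0; i < cnt; i++)
+ *		mac_hwring_setup(hwrh[i], prh);
+ *	...
+ *	for (i = 0; i < cnt; i++)
+ *		mac_hwring_teardown(hwrh[i]);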
*/ -mac_rx_handle_t -mac_do_rx_add(mac_handle_t mh, mac_rx_t rx, void *arg, boolean_t is_active) +/* ARGSUSED */ +static void +mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs, + mblk_t *mp_chain, boolean_t loopback) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_rx_fn_t *mrfp; + mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + mac_direct_rx_t proc; + void *arg1; + mac_resource_handle_t arg2; - mrfp = kmem_zalloc(sizeof (mac_rx_fn_t), KM_SLEEP); - mrfp->mrf_fn = rx; - mrfp->mrf_arg = arg; - mrfp->mrf_active = is_active; + proc = srs_rx->sr_func; + arg1 = srs_rx->sr_arg1; + arg2 = mac_srs->srs_mrh; - /* - * Add it to the head of the 'rx' callback list. - */ - rw_enter(&(mip->mi_rx_lock), RW_WRITER); + proc(arg1, arg2, mp_chain, NULL); +} + +/* + * This function is called to get the list of HW rings that are reserved by + * an exclusive mac client. + * + * Return value: the number of HW rings. + */ +int +mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, + mac_ring_handle_t *hwrh) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + mac_group_t *grp = flent->fe_rx_ring_group; + mac_ring_t *ring; + int cnt = 0; /* - * mac_rx() will only call callbacks that are marked inuse. + * The mac client did not reserve any RX group, return directly. + * This is probably because the underlying MAC does not support + * any RX groups. */ - mrfp->mrf_inuse = B_TRUE; - mrfp->mrf_nextp = mip->mi_mrfp; + *hwgh = NULL; + if (grp == NULL) + return (0); /* - * mac_rx() could be traversing the remainder of the list - * and miss the new callback we're adding here. This is not a problem - * because we do not guarantee the callback to take effect immediately - * after mac_rx_add() returns. + * This RX group must be reserved by this mac client. */ - mip->mi_mrfp = mrfp; - rw_exit(&(mip->mi_rx_lock)); + ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) && + (mch == (mac_client_handle_t)(MAC_RX_GROUP_ONLY_CLIENT(grp)))); - return ((mac_rx_handle_t)mrfp); + for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next) { + ASSERT(cnt < MAX_RINGS_PER_GROUP); + hwrh[cnt++] = (mac_ring_handle_t)ring; + } + *hwgh = (mac_group_handle_t)grp; + return (cnt); } -mac_rx_handle_t -mac_rx_add(mac_handle_t mh, mac_rx_t rx, void *arg) +/* + * Setup the RX callback of the mac client which exclusively controls HW ring. + */ +void +mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh) { - return (mac_do_rx_add(mh, rx, arg, B_FALSE)); + mac_ring_t *hw_ring = (mac_ring_t *)hwrh; + mac_soft_ring_set_t *mac_srs = hw_ring->mr_srs; + + mac_srs->srs_mrh = prh; + mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process; } -mac_rx_handle_t -mac_active_rx_add(mac_handle_t mh, mac_rx_t rx, void *arg) +void +mac_hwring_teardown(mac_ring_handle_t hwrh) { - return (mac_do_rx_add(mh, rx, arg, B_TRUE)); + mac_ring_t *hw_ring = (mac_ring_t *)hwrh; + mac_soft_ring_set_t *mac_srs = hw_ring->mr_srs; + + mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process; + mac_srs->srs_mrh = NULL; } -/* - * Unregister a receive function for this mac. - * This function does not block if wait is B_FALSE. This is useful - * for clients who call mac_rx_remove() from a non-blockable context. - * More information on this function's interaction with mac_rx() - * can be found atop mac_rx(). 
- */ -void -mac_rx_remove(mac_handle_t mh, mac_rx_handle_t mrh, boolean_t wait) +int +mac_hwring_disable_intr(mac_ring_handle_t rh) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_rx_fn_t *mrfp = (mac_rx_fn_t *)mrh; - mac_rx_fn_t **pp; - mac_rx_fn_t *p; + mac_ring_t *rr_ring = (mac_ring_t *)rh; + mac_intr_t *intr = &rr_ring->mr_info.mri_intr; - /* - * Search the 'rx' callback list for the function closure. - */ - rw_enter(&mip->mi_rx_lock, RW_WRITER); - for (pp = &(mip->mi_mrfp); (p = *pp) != NULL; pp = &(p->mrf_nextp)) { - if (p == mrfp) - break; - } - ASSERT(p != NULL); + return (intr->mi_disable(intr->mi_handle)); +} - /* - * If mac_rx() is running, mark callback for deletion - * and return (if wait is false), or wait until mac_rx() - * exits (if wait is true). - */ - if (mip->mi_rx_ref > 0) { - DTRACE_PROBE1(defer_delete, mac_impl_t *, mip); - p->mrf_inuse = B_FALSE; - mutex_enter(&mip->mi_lock); - mip->mi_rx_removed++; - mutex_exit(&mip->mi_lock); +int +mac_hwring_enable_intr(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; + mac_intr_t *intr = &rr_ring->mr_info.mri_intr; - rw_exit(&mip->mi_rx_lock); - if (wait) - mac_rx_remove_wait(mh); - return; - } + return (intr->mi_enable(intr->mi_handle)); +} + +int +mac_hwring_start(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; - /* Remove it from the list. */ - *pp = p->mrf_nextp; - kmem_free(mrfp, sizeof (mac_rx_fn_t)); - rw_exit(&mip->mi_rx_lock); + MAC_RING_UNMARK(rr_ring, MR_QUIESCE); + return (0); } -/* - * Wait for all pending callback removals to be completed by mac_rx(). - * Note that if we call mac_rx_remove() immediately before this, there is no - * guarantee we would wait *only* on the callback that we specified. - * mac_rx_remove() could have been called by other threads and we would have - * to wait for other marked callbacks to be removed as well. - */ void -mac_rx_remove_wait(mac_handle_t mh) +mac_hwring_stop(mac_ring_handle_t rh) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_ring_t *rr_ring = (mac_ring_t *)rh; - mutex_enter(&mip->mi_lock); - while (mip->mi_rx_removed > 0) { - DTRACE_PROBE1(need_wait, mac_impl_t *, mip); - cv_wait(&mip->mi_rx_cv, &mip->mi_lock); - } - mutex_exit(&mip->mi_lock); + mac_rx_ring_quiesce(rr_ring, MR_QUIESCE); } -mac_txloop_handle_t -mac_txloop_add(mac_handle_t mh, mac_txloop_t tx, void *arg) +mblk_t * +mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_txloop_fn_t *mtfp; + mac_ring_t *rr_ring = (mac_ring_t *)rh; + mac_ring_info_t *info = &rr_ring->mr_info; - mtfp = kmem_zalloc(sizeof (mac_txloop_fn_t), KM_SLEEP); - mtfp->mtf_fn = tx; - mtfp->mtf_arg = arg; + return (info->mri_poll(info->mri_driver, bytes_to_pickup)); +} - /* - * Add it to the head of the 'tx' callback list. - */ - rw_enter(&(mip->mi_tx_lock), RW_WRITER); - mtfp->mtf_nextp = mip->mi_mtfp; - mip->mi_mtfp = mtfp; - rw_exit(&(mip->mi_tx_lock)); +int +mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr) +{ + mac_group_t *group = (mac_group_t *)gh; - return ((mac_txloop_handle_t)mtfp); + return (mac_group_addmac(group, addr)); +} + +int +mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr) +{ + mac_group_t *group = (mac_group_t *)gh; + + return (mac_group_remmac(group, addr)); } /* - * Unregister a transmit function for this mac. This removes the function - * from the list of transmit functions for this mac. + * Set the RX group to be shared/reserved. Note that the group must be + * started/stopped outside of this function. 
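+ *
+ * An illustrative pairing, mirroring what mac_start() and mac_stop()
+ * do for the default group:
+ *
+ *	if (mac_start_group_and_rings(grp) == 0)
+ *		mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED);
+ *	...
+ *	mac_stop_group_and_rings(grp);
+ *	mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);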
*/
 void
-mac_txloop_remove(mac_handle_t mh, mac_txloop_handle_t mth)
+mac_set_rx_group_state(mac_group_t *grp, mac_group_state_t state)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mac_txloop_fn_t *mtfp = (mac_txloop_fn_t *)mth;
- mac_txloop_fn_t **pp;
- mac_txloop_fn_t *p;
-
 /*
- * Search the 'tx' callback list for the function.
+ * If there is no change in the group state, just return.
 */
- rw_enter(&(mip->mi_tx_lock), RW_WRITER);
- for (pp = &(mip->mi_mtfp); (p = *pp) != NULL; pp = &(p->mtf_nextp)) {
- if (p == mtfp)
- break;
+ if (grp->mrg_state == state)
+ return;
+
+ switch (state) {
+ case MAC_GROUP_STATE_RESERVED:
+ /*
+ * Successfully reserved the group.
+ *
+ * Given that there is an exclusive client controlling this
+ * group, we enable the group level polling when available,
+ * so that SRSs get to turn on/off individual rings they're
+ * assigned to.
+ */
+ ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
+
+ if (GROUP_INTR_DISABLE_FUNC(grp) != NULL)
+ GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
+
+ break;
+
+ case MAC_GROUP_STATE_SHARED:
+ /*
+ * Set all rings of this group to software classified.
+ * If the group has an overriding interrupt, then re-enable it.
+ */
+ ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
+
+ if (GROUP_INTR_ENABLE_FUNC(grp) != NULL)
+ GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
+
+ /* The ring is not available for reservations any more */
+ break;
+
+ case MAC_GROUP_STATE_REGISTERED:
+ /* Also callable from mac_register, perim is not held */
+ break;
+
+ default:
+ ASSERT(B_FALSE);
+ break;
 }
- ASSERT(p != NULL);

- /* Remove it from the list. */
- *pp = p->mtf_nextp;
- kmem_free(mtfp, sizeof (mac_txloop_fn_t));
- rw_exit(&(mip->mi_tx_lock));
+ grp->mrg_state = state;
}

-void
-mac_resource_set(mac_handle_t mh, mac_resource_add_t add, void *arg)
+/*
+ * Quiesce future hardware classified packets for the specified Rx ring
+ */
+static void
+mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
-
- /*
- * Update the 'resource_add' callbacks.
- */
- rw_enter(&(mip->mi_resource_lock), RW_WRITER);
- mip->mi_resource_add = add;
- mip->mi_resource_add_arg = arg;
- rw_exit(&(mip->mi_resource_lock));
+ ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
+ ASSERT(ring_flag == MR_CONDEMNED || ring_flag == MR_QUIESCE);
+
+ mutex_enter(&rx_ring->mr_lock);
+ rx_ring->mr_flag |= ring_flag;
+ while (rx_ring->mr_refcnt != 0)
+ cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
+ mutex_exit(&rx_ring->mr_lock);
}

/*
- * Driver support functions.
+ * Please see mac_tx for details about the per cpu locking scheme
 */
-
-mac_register_t *
-mac_alloc(uint_t mac_version)
+static void
+mac_tx_lock_all(mac_client_impl_t *mcip)
{
- mac_register_t *mregp;
+ int i;

- /*
- * Make sure there isn't a version mismatch between the driver and
- * the framework. In the future, if multiple versions are
- * supported, this check could become more sophisticated.
- */
- if (mac_version != MAC_VERSION)
- return (NULL);
-
- mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP);
- mregp->m_version = mac_version;
- return (mregp);
+ for (i = 0; i <= mac_tx_percpu_cnt; i++)
+ mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}

-void
-mac_free(mac_register_t *mregp)
+static void
+mac_tx_unlock_all(mac_client_impl_t *mcip)
{
- kmem_free(mregp, sizeof (mac_register_t));
+ int i;
+
+ for (i = mac_tx_percpu_cnt; i >= 0; i--)
+ mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}

-/*
- * Allocate a minor number.
- */ -minor_t -mac_minor_hold(boolean_t sleep) +static void +mac_tx_unlock_allbutzero(mac_client_impl_t *mcip) { - minor_t minor; + int i; - /* - * Grab a value from the arena. - */ - atomic_add_32(&minor_count, 1); + for (i = mac_tx_percpu_cnt; i > 0; i--) + mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock); +} - if (sleep) - minor = (uint_t)id_alloc(minor_ids); - else - minor = (uint_t)id_alloc_nosleep(minor_ids); +static int +mac_tx_sum_refcnt(mac_client_impl_t *mcip) +{ + int i; + int refcnt = 0; - if (minor == 0) { - atomic_add_32(&minor_count, -1); - return (0); - } + for (i = 0; i <= mac_tx_percpu_cnt; i++) + refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt; - return (minor); + return (refcnt); } /* - * Release a previously allocated minor number. + * Stop future Tx packets coming down from the client in preparation for + * quiescing the Tx side. This is needed for dynamic reclaim and reassignment + * of rings between clients */ void -mac_minor_rele(minor_t minor) +mac_tx_client_block(mac_client_impl_t *mcip) { - /* - * Return the value to the arena. - */ - id_free(minor_ids, minor); - atomic_add_32(&minor_count, -1); + mac_tx_lock_all(mcip); + mcip->mci_tx_flag |= MCI_TX_QUIESCE; + while (mac_tx_sum_refcnt(mcip) != 0) { + mac_tx_unlock_allbutzero(mcip); + cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock); + mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock); + mac_tx_lock_all(mcip); + } + mac_tx_unlock_all(mcip); } -uint32_t -mac_no_notification(mac_handle_t mh) +void +mac_tx_client_unblock(mac_client_impl_t *mcip) { - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_unsup_note); + mac_tx_lock_all(mcip); + mcip->mci_tx_flag &= ~MCI_TX_QUIESCE; + mac_tx_unlock_all(mcip); } -boolean_t -mac_is_legacy(mac_handle_t mh) +/* + * Wait for an SRS to quiesce. The SRS worker will signal us when the + * quiesce is done. + */ +static void +mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag) { - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_legacy); + mutex_enter(&srs->srs_lock); + while (!(srs->srs_state & srs_flag)) + cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock); + mutex_exit(&srs->srs_lock); } /* - * mac_register() is how drivers register new MACs with the GLDv3 - * framework. The mregp argument is allocated by drivers using the - * mac_alloc() function, and can be freed using mac_free() immediately upon - * return from mac_register(). Upon success (0 return value), the mhp - * opaque pointer becomes the driver's handle to its MAC interface, and is - * the argument to all other mac module entry points. + * Quiescing an Rx SRS is achieved by the following sequence. The protocol + * works bottom up by cutting off packet flow from the bottommost point in the + * mac, then the SRS, and then the soft rings. There are 2 use cases of this + * mechanism. One is a temporary quiesce of the SRS, such as say while changing + * the Rx callbacks. Another use case is Rx SRS teardown. In the former case + * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used + * for the SRS and MR flags. In the former case the threads pause waiting for + * a restart, while in the latter case the threads exit. The Tx SRS teardown + * is also mostly similar to the above. + * + * 1. Stop future hardware classified packets at the lowest level in the mac. + * Remove any hardware classification rule (CONDEMNED case) and mark the + * rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt + * from increasing. 
Upcalls from the driver that come through hardware
+ * classification will be dropped in mac_rx from now on. Then we wait for
+ * the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
+ * sure there aren't any upcall threads from the driver through hardware
+ * classification. In the case of SRS teardown we also remove the
+ * classification rule in the driver.
+ *
+ * 2. Stop future software classified packets by marking the flow entry with
+ * FE_QUIESCE or FE_CONDEMNED as appropriate, which prevents the refcnt from
+ * increasing. We also remove the flow entry from the table in the latter
+ * case. Then wait for the fe_refcnt to reach an appropriate quiescent value
+ * that indicates there aren't any active threads using that flow entry.
+ *
+ * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
+ * SRS worker thread, and the soft ring threads are quiesced in sequence
+ * with the SRS worker thread serving as a master controller. This
+ * mechanism is explained in mac_srs_worker_quiesce().
+ *
+ * The restart mechanism to reactivate the SRS and softrings is explained
+ * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
+ * restart sequence.
+ */
-int
-mac_register(mac_register_t *mregp, mac_handle_t *mhp)
-{
- mac_impl_t *mip;
- mactype_t *mtype;
- int err = EINVAL;
- struct devnames *dnp = NULL;
- uint_t instance;
- boolean_t style1_created = B_FALSE;
- boolean_t style2_created = B_FALSE;
- mac_capab_legacy_t legacy;
- char *driver;
- minor_t minor = 0;
-
- /* Find the required MAC-Type plugin. */
- if ((mtype = i_mactype_getplugin(mregp->m_type_ident)) == NULL)
- return (EINVAL);
+void
+mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
+{
+ flow_entry_t *flent = srs->srs_flent;
+ uint_t mr_flag, srs_done_flag;

- /* Create a mac_impl_t to represent this MAC. */
- mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP);
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
+ ASSERT(!(srs->srs_type & SRST_TX));

- /*
- * The mac is not ready for open yet.
- */
- mip->mi_disabled = B_TRUE;
-
- /*
- * When a mac is registered, the m_instance field can be set to:
- *
- * 0: Get the mac's instance number from m_dip.
- * This is usually used for physical device dips.
- *
- * [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number.
- * For example, when an aggregation is created with the key option,
- * "key" will be used as the instance number.
- *
- * -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1].
- * This is often used when a MAC of a virtual link is registered
- * (e.g., aggregation when "key" is not specified, or vnic).
- *
- * Note that the instance number is used to derive the mi_minor field
- * of mac_impl_t, which will then be used to derive the name of kstats
- * and the devfs nodes. The first 2 cases are needed to preserve
- * backward compatibility.
- */ - switch (mregp->m_instance) { - case 0: - instance = ddi_get_instance(mregp->m_dip); - break; - case ((uint_t)-1): - minor = mac_minor_hold(B_TRUE); - if (minor == 0) { - err = ENOSPC; - goto fail; - } - instance = minor - 1; - break; - default: - instance = mregp->m_instance; - if (instance >= MAC_MAX_MINOR) { - err = EINVAL; - goto fail; - } - break; + if (srs_quiesce_flag == SRS_CONDEMNED) { + mr_flag = MR_CONDEMNED; + srs_done_flag = SRS_CONDEMNED_DONE; + if (srs->srs_type & SRST_CLIENT_POLL_ENABLED) + mac_srs_client_poll_disable(srs->srs_mcip, srs); + } else { + ASSERT(srs_quiesce_flag == SRS_QUIESCE); + mr_flag = MR_QUIESCE; + srs_done_flag = SRS_QUIESCE_DONE; + if (srs->srs_type & SRST_CLIENT_POLL_ENABLED) + mac_srs_client_poll_quiesce(srs->srs_mcip, srs); } - mip->mi_minor = (minor_t)(instance + 1); - mip->mi_dip = mregp->m_dip; - - driver = (char *)ddi_driver_name(mip->mi_dip); - - /* Construct the MAC name as <drvname><instance> */ - (void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d", - driver, instance); - - mip->mi_driver = mregp->m_driver; - - mip->mi_type = mtype; - mip->mi_margin = mregp->m_margin; - mip->mi_info.mi_media = mtype->mt_type; - mip->mi_info.mi_nativemedia = mtype->mt_nativetype; - if (mregp->m_max_sdu <= mregp->m_min_sdu) - goto fail; - mip->mi_sdu_min = mregp->m_min_sdu; - mip->mi_sdu_max = mregp->m_max_sdu; - mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length; - /* - * If the media supports a broadcast address, cache a pointer to it - * in the mac_info_t so that upper layers can use it. - */ - mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr; - - /* - * Copy the unicast source address into the mac_info_t, but only if - * the MAC-Type defines a non-zero address length. We need to - * handle MAC-Types that have an address length of 0 - * (point-to-point protocol MACs for example). - */ - if (mip->mi_type->mt_addr_length > 0) { - if (mregp->m_src_addr == NULL) - goto fail; - mip->mi_info.mi_unicst_addr = - kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP); - bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr, - mip->mi_type->mt_addr_length); - + if (srs->srs_ring != NULL) { + mac_rx_ring_quiesce(srs->srs_ring, mr_flag); + } else { /* - * Copy the fixed 'factory' MAC address from the immutable - * info. This is taken to be the MAC address currently in - * use. + * SRS is driven by software classification. In case + * of CONDEMNED, the top level teardown functions will + * deal with flow removal. */ - bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr, - mip->mi_type->mt_addr_length); - /* Copy the destination address if one is provided. */ - if (mregp->m_dst_addr != NULL) { - bcopy(mregp->m_dst_addr, mip->mi_dstaddr, - mip->mi_type->mt_addr_length); + if (srs_quiesce_flag != SRS_CONDEMNED) { + FLOW_MARK(flent, FE_QUIESCE); + mac_flow_wait(flent, FLOW_DRIVER_UPCALL); } - } else if (mregp->m_src_addr != NULL) { - goto fail; } /* - * The format of the m_pdata is specific to the plugin. It is - * passed in as an argument to all of the plugin callbacks. The - * driver can update this information by calling - * mac_pdata_update(). + * Signal the SRS to quiesce itself, and then cv_wait for the + * SRS quiesce to complete. The SRS worker thread will wake us + * up when the quiesce is complete */ - if (mregp->m_pdata != NULL) { - /* - * Verify that the plugin supports MAC plugin data and that - * the supplied data is valid. 
- */
- if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
- goto fail;
- if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
- mregp->m_pdata_size)) {
- goto fail;
- }
- mip->mi_pdata = kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
- bcopy(mregp->m_pdata, mip->mi_pdata, mregp->m_pdata_size);
- mip->mi_pdata_size = mregp->m_pdata_size;
- }
+ mac_srs_signal(srs, srs_quiesce_flag);
+ mac_srs_quiesce_wait(srs, srs_done_flag);
+}

- /*
- * Register the private properties.
- */
- mac_register_priv_prop(mip, mregp->m_priv_props,
- mregp->m_priv_prop_count);
+/*
+ * Remove an SRS.
+ */
+void
+mac_rx_srs_remove(mac_soft_ring_set_t *srs)
+{
+ flow_entry_t *flent = srs->srs_flent;
+ int i;

+ mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
 /*
- * Stash the driver callbacks into the mac_impl_t, but first sanity
- * check to make sure all mandatory callbacks are set.
+ * Locate and remove our entry in the fe_rx_srs[] array, and
+ * adjust the fe_rx_srs array entries and array count by
+ * moving the last entry into the vacated spot.
 */
- if (mregp->m_callbacks->mc_getstat == NULL ||
- mregp->m_callbacks->mc_start == NULL ||
- mregp->m_callbacks->mc_stop == NULL ||
- mregp->m_callbacks->mc_setpromisc == NULL ||
- mregp->m_callbacks->mc_multicst == NULL ||
- mregp->m_callbacks->mc_unicst == NULL ||
- mregp->m_callbacks->mc_tx == NULL) {
- goto fail;
+ mutex_enter(&flent->fe_lock);
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ if (flent->fe_rx_srs[i] == srs)
+ break;
 }
- mip->mi_callbacks = mregp->m_callbacks;

- /*
- * Set up the possible transmit routines.
- */
- mip->mi_txinfo.mt_fn = mip->mi_tx;
- mip->mi_txinfo.mt_arg = mip->mi_driver;
+ ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
+ if (i != flent->fe_rx_srs_cnt - 1) {
+ flent->fe_rx_srs[i] =
+ flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
+ i = flent->fe_rx_srs_cnt - 1;
+ }

- mip->mi_legacy = mac_capab_get((mac_handle_t)mip,
- MAC_CAPAB_LEGACY, &legacy);
+ flent->fe_rx_srs[i] = NULL;
+ flent->fe_rx_srs_cnt--;
+ mutex_exit(&flent->fe_lock);

- if (mip->mi_legacy) {
- /*
- * Legacy device. Messages being sent will be looped back
- * by the underlying driver. Therefore the txloop function
- * pointer is the same as the tx function pointer.
- */
- mip->mi_txloopinfo.mt_fn = mip->mi_txinfo.mt_fn;
- mip->mi_txloopinfo.mt_arg = mip->mi_txinfo.mt_arg;
- mip->mi_unsup_note = legacy.ml_unsup_note;
- mip->mi_phy_dev = legacy.ml_dev;
- } else {
- /*
- * Normal device. The framework needs to do the loopback.
- */
- mip->mi_txloopinfo.mt_fn = mac_txloop;
- mip->mi_txloopinfo.mt_arg = mip;
- mip->mi_unsup_note = 0;
- mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
- ddi_get_instance(mip->mi_dip) + 1);
- }
+ mac_srs_free(srs);
+}

- mip->mi_vnic_txinfo.mt_fn = mac_vnic_tx;
- mip->mi_vnic_txinfo.mt_arg = mip;
+static void
+mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
+{
+ mutex_enter(&srs->srs_lock);
+ srs->srs_state &= ~flag;
+ mutex_exit(&srs->srs_lock);
+}
+
+void
+mac_rx_srs_restart(mac_soft_ring_set_t *srs)
+{
+ flow_entry_t *flent = srs->srs_flent;
+ mac_ring_t *mr;

- mip->mi_vnic_txloopinfo.mt_fn = mac_vnic_txloop;
- mip->mi_vnic_txloopinfo.mt_arg = mip;
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
+ ASSERT((srs->srs_type & SRST_TX) == 0);

 /*
- * Allocate a notification thread.
+ * This handles a change in the number of SRSs between the quiesce
+ * and restart operations of a flow.
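+ *
+ * A temporary quiesce/restart cycle on an Rx SRS pairs the two calls
+ * as sketched below; the mac perimeter must be held throughout:
+ *
+ *	mac_rx_srs_quiesce(srs, SRS_QUIESCE);
+ *	... resize the fanout or reassign rings ...
+ *	mac_rx_srs_restart(srs);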
*/ - mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread, - mip, 0, &p0, TS_RUN, minclsyspri); - if (mip->mi_notify_thread == NULL) - goto fail; + if (!SRS_QUIESCED(srs)) + return; /* - * Initialize the kstats for this device. + * Signal the SRS to restart itself. Wait for the restart to complete + * Note that we only restart the SRS if it is not marked as + * permanently quiesced. */ - mac_stat_create(mip); - - - /* set the gldv3 flag in dn_flags */ - dnp = &devnamesp[ddi_driver_major(mip->mi_dip)]; - LOCK_DEV_OPS(&dnp->dn_lock); - dnp->dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER); - UNLOCK_DEV_OPS(&dnp->dn_lock); - - if (mip->mi_minor < MAC_MAX_MINOR + 1) { - /* Create a style-2 DLPI device */ - if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0, - DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS) - goto fail; - style2_created = B_TRUE; + if (!SRS_QUIESCED_PERMANENT(srs)) { + mac_srs_signal(srs, SRS_RESTART); + mac_srs_quiesce_wait(srs, SRS_RESTART_DONE); + mac_srs_clear_flag(srs, SRS_RESTART_DONE); - /* Create a style-1 DLPI device */ - if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR, - mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS) - goto fail; - style1_created = B_TRUE; + mac_srs_client_poll_restart(srs->srs_mcip, srs); } - rw_enter(&i_mac_impl_lock, RW_WRITER); - if (mod_hash_insert(i_mac_impl_hash, - (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) { - - rw_exit(&i_mac_impl_lock); - err = EEXIST; - goto fail; + /* Finally clear the flags to let the packets in */ + mr = srs->srs_ring; + if (mr != NULL) { + MAC_RING_UNMARK(mr, MR_QUIESCE); + /* In case the ring was stopped, safely restart it */ + (void) mac_start_ring(mr); + } else { + FLOW_UNMARK(flent, FE_QUIESCE); } +} - DTRACE_PROBE2(mac__register, struct devnames *, dnp, - (mac_impl_t *), mip); - - /* - * Mark the MAC to be ready for open. - */ - mip->mi_disabled = B_FALSE; - - rw_exit(&i_mac_impl_lock); - - atomic_inc_32(&i_mac_impl_count); +/* + * Temporary quiesce of a flow and associated Rx SRS. + * Please see block comment above mac_rx_classify_flow_rem. 
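+ *
+ * This is a mac_flow_walk callback; a caller typically applies it to
+ * every subflow of a client, as mac_rx_client_quiesce() does below:
+ *
+ *	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+ *	    mac_rx_classify_flow_quiesce, NULL);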
+ */ +/* ARGSUSED */ +int +mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg) +{ + int i; - cmn_err(CE_NOTE, "!%s registered", mip->mi_name); - *mhp = (mac_handle_t)mip; + for (i = 0; i < flent->fe_rx_srs_cnt; i++) { + mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i], + SRS_QUIESCE); + } return (0); +} -fail: - if (style1_created) - ddi_remove_minor_node(mip->mi_dip, mip->mi_name); - - if (style2_created) - ddi_remove_minor_node(mip->mi_dip, driver); +/* + * Restart a flow and associated Rx SRS that has been quiesced temporarily + * Please see block comment above mac_rx_classify_flow_rem + */ +/* ARGSUSED */ +int +mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg) +{ + int i; - /* clean up notification thread */ - if (mip->mi_notify_thread != NULL) { - mutex_enter(&mip->mi_notify_bits_lock); - mip->mi_notify_bits = (1 << MAC_NNOTE); - cv_broadcast(&mip->mi_notify_cv); - while (mip->mi_notify_bits != 0) - cv_wait(&mip->mi_notify_cv, &mip->mi_notify_bits_lock); - mutex_exit(&mip->mi_notify_bits_lock); - } + for (i = 0; i < flent->fe_rx_srs_cnt; i++) + mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]); - if (mip->mi_info.mi_unicst_addr != NULL) { - kmem_free(mip->mi_info.mi_unicst_addr, - mip->mi_type->mt_addr_length); - mip->mi_info.mi_unicst_addr = NULL; - } + return (0); +} - mac_stat_destroy(mip); +void +mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + mac_impl_t *mip = mcip->mci_mip; + mac_soft_ring_set_t *mac_srs; + int i; - if (mip->mi_type != NULL) { - atomic_dec_32(&mip->mi_type->mt_ref); - mip->mi_type = NULL; - } + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - if (mip->mi_pdata != NULL) { - kmem_free(mip->mi_pdata, mip->mi_pdata_size); - mip->mi_pdata = NULL; - mip->mi_pdata_size = 0; - } + if (flent == NULL) + return; - if (minor != 0) { - ASSERT(minor > MAC_MAX_MINOR); - mac_minor_rele(minor); + for (i = 0; i < flent->fe_rx_srs_cnt; i++) { + mac_srs = flent->fe_rx_srs[i]; + mutex_enter(&mac_srs->srs_lock); + if (on) + mac_srs->srs_state |= SRS_QUIESCE_PERM; + else + mac_srs->srs_state &= ~SRS_QUIESCE_PERM; + mutex_exit(&mac_srs->srs_lock); } - - mac_unregister_priv_prop(mip); - - kmem_cache_free(i_mac_impl_cachep, mip); - return (err); } -int -mac_disable(mac_handle_t mh) +void +mac_rx_client_quiesce(mac_client_handle_t mch) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; - /* - * See if there are any other references to this mac_t (e.g., VLAN's). - * If not, set mi_disabled to prevent any new VLAN's from being - * created while we're destroying this mac. 
- */ - rw_enter(&i_mac_impl_lock, RW_WRITER); - if (mip->mi_ref > 0) { - rw_exit(&i_mac_impl_lock); - return (EBUSY); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + if (MCIP_DATAPATH_SETUP(mcip)) { + (void) mac_rx_classify_flow_quiesce(mcip->mci_flent, + NULL); + (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_rx_classify_flow_quiesce, NULL); } - mip->mi_disabled = B_TRUE; - rw_exit(&i_mac_impl_lock); - return (0); } -int -mac_unregister(mac_handle_t mh) +void +mac_rx_client_restart(mac_client_handle_t mch) { - int err; - mac_impl_t *mip = (mac_impl_t *)mh; - mod_hash_val_t val; - mac_multicst_addr_t *p, *nextp; - mac_margin_req_t *mmr, *nextmmr; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; - /* - * See if there are any other references to this mac_t (e.g., VLAN's). - * If not, set mi_disabled to prevent any new VLAN's from being - * created while we're destroying this mac. Once mac_disable() returns - * 0, the rest of mac_unregister() stuff should continue without - * returning an error. - */ - if (!mip->mi_disabled) { - if ((err = mac_disable(mh)) != 0) - return (err); - } + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - /* - * Clean up notification thread (wait for it to exit). - */ - mutex_enter(&mip->mi_notify_bits_lock); - mip->mi_notify_bits = (1 << MAC_NNOTE); - cv_broadcast(&mip->mi_notify_cv); - while (mip->mi_notify_bits != 0) - cv_wait(&mip->mi_notify_cv, &mip->mi_notify_bits_lock); - mutex_exit(&mip->mi_notify_bits_lock); - - if (mip->mi_minor < MAC_MAX_MINOR + 1) { - ddi_remove_minor_node(mip->mi_dip, mip->mi_name); - ddi_remove_minor_node(mip->mi_dip, - (char *)ddi_driver_name(mip->mi_dip)); + if (MCIP_DATAPATH_SETUP(mcip)) { + (void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL); + (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_rx_classify_flow_restart, NULL); } +} - ASSERT(!mip->mi_activelink); - - mac_stat_destroy(mip); - - rw_enter(&i_mac_impl_lock, RW_WRITER); - (void) mod_hash_remove(i_mac_impl_hash, - (mod_hash_key_t)mip->mi_name, &val); - ASSERT(mip == (mac_impl_t *)val); +/* + * This function only quiesces the Tx SRS and softring worker threads. Callers + * need to make sure that there aren't any mac client threads doing current or + * future transmits in the mac before calling this function. + */ +void +mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag) +{ + mac_client_impl_t *mcip = srs->srs_mcip; - ASSERT(i_mac_impl_count > 0); - atomic_dec_32(&i_mac_impl_count); - rw_exit(&i_mac_impl_lock); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); - if (mip->mi_pdata != NULL) - kmem_free(mip->mi_pdata, mip->mi_pdata_size); - mip->mi_pdata = NULL; - mip->mi_pdata_size = 0; + ASSERT(srs->srs_type & SRST_TX); + ASSERT(srs_quiesce_flag == SRS_CONDEMNED || + srs_quiesce_flag == SRS_QUIESCE); /* - * Free the list of multicast addresses. + * Signal the SRS to quiesce itself, and then cv_wait for the + * SRS quiesce to complete. The SRS worker thread will wake us + * up when the quiesce is complete */ - for (p = mip->mi_mmap; p != NULL; p = nextp) { - nextp = p->mma_nextp; - kmem_free(p, sizeof (mac_multicst_addr_t)); - } - mip->mi_mmap = NULL; + mac_srs_signal(srs, srs_quiesce_flag); + mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ? + SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE); +} +void +mac_tx_srs_restart(mac_soft_ring_set_t *srs) +{ /* - * Free the list of margin request. + * Resizing the fanout could result in creation of new SRSs. 
+ * They may not necessarily be in the quiesced state in which
+ * case they need to be restarted
 */
- for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
- nextmmr = mmr->mmr_nextp;
- kmem_free(mmr, sizeof (mac_margin_req_t));
- }
- mip->mi_mmrp = NULL;
-
- mip->mi_linkstate = LINK_STATE_UNKNOWN;
- kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
- mip->mi_info.mi_unicst_addr = NULL;
-
- atomic_dec_32(&mip->mi_type->mt_ref);
- mip->mi_type = NULL;
-
- if (mip->mi_minor > MAC_MAX_MINOR)
- mac_minor_rele(mip->mi_minor);
-
- mac_unregister_priv_prop(mip);
-
- cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
-
- kmem_cache_free(i_mac_impl_cachep, mip);
+ if (!SRS_QUIESCED(srs))
+ return;

- return (0);
+ mac_srs_signal(srs, SRS_RESTART);
+ mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
+ mac_srs_clear_flag(srs, SRS_RESTART_DONE);
}

/*
- * To avoid potential deadlocks, mac_rx() releases mi_rx_lock
- * before invoking its list of upcalls. This introduces races with
- * mac_rx_remove() and mac_rx_add(), who can potentially modify the
- * upcall list while mi_rx_lock is not being held. The race with
- * mac_rx_remove() is handled by incrementing mi_rx_ref upon entering
- * mac_rx(); a non-zero mi_rx_ref would tell mac_rx_remove()
- * to not modify the list but instead mark an upcall for deletion.
- * before mac_rx() exits, mi_rx_ref is decremented and if it
- * is 0, the marked upcalls will be removed from the list and freed.
- * The race with mac_rx_add() is harmless because mac_rx_add() only
- * prepends to the list and since mac_rx() saves the list head
- * before releasing mi_rx_lock, any prepended upcall won't be seen
- * until the next packet chain arrives.
- *
- * To minimize lock contention between multiple parallel invocations
- * of mac_rx(), mi_rx_lock is acquired as a READER lock. The
- * use of atomic operations ensures the sanity of mi_rx_ref. mi_rx_lock
- * will be upgraded to WRITER mode when there are marked upcalls to be
- * cleaned.
+ * Temporary quiesce of a flow and associated Tx SRS.
+ * Please see block comment above mac_rx_srs_quiesce
 */
-static void
-mac_do_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain,
- boolean_t active_only)
+/* ARGSUSED */
+int
+mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
{
- mac_impl_t *mip = (mac_impl_t *)mh;
- mblk_t *bp = mp_chain;
- mac_rx_fn_t *mrfp;
-
 /*
- * Call all registered receive functions.
+ * The fe_tx_srs is null for a subflow on an interface that is
+ * not plumbed
 */
- rw_enter(&mip->mi_rx_lock, RW_READER);
- if ((mrfp = mip->mi_mrfp) == NULL) {
- /* There are no registered receive functions. */
- freemsgchain(bp);
- rw_exit(&mip->mi_rx_lock);
- return;
- }
- atomic_inc_32(&mip->mi_rx_ref);
- rw_exit(&mip->mi_rx_lock);
+ if (flent->fe_tx_srs != NULL)
+ mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
+ return (0);
+}

+/* ARGSUSED */
+int
+mac_tx_flow_restart(flow_entry_t *flent, void *arg)
+{
 /*
- * Call registered receive functions.
+ * The fe_tx_srs is null for a subflow on an interface that is
+ * not plumbed
 */
- do {
- mblk_t *recv_bp;
-
- if (active_only && !mrfp->mrf_active) {
- mrfp = mrfp->mrf_nextp;
- if (mrfp == NULL) {
- /*
- * We hit the last receiver, but it's not
- * active.
- */
- freemsgchain(bp);
- }
- continue;
- }
-
- recv_bp = (mrfp->mrf_nextp != NULL) ? copymsgchain(bp) : bp;
- if (recv_bp != NULL) {
- if (mrfp->mrf_inuse) {
- /*
- * Send bp itself and keep the copy.
- * If there's only one active receiver, - * it should get the original message, - * tagged with the hardware checksum flags. - */ - mrfp->mrf_fn(mrfp->mrf_arg, mrh, bp); - bp = recv_bp; - } else { - freemsgchain(recv_bp); - } - } - - mrfp = mrfp->mrf_nextp; - } while (mrfp != NULL); + if (flent->fe_tx_srs != NULL) + mac_tx_srs_restart(flent->fe_tx_srs); + return (0); +} - rw_enter(&mip->mi_rx_lock, RW_READER); - if (atomic_dec_32_nv(&mip->mi_rx_ref) == 0 && mip->mi_rx_removed > 0) { - mac_rx_fn_t **pp, *p; - uint32_t cnt = 0; +void +mac_tx_client_quiesce(mac_client_impl_t *mcip, uint_t srs_quiesce_flag) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); - DTRACE_PROBE1(delete_callbacks, mac_impl_t *, mip); + mac_tx_client_block(mcip); + if (MCIP_TX_SRS(mcip) != NULL) { + mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag); + (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_tx_flow_quiesce, NULL); + } +} - /* - * Need to become exclusive before doing cleanup - */ - if (rw_tryupgrade(&mip->mi_rx_lock) == 0) { - rw_exit(&mip->mi_rx_lock); - rw_enter(&mip->mi_rx_lock, RW_WRITER); - } +void +mac_tx_client_restart(mac_client_impl_t *mcip) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); - /* - * We return if another thread has already entered and cleaned - * up the list. - */ - if (mip->mi_rx_ref > 0 || mip->mi_rx_removed == 0) { - rw_exit(&mip->mi_rx_lock); - return; - } + mac_tx_client_unblock(mcip); + if (MCIP_TX_SRS(mcip) != NULL) { + mac_tx_srs_restart(MCIP_TX_SRS(mcip)); + (void) mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_tx_flow_restart, NULL); + } +} - /* - * Free removed callbacks. - */ - pp = &mip->mi_mrfp; - while (*pp != NULL) { - if (!(*pp)->mrf_inuse) { - p = *pp; - *pp = (*pp)->mrf_nextp; - kmem_free(p, sizeof (*p)); - cnt++; - continue; - } - pp = &(*pp)->mrf_nextp; - } +void +mac_tx_client_flush(mac_client_impl_t *mcip) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); - /* - * Wake up mac_rx_remove_wait() - */ - mutex_enter(&mip->mi_lock); - ASSERT(mip->mi_rx_removed == cnt); - mip->mi_rx_removed = 0; - cv_broadcast(&mip->mi_rx_cv); - mutex_exit(&mip->mi_lock); - } - rw_exit(&mip->mi_rx_lock); + mac_tx_client_quiesce(mcip, SRS_QUIESCE); + mac_tx_client_restart(mcip); } void -mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) +mac_client_quiesce(mac_client_impl_t *mcip) { - mac_do_rx(mh, mrh, mp_chain, B_FALSE); + mac_rx_client_quiesce((mac_client_handle_t)mcip); + mac_tx_client_quiesce(mcip, SRS_QUIESCE); } -/* - * Send a packet chain up to the receive callbacks which declared - * themselves as being active. - */ void -mac_active_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp_chain) +mac_client_restart(mac_client_impl_t *mcip) { - mac_do_rx(arg, mrh, mp_chain, B_TRUE); + mac_rx_client_restart((mac_client_handle_t)mcip); + mac_tx_client_restart(mcip); } /* - * Function passed to the active client sharing a VNIC. This function - * is returned by mac_tx_get() when a VNIC is present. It invokes - * the VNIC transmit entry point which was specified by the VNIC when - * it called mac_vnic_set(). The VNIC transmit entry point will - * pass the packets to the local VNICs and/or to the underlying VNICs - * if needed. + * Allocate a minor number. 
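+ * A successful hold must later be balanced by mac_minor_rele() on the
+ * same minor; an illustrative caller:
+ *
+ *	if ((minor = mac_minor_hold(B_TRUE)) == 0)
+ *		return (ENOSPC);
+ *	...
+ *	mac_minor_rele(minor);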
*/ -static mblk_t * -mac_vnic_tx(void *arg, mblk_t *mp) +minor_t +mac_minor_hold(boolean_t sleep) { - mac_impl_t *mip = arg; - mac_txinfo_t *mtfp; - mac_vnic_tx_t *mvt; + minor_t minor; /* - * There is a race between the notification of the VNIC - * addition and removal, and the processing of the VNIC notification - * by the MAC client. During this window, it is possible for - * an active MAC client to contine invoking mac_vnic_tx() while - * the VNIC has already been removed. So we cannot assume - * that mi_vnic_present will always be true when mac_vnic_tx() - * is invoked. + * Grab a value from the arena. */ - rw_enter(&mip->mi_tx_lock, RW_READER); - if (!mip->mi_vnic_present) { - rw_exit(&mip->mi_tx_lock); - freemsgchain(mp); - return (NULL); - } + atomic_add_32(&minor_count, 1); - ASSERT(mip->mi_vnic_tx != NULL); - mvt = mip->mi_vnic_tx; - MAC_VNIC_TXINFO_REFHOLD(mvt); - rw_exit(&mip->mi_tx_lock); + if (sleep) + minor = (uint_t)id_alloc(minor_ids); + else + minor = (uint_t)id_alloc_nosleep(minor_ids); - mtfp = &mvt->mv_txinfo; - mtfp->mt_fn(mtfp->mt_arg, mp); + if (minor == 0) { + atomic_add_32(&minor_count, -1); + return (0); + } - MAC_VNIC_TXINFO_REFRELE(mvt); - return (NULL); + return (minor); } /* - * Transmit function -- ONLY used when there are registered loopback listeners. + * Release a previously allocated minor number. */ -mblk_t * -mac_do_txloop(void *arg, mblk_t *bp, boolean_t call_vnic) +void +mac_minor_rele(minor_t minor) { - mac_impl_t *mip = arg; - mac_txloop_fn_t *mtfp; - mblk_t *loop_bp, *resid_bp, *next_bp; - - if (call_vnic) { - /* - * In promiscous mode, a copy of the sent packet will - * be sent to the client's promiscous receive entry - * points via mac_vnic_tx()-> - * mac_active_rx_promisc()->mac_rx_default(). - */ - return (mac_vnic_tx(arg, bp)); - } - - while (bp != NULL) { - next_bp = bp->b_next; - bp->b_next = NULL; - - if ((loop_bp = copymsg(bp)) == NULL) - goto noresources; - - if ((resid_bp = mip->mi_tx(mip->mi_driver, bp)) != NULL) { - ASSERT(resid_bp == bp); - freemsg(loop_bp); - goto noresources; - } - - rw_enter(&mip->mi_tx_lock, RW_READER); - mtfp = mip->mi_mtfp; - while (mtfp != NULL && loop_bp != NULL) { - bp = loop_bp; - - /* XXX counter bump if copymsg() fails? */ - if (mtfp->mtf_nextp != NULL) - loop_bp = copymsg(bp); - else - loop_bp = NULL; - - mtfp->mtf_fn(mtfp->mtf_arg, bp); - mtfp = mtfp->mtf_nextp; - } - rw_exit(&mip->mi_tx_lock); - - /* - * It's possible we've raced with the disabling of promiscuous - * mode, in which case we can discard our copy. - */ - if (loop_bp != NULL) - freemsg(loop_bp); - - bp = next_bp; - } - - return (NULL); - -noresources: - bp->b_next = next_bp; - return (bp); + /* + * Return the value to the arena. 
+ */ + id_free(minor_ids, minor); + atomic_add_32(&minor_count, -1); } -mblk_t * -mac_txloop(void *arg, mblk_t *bp) +uint32_t +mac_no_notification(mac_handle_t mh) { - return (mac_do_txloop(arg, bp, B_FALSE)); + mac_impl_t *mip = (mac_impl_t *)mh; + return (mip->mi_unsup_note); } -static mblk_t * -mac_vnic_txloop(void *arg, mblk_t *bp) +/* + * Prevent any new opens of this mac in preparation for unregister + */ +int +i_mac_disable(mac_impl_t *mip) { - return (mac_do_txloop(arg, bp, B_TRUE)); -} + mac_client_impl_t *mcip; -void -mac_link_update(mac_handle_t mh, link_state_t link) -{ - mac_impl_t *mip = (mac_impl_t *)mh; + rw_enter(&i_mac_impl_lock, RW_WRITER); + if (mip->mi_state_flags & MIS_DISABLED) { + /* Already disabled, return success */ + rw_exit(&i_mac_impl_lock); + return (0); + } + /* + * See if there are any other references to this mac_t (e.g., VLAN's). + * If so return failure. If all the other checks below pass, then + * set mi_disabled atomically under the i_mac_impl_lock to prevent + * any new VLAN's from being created or new mac client opens of this + * mac end point. + */ + if (mip->mi_ref > 0) { + rw_exit(&i_mac_impl_lock); + return (EBUSY); + } /* - * Save the link state. + * mac clients must delete all multicast groups they join before + * closing. bcast groups are reference counted, the last client + * to delete the group will wait till the group is physically + * deleted. Since all clients have closed this mac end point + * mi_bcast_ngrps must be zero at this point */ - mip->mi_linkstate = link; + ASSERT(mip->mi_bcast_ngrps == 0); /* - * Send a MAC_NOTE_LINK notification. + * Don't let go of this if it has some flows. + * All other code guarantees no flows are added to a disabled + * mac, therefore it is sufficient to check for the flow table + * only here. */ - i_mac_notify(mip, MAC_NOTE_LINK); + mcip = mac_primary_client_handle(mip); + if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) { + rw_exit(&i_mac_impl_lock); + return (ENOTEMPTY); + } + + mip->mi_state_flags |= MIS_DISABLED; + rw_exit(&i_mac_impl_lock); + return (0); } -void -mac_unicst_update(mac_handle_t mh, const uint8_t *addr) +int +mac_disable_nowait(mac_handle_t mh) { mac_impl_t *mip = (mac_impl_t *)mh; + int err; - if (mip->mi_type->mt_addr_length == 0) - return; + if ((err = i_mac_perim_enter_nowait(mip)) != 0) + return (err); + err = i_mac_disable(mip); + i_mac_perim_exit(mip); + return (err); +} - /* - * If the address has not changed, do nothing. - */ - if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) - return; +int +mac_disable(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + int err; - /* - * Save the address. - */ - bcopy(addr, mip->mi_addr, mip->mi_type->mt_addr_length); + i_mac_perim_enter(mip); + err = i_mac_disable(mip); + i_mac_perim_exit(mip); /* - * Send a MAC_NOTE_UNICST notification. + * Clean up notification thread and wait for it to exit. */ - i_mac_notify(mip, MAC_NOTE_UNICST); -} + if (err == 0) + i_mac_notify_exit(mip); -void -mac_tx_update(mac_handle_t mh) -{ - /* - * Send a MAC_NOTE_TX notification. - */ - i_mac_notify((mac_impl_t *)mh, MAC_NOTE_TX); + return (err); } -void -mac_resource_update(mac_handle_t mh) +/* + * Called when the MAC instance has a non empty flow table, to de-multiplex + * incoming packets to the right flow. + * The MAC's rw lock is assumed held as a READER. 
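+ *
+ * For illustration, the return-value contract implemented below: a NULL
+ * return means the packet was delivered to a flow's callback, while a
+ * non-NULL return hands the unclassified packet back to the caller. A
+ * hypothetical caller would do:
+ *
+ *	if ((mp = mac_rx_classify(mip, mrh, mp)) != NULL) {
+ *		... deliver mp through the default receive path ...
+ *	}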
+ */ +/* ARGSUSED */ +static mblk_t * +mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp) { + flow_entry_t *flent = NULL; + uint_t flags = FLOW_INBOUND; + int err; + /* - * Send a MAC_NOTE_RESOURCE notification. + * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN + * to mac_flow_lookup() so that the VLAN packets can be successfully + * passed to the non-VLAN aggregation flows. + * + * Note that there is possibly a race between this and + * mac_unicast_remove/add() and VLAN packets could be incorrectly + * classified to non-VLAN flows of non-aggregation mac clients. These + * VLAN packets will be then filtered out by the mac module. */ - i_mac_notify((mac_impl_t *)mh, MAC_NOTE_RESOURCE); -} - -mac_resource_handle_t -mac_resource_add(mac_handle_t mh, mac_resource_t *mrp) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - mac_resource_handle_t mrh; - mac_resource_add_t add; - void *arg; - - rw_enter(&mip->mi_resource_lock, RW_READER); - add = mip->mi_resource_add; - arg = mip->mi_resource_add_arg; + if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0) + flags |= FLOW_IGNORE_VLAN; - if (add != NULL) - mrh = add(arg, mrp); - else - mrh = NULL; - rw_exit(&mip->mi_resource_lock); + err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent); + if (err != 0) { + /* no registered receive function */ + return (mp); + } else { + mac_client_impl_t *mcip; - return (mrh); + /* + * This flent might just be an additional one on the MAC client, + * i.e. for classification purposes (different fdesc), however + * the resources, SRS et. al., are in the mci_flent, so if + * this isn't the mci_flent, we need to get it. + */ + if ((mcip = flent->fe_mcip) != NULL && + mcip->mci_flent != flent) { + FLOW_REFRELE(flent); + flent = mcip->mci_flent; + FLOW_TRY_REFHOLD(flent, err); + if (err != 0) + return (mp); + } + (flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp, + B_FALSE); + FLOW_REFRELE(flent); + } + return (NULL); } -int -mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize) +mblk_t * +mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) { mac_impl_t *mip = (mac_impl_t *)mh; + mblk_t *bp, *bp1, **bpp, *list = NULL; /* - * Verify that the plugin supports MAC plugin data and that the - * supplied data is valid. + * We walk the chain and attempt to classify each packet. + * The packets that couldn't be classified will be returned + * back to the caller. */ - if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY)) - return (EINVAL); - if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize)) - return (EINVAL); + bp = mp_chain; + bpp = &list; + while (bp != NULL) { + bp1 = bp; + bp = bp->b_next; + bp1->b_next = NULL; - if (mip->mi_pdata != NULL) - kmem_free(mip->mi_pdata, mip->mi_pdata_size); + if (mac_rx_classify(mip, mrh, bp1) != NULL) { + *bpp = bp1; + bpp = &bp1->b_next; + } + } + return (list); +} - mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP); - bcopy(mac_pdata, mip->mi_pdata, dsize); - mip->mi_pdata_size = dsize; +static int +mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg) +{ + mac_ring_handle_t ring = arg; - /* - * Since the MAC plugin data is used to construct MAC headers that - * were cached in fast-path headers, we need to flush fast-path - * information for links associated with this mac. 
- */ - i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH); + if (flent->fe_tx_srs) + mac_tx_srs_wakeup(flent->fe_tx_srs, ring); return (0); } void -mac_multicst_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg, - boolean_t add) +i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring) { - mac_impl_t *mip = (mac_impl_t *)mh; - mac_multicst_addr_t *p; + mac_client_impl_t *cclient; + mac_soft_ring_set_t *mac_srs; /* - * If no specific refresh function was given then default to the - * driver's m_multicst entry point. + * After grabbing the mi_rw_lock, the list of clients can't change. + * If there are any clients mi_disabled must be B_FALSE and can't + * get set since there are clients. If there aren't any clients we + * don't do anything. In any case the mip has to be valid. The driver + * must make sure that it goes single threaded (with respect to mac + * calls) and wait for all pending mac calls to finish before calling + * mac_unregister. */ - if (refresh == NULL) { - refresh = mip->mi_multicst; - arg = mip->mi_driver; + rw_enter(&i_mac_impl_lock, RW_READER); + if (mip->mi_state_flags & MIS_DISABLED) { + rw_exit(&i_mac_impl_lock); + return; } - ASSERT(refresh != NULL); /* - * Walk the multicast address list and call the refresh function for - * each address. + * Get MAC tx srs from walking mac_client_handle list. */ - rw_enter(&(mip->mi_data_lock), RW_READER); - for (p = mip->mi_mmap; p != NULL; p = p->mma_nextp) - refresh(arg, add, p->mma_addr); - rw_exit(&(mip->mi_data_lock)); + rw_enter(&mip->mi_rw_lock, RW_READER); + for (cclient = mip->mi_clients_list; cclient != NULL; + cclient = cclient->mci_client_next) { + if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL) + mac_tx_srs_wakeup(mac_srs, ring); + if (!FLOW_TAB_EMPTY(cclient->mci_subflow_tab)) { + (void) mac_flow_walk_nolock(cclient->mci_subflow_tab, + mac_tx_flow_srs_wakeup, ring); + } + } + rw_exit(&mip->mi_rw_lock); + rw_exit(&i_mac_impl_lock); } +/* ARGSUSED */ void -mac_unicst_refresh(mac_handle_t mh, mac_unicst_t refresh, void *arg) +mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg, + boolean_t add) { - mac_impl_t *mip = (mac_impl_t *)mh; + mac_impl_t *mip = (mac_impl_t *)mh; + + i_mac_perim_enter((mac_impl_t *)mh); /* * If no specific refresh function was given then default to the - * driver's mi_unicst entry point. + * driver's m_multicst entry point. */ if (refresh == NULL) { - refresh = mip->mi_unicst; + refresh = mip->mi_multicst; arg = mip->mi_driver; } - ASSERT(refresh != NULL); - /* - * Call the refresh function with the current unicast address. - */ - refresh(arg, mip->mi_addr); + mac_bcast_refresh(mip, refresh, arg, add); + i_mac_perim_exit((mac_impl_t *)mh); } void @@ -2352,7 +2416,7 @@ mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current) mac_margin_req_t **pp, *p; int err = 0; - rw_enter(&(mip->mi_data_lock), RW_WRITER); + rw_enter(&(mip->mi_rw_lock), RW_WRITER); if (current) *marginp = mip->mi_margin; @@ -2369,7 +2433,7 @@ mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current) * Check whether the given margin is already in the list. If so, * bump the reference count. 
*/ - for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) { + for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) { if (p->mmr_margin == *marginp) { /* * The margin requested is already in the list, @@ -2383,18 +2447,14 @@ mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current) } - if ((p = kmem_zalloc(sizeof (mac_margin_req_t), KM_NOSLEEP)) == NULL) { - err = ENOMEM; - goto done; - } - + p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP); p->mmr_margin = *marginp; p->mmr_ref++; p->mmr_nextp = *pp; *pp = p; done: - rw_exit(&(mip->mi_data_lock)); + rw_exit(&(mip->mi_rw_lock)); return (err); } @@ -2409,7 +2469,7 @@ mac_margin_remove(mac_handle_t mh, uint32_t margin) mac_margin_req_t **pp, *p; int err = 0; - rw_enter(&(mip->mi_data_lock), RW_WRITER); + rw_enter(&(mip->mi_rw_lock), RW_WRITER); /* * Find the entry in the list for the given margin. */ @@ -2442,30 +2502,17 @@ mac_margin_remove(mac_handle_t mh, uint32_t margin) *pp = p->mmr_nextp; kmem_free(p, sizeof (mac_margin_req_t)); done: - rw_exit(&(mip->mi_data_lock)); + rw_exit(&(mip->mi_rw_lock)); return (err); } -/* - * The mac client requests to get the mac's current margin value. - */ -void -mac_margin_get(mac_handle_t mh, uint32_t *marginp) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - rw_enter(&(mip->mi_data_lock), RW_READER); - *marginp = mip->mi_margin; - rw_exit(&(mip->mi_data_lock)); -} - boolean_t mac_margin_update(mac_handle_t mh, uint32_t margin) { mac_impl_t *mip = (mac_impl_t *)mh; uint32_t margin_needed = 0; - rw_enter(&(mip->mi_data_lock), RW_WRITER); + rw_enter(&(mip->mi_rw_lock), RW_WRITER); if (mip->mi_mmrp != NULL) margin_needed = mip->mi_mmrp->mmr_margin; @@ -2473,7 +2520,7 @@ mac_margin_update(mac_handle_t mh, uint32_t margin) if (margin_needed <= margin) mip->mi_margin = margin; - rw_exit(&(mip->mi_data_lock)); + rw_exit(&(mip->mi_rw_lock)); if (margin_needed <= margin) i_mac_notify(mip, MAC_NOTE_MARGIN); @@ -2481,287 +2528,48 @@ mac_margin_update(mac_handle_t mh, uint32_t margin) return (margin_needed <= margin); } -boolean_t -mac_do_active_set(mac_handle_t mh, boolean_t shareable) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - mutex_enter(&mip->mi_activelink_lock); - if (mip->mi_activelink) { - mutex_exit(&mip->mi_activelink_lock); - return (B_FALSE); - } - mip->mi_activelink = B_TRUE; - mip->mi_shareable = shareable; - mutex_exit(&mip->mi_activelink_lock); - return (B_TRUE); -} - /* - * Called by MAC clients. By default, active MAC clients cannot - * share the NIC with VNICs. + * MAC Type Plugin functions. */ -boolean_t -mac_active_set(mac_handle_t mh) -{ - return (mac_do_active_set(mh, B_FALSE)); -} -/* - * Called by MAC clients which can share the NIC with VNICS, e.g. DLS. 
- */ -boolean_t -mac_active_shareable_set(mac_handle_t mh) +mactype_t * +mactype_getplugin(const char *pname) { - return (mac_do_active_set(mh, B_TRUE)); -} - -void -mac_active_clear(mac_handle_t mh) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - mutex_enter(&mip->mi_activelink_lock); - ASSERT(mip->mi_activelink); - mip->mi_activelink = B_FALSE; - mutex_exit(&mip->mi_activelink_lock); -} - -boolean_t -mac_vnic_set(mac_handle_t mh, mac_txinfo_t *tx_info, mac_getcapab_t getcapab_fn, - void *getcapab_arg) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - mac_vnic_tx_t *vnic_tx; + mactype_t *mtype = NULL; + boolean_t tried_modload = B_FALSE; - mutex_enter(&mip->mi_activelink_lock); - rw_enter(&mip->mi_tx_lock, RW_WRITER); - ASSERT(!mip->mi_vnic_present); + mutex_enter(&i_mactype_lock); - if (mip->mi_activelink && !mip->mi_shareable) { +find_registered_mactype: + if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname, + (mod_hash_val_t *)&mtype) != 0) { + if (!tried_modload) { + /* + * If the plugin has not yet been loaded, then + * attempt to load it now. If modload() succeeds, + * the plugin should have registered using + * mactype_register(), in which case we can go back + * and attempt to find it again. + */ + if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) { + tried_modload = B_TRUE; + goto find_registered_mactype; + } + } + } else { /* - * The NIC is already used by an active client which cannot - * share it with VNICs. + * Note that there's no danger that the plugin we've loaded + * could be unloaded between the modload() step and the + * reference count bump here, as we're holding + * i_mactype_lock, which mactype_unregister() also holds. */ - rw_exit(&mip->mi_tx_lock); - mutex_exit(&mip->mi_activelink_lock); - return (B_FALSE); - } - - vnic_tx = kmem_cache_alloc(mac_vnic_tx_cache, KM_SLEEP); - vnic_tx->mv_refs = 0; - vnic_tx->mv_txinfo = *tx_info; - vnic_tx->mv_clearing = B_FALSE; - - mip->mi_vnic_present = B_TRUE; - mip->mi_vnic_tx = vnic_tx; - mip->mi_vnic_getcapab_fn = getcapab_fn; - mip->mi_vnic_getcapab_arg = getcapab_arg; - rw_exit(&mip->mi_tx_lock); - mutex_exit(&mip->mi_activelink_lock); - - i_mac_notify(mip, MAC_NOTE_VNIC); - return (B_TRUE); -} - -void -mac_vnic_clear(mac_handle_t mh) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - mac_vnic_tx_t *vnic_tx; - - rw_enter(&mip->mi_tx_lock, RW_WRITER); - ASSERT(mip->mi_vnic_present); - mip->mi_vnic_present = B_FALSE; - /* - * Setting mi_vnic_tx to NULL here under the lock guarantees - * that no new references to the current VNIC transmit structure - * will be taken by mac_vnic_tx(). This is a necessary condition - * for safely waiting for the reference count to drop to - * zero below. - */ - vnic_tx = mip->mi_vnic_tx; - mip->mi_vnic_tx = NULL; - mip->mi_vnic_getcapab_fn = NULL; - mip->mi_vnic_getcapab_arg = NULL; - rw_exit(&mip->mi_tx_lock); - - i_mac_notify(mip, MAC_NOTE_VNIC); - - /* - * Wait for all TX calls referencing the VNIC transmit - * entry point that was removed to complete. - */ - mutex_enter(&vnic_tx->mv_lock); - vnic_tx->mv_clearing = B_TRUE; - while (vnic_tx->mv_refs > 0) - cv_wait(&vnic_tx->mv_cv, &vnic_tx->mv_lock); - mutex_exit(&vnic_tx->mv_lock); - kmem_cache_free(mac_vnic_tx_cache, vnic_tx); -} - -/* - * mac_info_get() is used for retrieving the mac_info when a DL_INFO_REQ is - * issued before a DL_ATTACH_REQ. we walk the i_mac_impl_hash table and find - * the first mac_impl_t with a matching driver name; then we copy its mac_info_t - * to the caller. 
we do all this with i_mac_impl_lock held so the mac_impl_t - * cannot disappear while we are accessing it. - */ -typedef struct i_mac_info_state_s { - const char *mi_name; - mac_info_t *mi_infop; -} i_mac_info_state_t; - -/*ARGSUSED*/ -static uint_t -i_mac_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) -{ - i_mac_info_state_t *statep = arg; - mac_impl_t *mip = (mac_impl_t *)val; - - if (mip->mi_disabled) - return (MH_WALK_CONTINUE); - - if (strcmp(statep->mi_name, - ddi_driver_name(mip->mi_dip)) != 0) - return (MH_WALK_CONTINUE); - - statep->mi_infop = &mip->mi_info; - return (MH_WALK_TERMINATE); -} - -boolean_t -mac_info_get(const char *name, mac_info_t *minfop) -{ - i_mac_info_state_t state; - - rw_enter(&i_mac_impl_lock, RW_READER); - state.mi_name = name; - state.mi_infop = NULL; - mod_hash_walk(i_mac_impl_hash, i_mac_info_walker, &state); - if (state.mi_infop == NULL) { - rw_exit(&i_mac_impl_lock); - return (B_FALSE); - } - *minfop = *state.mi_infop; - rw_exit(&i_mac_impl_lock); - return (B_TRUE); -} - -boolean_t -mac_do_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data, - boolean_t is_vnic) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - if (!is_vnic) { - rw_enter(&mip->mi_tx_lock, RW_READER); - if (mip->mi_vnic_present) { - boolean_t rv; - - rv = mip->mi_vnic_getcapab_fn(mip->mi_vnic_getcapab_arg, - cap, cap_data); - rw_exit(&mip->mi_tx_lock); - return (rv); - } - rw_exit(&mip->mi_tx_lock); - } - - if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) - return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); - else - return (B_FALSE); -} - -boolean_t -mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) -{ - return (mac_do_capab_get(mh, cap, cap_data, B_FALSE)); -} - -boolean_t -mac_vnic_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) -{ - return (mac_do_capab_get(mh, cap, cap_data, B_TRUE)); -} - -boolean_t -mac_sap_verify(mac_handle_t mh, uint32_t sap, uint32_t *bind_sap) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_type->mt_ops.mtops_sap_verify(sap, bind_sap, - mip->mi_pdata)); -} - -mblk_t * -mac_header(mac_handle_t mh, const uint8_t *daddr, uint32_t sap, mblk_t *payload, - size_t extra_len) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_type->mt_ops.mtops_header(mip->mi_addr, daddr, sap, - mip->mi_pdata, payload, extra_len)); -} - -int -mac_header_info(mac_handle_t mh, mblk_t *mp, mac_header_info_t *mhip) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - return (mip->mi_type->mt_ops.mtops_header_info(mp, mip->mi_pdata, - mhip)); -} - -mblk_t * -mac_header_cook(mac_handle_t mh, mblk_t *mp) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_COOK) { - if (DB_REF(mp) > 1) { - mblk_t *newmp = copymsg(mp); - if (newmp == NULL) - return (NULL); - freemsg(mp); - mp = newmp; - } - return (mip->mi_type->mt_ops.mtops_header_cook(mp, - mip->mi_pdata)); - } - return (mp); -} - -mblk_t * -mac_header_uncook(mac_handle_t mh, mblk_t *mp) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_UNCOOK) { - if (DB_REF(mp) > 1) { - mblk_t *newmp = copymsg(mp); - if (newmp == NULL) - return (NULL); - freemsg(mp); - mp = newmp; - } - return (mip->mi_type->mt_ops.mtops_header_uncook(mp, - mip->mi_pdata)); + atomic_inc_32(&mtype->mt_ref); } - return (mp); -} - -void -mac_init_ops(struct dev_ops *ops, const char *name) -{ - dld_init_ops(ops, name); -} -void -mac_fini_ops(struct dev_ops *ops) -{ - dld_fini_ops(ops); + mutex_exit(&i_mactype_lock); + return (mtype); 
} -/* - * MAC Type Plugin functions. - */ - mactype_register_t * mactype_alloc(uint_t mactype_version) { @@ -2878,19 +2686,70 @@ done: return (err); } +/* + * Returns TRUE when the specified property is intended for the MAC framework, + * as opposed to driver defined properties. + */ +static boolean_t +mac_is_macprop(mac_prop_t *macprop) +{ + switch (macprop->mp_id) { + case MAC_PROP_MAXBW: + case MAC_PROP_PRIO: + case MAC_PROP_BIND_CPU: + return (B_TRUE); + default: + return (B_FALSE); + } +} + +/* + * mac_set_prop() sets mac or hardware driver properties: + * mac properties include maxbw, priority, and cpu binding list. Driver + * properties are private properties to the hardware, such as mtu, speed + * etc. + * If the property is a driver property, mac_set_prop() calls driver's callback + * function to set it. + * If the property is a mac property, mac_set_prop() invokes mac_set_resources() + * which will cache the property value in mac_impl_t and may call + * mac_client_set_resource() to update property value of the primary mac client, + * if it exists. + */ int mac_set_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize) { int err = ENOTSUP; mac_impl_t *mip = (mac_impl_t *)mh; + ASSERT(MAC_PERIM_HELD(mh)); + + /* If it is mac property, call mac_set_resources() */ + if (mac_is_macprop(macprop)) { + mac_resource_props_t mrp; + + if (valsize < sizeof (mac_resource_props_t)) + return (EINVAL); + bzero(&mrp, sizeof (mac_resource_props_t)); + bcopy(val, &mrp, sizeof (mrp)); + return (mac_set_resources(mh, &mrp)); + } + /* For driver properties, call driver's callback */ if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) { err = mip->mi_callbacks->mc_setprop(mip->mi_driver, macprop->mp_name, macprop->mp_id, valsize, val); } + return (err); } +/* + * mac_get_prop() gets mac or hardware driver properties. + * + * If the property is a driver property, mac_get_prop() calls driver's callback + * function to get it. + * If the property is a mac property, mac_get_prop() invokes mac_get_resources() + * which returns the cached value in mac_impl_t. + */ int mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize, uint_t *perm) @@ -2900,6 +2759,18 @@ mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize, uint32_t sdu; link_state_t link_state; + /* If mac property, read from cache */ + if (mac_is_macprop(macprop)) { + mac_resource_props_t mrp; + + if (valsize < sizeof (mac_resource_props_t)) + return (EINVAL); + bzero(&mrp, sizeof (mac_resource_props_t)); + mac_get_resources(mh, &mrp); + bcopy(&mrp, val, sizeof (mac_resource_props_t)); + return (0); + } + switch (macprop->mp_id) { case MAC_PROP_MTU: if (valsize < sizeof (sdu)) @@ -2932,7 +2803,9 @@ mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize, return (0); default: break; + } + /* If driver property, request from driver */ if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) { err = mip->mi_callbacks->mc_getprop(mip->mi_driver, macprop->mp_name, macprop->mp_id, macprop->mp_flags, @@ -2941,21 +2814,7 @@ mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize, return (err); } -int -mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max) -{ - mac_impl_t *mip = (mac_impl_t *)mh; - - if (sdu_max <= mip->mi_sdu_min) - return (EINVAL); - mip->mi_sdu_max = sdu_max; - - /* Send a MAC_NOTE_SDU_SIZE notification. 
*/
-	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
-	return (0);
-}
-
-static void
+void
 mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
 {
 	mac_priv_prop_t *mpriv;
@@ -2969,7 +2828,7 @@ mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
 	mip->mi_priv_prop_count = nprop;
 }
 
-static void
+void
 mac_unregister_priv_prop(mac_impl_t *mip)
 {
 	mac_priv_prop_t	*mpriv;
@@ -2981,3 +2840,2283 @@ mac_unregister_priv_prop(mac_impl_t *mip)
 	}
 	mip->mi_priv_prop_count = 0;
 }
+
+/*
+ * mac_ring_t 'mr' macros. Some rogue drivers may access the ring structure
+ * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
+ * cases, if MAC frees the ring structure after mac_stop_ring(), any
+ * illegal access to the ring structure coming from the driver will panic
+ * the system. In order to protect the system from such inadvertent access,
+ * we maintain a cache of rings in the mac_impl_t after they get freed.
+ * When packets are received on freed rings, MAC (through the generation
+ * count mechanism) will drop such packets.
+ */
+static mac_ring_t *
+mac_ring_alloc(mac_impl_t *mip, mac_capab_rings_t *cap_rings)
+{
+	mac_ring_t *ring;
+
+	if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
+		mutex_enter(&mip->mi_ring_lock);
+		if (mip->mi_ring_freelist != NULL) {
+			ring = mip->mi_ring_freelist;
+			mip->mi_ring_freelist = ring->mr_next;
+			bzero(ring, sizeof (mac_ring_t));
+		} else {
+			ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
+		}
+		mutex_exit(&mip->mi_ring_lock);
+	} else {
+		ring = kmem_zalloc(sizeof (mac_ring_t), KM_SLEEP);
+	}
+	ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
+	return (ring);
+}
+
+static void
+mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
+{
+	if (ring->mr_type == MAC_RING_TYPE_RX) {
+		mutex_enter(&mip->mi_ring_lock);
+		ring->mr_state = MR_FREE;
+		ring->mr_flag = 0;
+		ring->mr_next = mip->mi_ring_freelist;
+		mip->mi_ring_freelist = ring;
+		mutex_exit(&mip->mi_ring_lock);
+	} else {
+		kmem_free(ring, sizeof (mac_ring_t));
+	}
+}
+
+static void
+mac_ring_freeall(mac_impl_t *mip)
+{
+	mac_ring_t *ring, *ring_next;
+
+	mutex_enter(&mip->mi_ring_lock);
+	ring = mip->mi_ring_freelist;
+	while (ring != NULL) {
+		ring_next = ring->mr_next;
+		kmem_cache_free(mac_ring_cache, ring);
+		ring = ring_next;
+	}
+	mip->mi_ring_freelist = NULL;
+	mutex_exit(&mip->mi_ring_lock);
+}
+
+int
+mac_start_ring(mac_ring_t *ring)
+{
+	int rv = 0;
+
+	if (ring->mr_start != NULL)
+		rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
+
+	return (rv);
+}
+
+void
+mac_stop_ring(mac_ring_t *ring)
+{
+	if (ring->mr_stop != NULL)
+		ring->mr_stop(ring->mr_driver);
+
+	/*
+	 * Increment the ring generation number for this ring.
+	 */
+	ring->mr_gen_num++;
+}
+
+int
+mac_start_group(mac_group_t *group)
+{
+	int rv = 0;
+
+	if (group->mrg_start != NULL)
+		rv = group->mrg_start(group->mrg_driver);
+
+	return (rv);
+}
+
+void
+mac_stop_group(mac_group_t *group)
+{
+	if (group->mrg_stop != NULL)
+		group->mrg_stop(group->mrg_driver);
+}
+
+/*
+ * Called from mac_start() on the default Rx group. Broadcast and multicast
+ * packets are received only on the default group. Hence the default group
+ * needs to be up even if the primary client is not up, for the other groups
+ * to be functional. We do this by calling this function at mac_start time
+ * itself. However, the broadcast packets that are received can't make their
+ * way beyond mac_rx until a mac client creates a broadcast flow.
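+ *
+ * As an aside on the generation count mentioned further above: the intent
+ * is that a driver captures the generation number passed to its mr_start
+ * entry point and tags received packets with it. A minimal sketch, with
+ * hypothetical xx_* driver names (the argument types are assumptions
+ * based on the mr_start call in mac_start_ring() above):
+ *
+ *	static int
+ *	xx_ring_start(mac_ring_driver_t arg, uint64_t gen_num)
+ *	{
+ *		((xx_rx_ring_t *)arg)->xr_gen_num = gen_num;
+ *		return (0);
+ *	}
+ *
+ * so that packets later received on a ring that has since been stopped
+ * carry a stale generation number and can be dropped by the MAC layer.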
+ */ +static int +mac_start_group_and_rings(mac_group_t *group) +{ + mac_ring_t *ring; + int rv = 0; + + ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED); + if ((rv = mac_start_group(group)) != 0) + return (rv); + + for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { + ASSERT(ring->mr_state == MR_FREE); + if ((rv = mac_start_ring(ring)) != 0) + goto error; + ring->mr_state = MR_INUSE; + ring->mr_classify_type = MAC_SW_CLASSIFIER; + } + return (0); + +error: + mac_stop_group_and_rings(group); + return (rv); +} + +/* Called from mac_stop on the default Rx group */ +static void +mac_stop_group_and_rings(mac_group_t *group) +{ + mac_ring_t *ring; + + for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { + if (ring->mr_state != MR_FREE) { + mac_stop_ring(ring); + ring->mr_state = MR_FREE; + ring->mr_flag = 0; + ring->mr_classify_type = MAC_NO_CLASSIFIER; + } + } + mac_stop_group(group); +} + + +static mac_ring_t * +mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index, + mac_capab_rings_t *cap_rings) +{ + mac_ring_t *ring; + mac_ring_info_t ring_info; + + ring = mac_ring_alloc(mip, cap_rings); + + /* Prepare basic information of ring */ + ring->mr_index = index; + ring->mr_type = group->mrg_type; + ring->mr_gh = (mac_group_handle_t)group; + + /* Insert the new ring to the list. */ + ring->mr_next = group->mrg_rings; + group->mrg_rings = ring; + + /* Zero to reuse the info data structure */ + bzero(&ring_info, sizeof (ring_info)); + + /* Query ring information from driver */ + cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index, + index, &ring_info, (mac_ring_handle_t)ring); + + ring->mr_info = ring_info; + + /* Update ring's status */ + ring->mr_state = MR_FREE; + ring->mr_flag = 0; + + /* Update the ring count of the group */ + group->mrg_cur_count++; + return (ring); +} + +/* + * Rings are chained together for easy regrouping. + */ +static void +mac_init_group(mac_impl_t *mip, mac_group_t *group, int size, + mac_capab_rings_t *cap_rings) +{ + int index; + + /* + * Initialize all ring members of this group. Size of zero will not + * enter the loop, so it's safe for initializing an empty group. + */ + for (index = size - 1; index >= 0; index--) + (void) mac_init_ring(mip, group, index, cap_rings); +} + +int +mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype) +{ + mac_capab_rings_t *cap_rings; + mac_group_t *group, *groups; + mac_group_info_t group_info; + uint_t group_free = 0; + uint_t ring_left; + mac_ring_t *ring; + int g, err = 0; + + switch (rtype) { + case MAC_RING_TYPE_RX: + ASSERT(mip->mi_rx_groups == NULL); + + cap_rings = &mip->mi_rx_rings_cap; + cap_rings->mr_type = MAC_RING_TYPE_RX; + break; + case MAC_RING_TYPE_TX: + ASSERT(mip->mi_tx_groups == NULL); + + cap_rings = &mip->mi_tx_rings_cap; + cap_rings->mr_type = MAC_RING_TYPE_TX; + break; + default: + ASSERT(B_FALSE); + } + + if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS, + cap_rings)) + return (0); + + /* + * Allocate a contiguous buffer for all groups. + */ + groups = kmem_zalloc(sizeof (mac_group_t) * (cap_rings->mr_gnum + 1), + KM_SLEEP); + + ring_left = cap_rings->mr_rnum; + + /* + * Get all ring groups if any, and get their ring members + * if any. 
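+	 *
+	 * For illustration, the driver's mr_gget() callback is expected to
+	 * fill in the mac_group_info_t it is handed. A minimal sketch, with
+	 * hypothetical xx_* driver names (the signature mirrors the mr_gget
+	 * call below; the mac_group_driver_t cast is an assumption):
+	 *
+	 *	static void
+	 *	xx_group_get(void *arg, mac_ring_type_t rtype, int g_index,
+	 *	    mac_group_info_t *infop, mac_group_handle_t gh)
+	 *	{
+	 *		xx_group_t *grp = &((xx_t *)arg)->xx_groups[g_index];
+	 *
+	 *		grp->xg_gh = gh;
+	 *		infop->mgi_driver = (mac_group_driver_t)grp;
+	 *		infop->mgi_count = XX_RINGS_PER_GROUP;
+	 *		infop->mgi_addmac = xx_group_addmac;
+	 *		infop->mgi_remmac = xx_group_remmac;
+	 *	}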
+	 */
+	for (g = 0; g < cap_rings->mr_gnum; g++) {
+		group = groups + g;
+
+		/* Prepare basic information of the group */
+		group->mrg_index = g;
+		group->mrg_type = rtype;
+		group->mrg_state = MAC_GROUP_STATE_UNINIT;
+		group->mrg_mh = (mac_handle_t)mip;
+		group->mrg_next = group + 1;
+
+		/* Zero to reuse the info data structure */
+		bzero(&group_info, sizeof (group_info));
+
+		/* Query group information from driver */
+		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
+		    (mac_group_handle_t)group);
+
+		switch (cap_rings->mr_group_type) {
+		case MAC_GROUP_TYPE_DYNAMIC:
+			if (cap_rings->mr_gaddring == NULL ||
+			    cap_rings->mr_gremring == NULL) {
+				DTRACE_PROBE3(
+				    mac__init__rings_no_addremring,
+				    char *, mip->mi_name,
+				    mac_group_add_ring_t,
+				    cap_rings->mr_gaddring,
+				    mac_group_add_ring_t,
+				    cap_rings->mr_gremring);
+				err = EINVAL;
+				goto bail;
+			}
+
+			switch (rtype) {
+			case MAC_RING_TYPE_RX:
+				/*
+				 * The first RX group must have non-zero
+				 * rings, and the following groups must
+				 * have zero rings.
+				 */
+				if (g == 0 && group_info.mgi_count == 0) {
+					DTRACE_PROBE1(
+					    mac__init__rings__rx__def__zero,
+					    char *, mip->mi_name);
+					err = EINVAL;
+					goto bail;
+				}
+				if (g > 0 && group_info.mgi_count != 0) {
+					DTRACE_PROBE3(
+					    mac__init__rings__rx__nonzero,
+					    char *, mip->mi_name,
+					    int, g, int, group_info.mgi_count);
+					err = EINVAL;
+					goto bail;
+				}
+				break;
+			case MAC_RING_TYPE_TX:
+				/*
+				 * All TX ring groups must have zero rings.
+				 */
+				if (group_info.mgi_count != 0) {
+					DTRACE_PROBE3(
+					    mac__init__rings__tx__nonzero,
+					    char *, mip->mi_name,
+					    int, g, int, group_info.mgi_count);
+					err = EINVAL;
+					goto bail;
+				}
+				break;
+			}
+			break;
+		case MAC_GROUP_TYPE_STATIC:
+			/*
+			 * Note that an empty group is allowed, e.g., an aggr
+			 * would start with an empty group.
+			 */
+			break;
+		default:
+			/* unknown group type */
+			DTRACE_PROBE2(mac__init__rings__unknown__type,
+			    char *, mip->mi_name,
+			    int, cap_rings->mr_group_type);
+			err = EINVAL;
+			goto bail;
+		}
+
+		/*
+		 * The driver must register group_info.mgi_addmac() and
+		 * mgi_remmac() for Rx groups to support multiple MAC
+		 * addresses.
+		 */
+		if (rtype == MAC_RING_TYPE_RX) {
+			if ((group_info.mgi_addmac == NULL) ||
+			    (group_info.mgi_remmac == NULL)) {
+				err = EINVAL;
+				goto bail;
+			}
+		}
+
+		/* Cache driver-supplied information */
+		group->mrg_info = group_info;
+
+		/* Update the group's status and group count. */
+		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
+		group_free++;
+
+		group->mrg_rings = NULL;
+		group->mrg_cur_count = 0;
+		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
+		ring_left -= group_info.mgi_count;
+
+		/* The current group size should be equal to the default value */
+		ASSERT(group->mrg_cur_count == group_info.mgi_count);
+	}
+
+	/* Build up a dummy group for free resources as a pool */
+	group = groups + cap_rings->mr_gnum;
+
+	/* Prepare basic information of the group */
+	group->mrg_index = -1;
+	group->mrg_type = rtype;
+	group->mrg_state = MAC_GROUP_STATE_UNINIT;
+	group->mrg_mh = (mac_handle_t)mip;
+	group->mrg_next = NULL;
+
+	/*
+	 * If there are ungrouped rings, allocate a contiguous buffer for
+	 * the remaining resources.
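+	 *
+	 * For illustration (hypothetical counts): with mr_gnum == 2 and
+	 * mr_rnum == 8, groups[0] and groups[1] hold the driver-declared
+	 * groups, and groups[2] is the dummy pool group (mrg_index == -1)
+	 * that collects whatever rings the declared groups did not claim.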
+	 */
+	if (ring_left != 0) {
+		group->mrg_rings = NULL;
+		group->mrg_cur_count = 0;
+		mac_init_group(mip, group, ring_left, cap_rings);
+
+		/* The current group size should be equal to ring_left */
+		ASSERT(group->mrg_cur_count == ring_left);
+
+		ring_left = 0;
+
+		/* Update this group's status */
+		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
+	} else
+		group->mrg_rings = NULL;
+
+	ASSERT(ring_left == 0);
+
+bail:
+	/* Cache other important information to finalize the initialization */
+	switch (rtype) {
+	case MAC_RING_TYPE_RX:
+		mip->mi_rx_group_type = cap_rings->mr_group_type;
+		mip->mi_rx_group_count = cap_rings->mr_gnum;
+		mip->mi_rx_groups = groups;
+		break;
+	case MAC_RING_TYPE_TX:
+		mip->mi_tx_group_type = cap_rings->mr_group_type;
+		mip->mi_tx_group_count = cap_rings->mr_gnum;
+		mip->mi_tx_group_free = group_free;
+		mip->mi_tx_groups = groups;
+
+		/*
+		 * Ring 0 is used as the default one and it could be assigned
+		 * to a client as well.
+		 */
+		group = groups + cap_rings->mr_gnum;
+		ring = group->mrg_rings;
+		while ((ring->mr_index != 0) && (ring->mr_next != NULL))
+			ring = ring->mr_next;
+		ASSERT(ring->mr_index == 0);
+		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	if (err != 0)
+		mac_free_rings(mip, rtype);
+
+	return (err);
+}
+
+/*
+ * Called to free all ring groups of a particular type. It is assumed that
+ * all groups have already been released by the client.
+ */
+void
+mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
+{
+	mac_group_t *group, *groups;
+	uint_t group_count;
+
+	switch (rtype) {
+	case MAC_RING_TYPE_RX:
+		if (mip->mi_rx_groups == NULL)
+			return;
+
+		groups = mip->mi_rx_groups;
+		group_count = mip->mi_rx_group_count;
+
+		mip->mi_rx_groups = NULL;
+		mip->mi_rx_group_count = 0;
+		break;
+	case MAC_RING_TYPE_TX:
+		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
+
+		if (mip->mi_tx_groups == NULL)
+			return;
+
+		groups = mip->mi_tx_groups;
+		group_count = mip->mi_tx_group_count;
+
+		mip->mi_tx_groups = NULL;
+		mip->mi_tx_group_count = 0;
+		mip->mi_tx_group_free = 0;
+		mip->mi_default_tx_ring = NULL;
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	for (group = groups; group != NULL; group = group->mrg_next) {
+		mac_ring_t *ring;
+
+		if (group->mrg_cur_count == 0)
+			continue;
+
+		ASSERT(group->mrg_rings != NULL);
+
+		while ((ring = group->mrg_rings) != NULL) {
+			group->mrg_rings = ring->mr_next;
+			mac_ring_free(mip, ring);
+		}
+	}
+
+	/* Free all the cached rings */
+	mac_ring_freeall(mip);
+	/* Free the block of group data structures */
+	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
+}
+
+/*
+ * Associate a MAC address with a receive group.
+ *
+ * The return value of this function should always be checked properly, because
+ * any type of failure could cause unexpected results. A MAC address can be
+ * added to or removed from a group only after the group has been reserved.
+ * Ideally, a successful reservation always leads to calling mac_group_addmac()
+ * to steer desired traffic. Failure to add a unicast MAC address doesn't
+ * always imply that the group is functioning abnormally.
+ *
+ * Currently this function is called everywhere, and it reflects assumptions
+ * about MAC addresses in the implementation. CR 6735196.
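+ *
+ * A hypothetical caller, for illustration of the check-the-return-value
+ * contract (mac_add_macaddr() below follows this pattern):
+ *
+ *	if ((err = mac_group_addmac(group, addr)) != 0) {
+ *		... fall back, e.g. to promiscuous mode, rather than
+ *		    assuming the address is active ...
+ *	}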
+ */
+int
+mac_group_addmac(mac_group_t *group, const uint8_t *addr)
+{
+	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
+	ASSERT(group->mrg_info.mgi_addmac != NULL);
+
+	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
+}
+
+/*
+ * Remove the association between MAC address and receive group.
+ */
+int
+mac_group_remmac(mac_group_t *group, const uint8_t *addr)
+{
+	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
+	ASSERT(group->mrg_info.mgi_remmac != NULL);
+
+	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
+}
+
+/*
+ * Release a ring in use by marking it MR_FREE.
+ * Any other client may reserve it for its use.
+ */
+void
+mac_release_tx_ring(mac_ring_handle_t rh)
+{
+	mac_ring_t *ring = (mac_ring_t *)rh;
+	mac_group_t *group = (mac_group_t *)ring->mr_gh;
+	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(ring->mr_state != MR_FREE);
+
+	/*
+	 * The default tx ring will be released by mac_stop().
+	 */
+	if (rh == mip->mi_default_tx_ring)
+		return;
+
+	mac_stop_ring(ring);
+
+	ring->mr_state = MR_FREE;
+	ring->mr_flag = 0;
+}
+
+/*
+ * Send packets through a selected tx ring.
+ */
+mblk_t *
+mac_ring_tx(mac_ring_handle_t rh, mblk_t *mp)
+{
+	mac_ring_t *ring = (mac_ring_t *)rh;
+	mac_ring_info_t *info = &ring->mr_info;
+
+	ASSERT(ring->mr_type == MAC_RING_TYPE_TX);
+	ASSERT(ring->mr_state >= MR_INUSE);
+	ASSERT(info->mri_tx != NULL);
+
+	return (info->mri_tx(info->mri_driver, mp));
+}
+
+/*
+ * Find a ring from its index.
+ */
+mac_ring_t *
+mac_find_ring(mac_group_t *group, int index)
+{
+	mac_ring_t *ring;
+
+	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
+		if (ring->mr_index == index)
+			break;
+
+	return (ring);
+}
+
+/*
+ * Add a ring to an existing group.
+ *
+ * The ring must be either passed directly (for example if the ring
+ * movement is initiated by the framework), or specified through a driver
+ * index (for example when the ring is added by the driver).
+ *
+ * The caller needs to call mac_perim_enter() before calling this function.
+ */
+int
+i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
+{
+	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+	mac_capab_rings_t *cap_rings;
+	boolean_t driver_call = (ring == NULL);
+	mac_group_type_t group_type;
+	int ret = 0;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	switch (group->mrg_type) {
+	case MAC_RING_TYPE_RX:
+		cap_rings = &mip->mi_rx_rings_cap;
+		group_type = mip->mi_rx_group_type;
+		break;
+	case MAC_RING_TYPE_TX:
+		cap_rings = &mip->mi_tx_rings_cap;
+		group_type = mip->mi_tx_group_type;
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	/*
+	 * There should be no ring with the same ring index in the target
+	 * group.
+	 */
+	ASSERT(mac_find_ring(group, driver_call ? index : ring->mr_index) ==
+	    NULL);
+
+	if (driver_call) {
+		/*
+		 * The function is called as a result of a request from
+		 * a driver to add a ring to an existing group, for example
+		 * from the aggregation driver. Allocate a new mac_ring_t
+		 * for that ring.
+		 */
+		ring = mac_init_ring(mip, group, index, cap_rings);
+		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
+	} else {
+		/*
+		 * The function is called as a result of a MAC layer request
+		 * to add a ring to an existing group. In this case the
+		 * ring is being moved between groups, which requires
+		 * the underlying driver to support dynamic grouping,
+		 * and the mac_ring_t already exists.
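+		 *
+		 * For illustration, the two call modes look like this
+		 * (hypothetical values):
+		 *
+		 *	(void) i_mac_group_add_ring(group, NULL, 2);
+		 *		(driver-initiated: no mac_ring_t yet, the
+		 *		ring is identified by its driver index)
+		 *
+		 *	(void) i_mac_group_add_ring(group, ring, 0);
+		 *		(framework-initiated move: the mac_ring_t
+		 *		exists and the index argument is unused)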
+		 */
+		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
+		ASSERT(cap_rings->mr_gaddring != NULL);
+		ASSERT(ring->mr_gh == NULL);
+	}
+
+	/*
+	 * At this point the ring should not be in use, and it should be
+	 * of the right type for the target group.
+	 */
+	ASSERT(ring->mr_state < MR_INUSE);
+	ASSERT(ring->mr_srs == NULL);
+	ASSERT(ring->mr_type == group->mrg_type);
+
+	if (!driver_call) {
+		/*
+		 * Add the driver-level hardware ring if the process was not
+		 * initiated by the driver, and the target group has a
+		 * driver-level counterpart (the dummy group for ungrouped
+		 * rings does not).
+		 */
+		if (group->mrg_driver != NULL) {
+			cap_rings->mr_gaddring(group->mrg_driver,
+			    ring->mr_driver, ring->mr_type);
+		}
+
+		/*
+		 * Insert the ring ahead of the existing rings.
+		 */
+		ring->mr_next = group->mrg_rings;
+		group->mrg_rings = ring;
+		ring->mr_gh = (mac_group_handle_t)group;
+		group->mrg_cur_count++;
+	}
+
+	/*
+	 * If the group has not been actively used, we're done.
+	 */
+	if (group->mrg_index != -1 &&
+	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
+		return (0);
+
+	/*
+	 * Set up SRS/SR according to the ring type.
+	 */
+	switch (ring->mr_type) {
+	case MAC_RING_TYPE_RX:
+		/*
+		 * Set up an SRS on top of the new ring if the group is
+		 * reserved for someone's exclusive use.
+		 */
+		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
+			flow_entry_t *flent;
+			mac_client_impl_t *mcip;
+
+			mcip = MAC_RX_GROUP_ONLY_CLIENT(group);
+			ASSERT(mcip != NULL);
+			flent = mcip->mci_flent;
+			ASSERT(flent->fe_rx_srs_cnt > 0);
+			mac_srs_group_setup(mcip, flent, group, SRST_LINK);
+		}
+		break;
+	case MAC_RING_TYPE_TX:
+		/*
+		 * For TX this function is only invoked during the
+		 * initial creation of a group when a share is
+		 * associated with a MAC client. So the datapath is not
+		 * yet set up, and will be set up later after the
+		 * group has been reserved and populated.
+		 */
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	/*
+	 * Start the ring if needed. On failure, undo the grouping action.
+	 */
+	if ((ret = mac_start_ring(ring)) != 0) {
+		if (ring->mr_type == MAC_RING_TYPE_RX) {
+			if (ring->mr_srs != NULL) {
+				mac_rx_srs_remove(ring->mr_srs);
+				ring->mr_srs = NULL;
+			}
+		}
+		if (!driver_call) {
+			cap_rings->mr_gremring(group->mrg_driver,
+			    ring->mr_driver, ring->mr_type);
+		}
+		group->mrg_cur_count--;
+		group->mrg_rings = ring->mr_next;
+
+		ring->mr_gh = NULL;
+
+		if (driver_call)
+			mac_ring_free(mip, ring);
+
+		return (ret);
+	}
+
+	/*
+	 * Update the ring's state.
+	 */
+	ring->mr_state = MR_INUSE;
+	MAC_RING_UNMARK(ring, MR_INCIPIENT);
+	return (0);
+}
+
+/*
+ * Remove a ring from its current group. MAC internal function for dynamic
+ * grouping.
+ *
+ * The caller needs to call mac_perim_enter() before calling this function.
+ */
+void
+i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
+    boolean_t driver_call)
+{
+	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
+	mac_capab_rings_t *cap_rings = NULL;
+	mac_group_type_t group_type;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	ASSERT(mac_find_ring(group, ring->mr_index) == ring);
+	ASSERT((mac_group_t *)ring->mr_gh == group);
+	ASSERT(ring->mr_type == group->mrg_type);
+
+	switch (ring->mr_type) {
+	case MAC_RING_TYPE_RX:
+		group_type = mip->mi_rx_group_type;
+		cap_rings = &mip->mi_rx_rings_cap;
+
+		if (group->mrg_state >= MAC_GROUP_STATE_RESERVED)
+			mac_stop_ring(ring);
+
+		/*
+		 * Only hardware classified packets hold a reference to the
+		 * ring all the way up the Rx path. mac_rx_srs_remove()
+		 * will take care of quiescing the Rx path and removing the
+		 * SRS. The software classified path neither holds a reference
+		 * nor any association with the ring in mac_rx.
+		 */
+		if (ring->mr_srs != NULL) {
+			mac_rx_srs_remove(ring->mr_srs);
+			ring->mr_srs = NULL;
+		}
+		ring->mr_state = MR_FREE;
+		ring->mr_flag = 0;
+
+		break;
+	case MAC_RING_TYPE_TX:
+		/*
+		 * For TX this function is only invoked in two
+		 * cases:
+		 *
+		 * 1) In the case of a failure during the
+		 * initial creation of a group when a share is
+		 * associated with a MAC client. So the SRS is not
+		 * yet set up, and will be set up later after the
+		 * group has been reserved and populated.
+		 *
+		 * 2) From mac_release_tx_group() when freeing
+		 * a TX SRS.
+		 *
+		 * In both cases the SRS and its soft rings are
+		 * already quiesced.
+		 */
+		ASSERT(!driver_call);
+		group_type = mip->mi_tx_group_type;
+		cap_rings = &mip->mi_tx_rings_cap;
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	/*
+	 * Remove the ring from the group.
+	 */
+	if (ring == group->mrg_rings)
+		group->mrg_rings = ring->mr_next;
+	else {
+		mac_ring_t *pre;
+
+		pre = group->mrg_rings;
+		while (pre->mr_next != ring)
+			pre = pre->mr_next;
+		pre->mr_next = ring->mr_next;
+	}
+	group->mrg_cur_count--;
+
+	if (!driver_call) {
+		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
+		ASSERT(cap_rings->mr_gremring != NULL);
+
+		/*
+		 * Remove the driver-level hardware ring.
+		 */
+		if (group->mrg_driver != NULL) {
+			cap_rings->mr_gremring(group->mrg_driver,
+			    ring->mr_driver, ring->mr_type);
+		}
+	}
+
+	ring->mr_gh = NULL;
+	if (driver_call) {
+		mac_ring_free(mip, ring);
+	} else {
+		ring->mr_state = MR_FREE;
+		ring->mr_flag = 0;
+	}
+}
+
+/*
+ * Move a ring to the target group. If needed, remove the ring from the group
+ * that it currently belongs to.
+ *
+ * The caller needs to enter the MAC's perimeter by calling mac_perim_enter().
+ */
+static int
+mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
+{
+	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
+	int rv;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(d_group != NULL);
+	ASSERT(s_group->mrg_mh == d_group->mrg_mh);
+
+	if (s_group == d_group)
+		return (0);
+
+	/*
+	 * Remove it from the current group first.
+	 */
+	if (s_group != NULL)
+		i_mac_group_rem_ring(s_group, ring, B_FALSE);
+
+	/*
+	 * Add it to the new group.
+	 */
+	rv = i_mac_group_add_ring(d_group, ring, 0);
+	if (rv != 0) {
+		/*
+		 * Failed to add the ring to the destination group;
+		 * try to put it back in the source group. If that
+		 * also fails, the ring is stuck in limbo, so log a message.
+		 */
+		if (i_mac_group_add_ring(s_group, ring, 0)) {
+			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
+			    mip->mi_name, (void *)ring);
+		}
+	}
+
+	return (rv);
+}
+
+/*
+ * Find a MAC address according to its value.
+ */
+mac_address_t *
+mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
+{
+	mac_address_t *map;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
+		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
+			break;
+	}
+
+	return (map);
+}
+
+/*
+ * Check whether the MAC address is shared by multiple clients.
+ */
+boolean_t
+mac_check_macaddr_shared(mac_address_t *map)
+{
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
+
+	return (map->ma_nusers > 1);
+}
+
+/*
+ * Enable a MAC address by enabling promiscuous mode.
+ */
+static int
+mac_add_macaddr_promisc(mac_impl_t *mip, mac_group_t *group)
+{
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	/*
+	 * The current interface only allows promiscuous mode to be set
+	 * with the default group. Note that mip->mi_rx_groups might be NULL.
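+	 *
+	 * For illustration of the contract: any group other than the
+	 * default one is rejected with ENOTSUP, so a hypothetical caller
+	 * passes the default group (possibly NULL):
+	 *
+	 *	err = mac_add_macaddr_promisc(mip, mip->mi_rx_groups);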
+	 */
+	ASSERT(group == mip->mi_rx_groups);
+
+	if (group == mip->mi_rx_groups)
+		return (i_mac_promisc_set(mip, B_TRUE, MAC_DEVPROMISC));
+	else
+		return (ENOTSUP);
+}
+
+/*
+ * Remove a MAC address that was added by enabling promiscuous mode.
+ */
+static int
+mac_remove_macaddr_promisc(mac_impl_t *mip, mac_group_t *group)
+{
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(group == mip->mi_rx_groups);
+
+	return (i_mac_promisc_set(mip, B_FALSE, MAC_DEVPROMISC));
+}
+
+/*
+ * Remove the specified MAC address from the MAC address list and free it.
+ */
+static void
+mac_free_macaddr(mac_address_t *map)
+{
+	mac_impl_t *mip = map->ma_mip;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(mip->mi_addresses != NULL);
+
+	map = mac_find_macaddr(mip, map->ma_addr);
+
+	ASSERT(map != NULL);
+	ASSERT(map->ma_nusers == 0);
+
+	if (map == mip->mi_addresses) {
+		mip->mi_addresses = map->ma_next;
+	} else {
+		mac_address_t *pre;
+
+		pre = mip->mi_addresses;
+		while (pre->ma_next != map)
+			pre = pre->ma_next;
+		pre->ma_next = map->ma_next;
+	}
+
+	kmem_free(map, sizeof (mac_address_t));
+}
+
+/*
+ * Add a MAC address reference for a client. If the desired MAC address
+ * exists, add a reference to it. Otherwise, add the new address by adding
+ * it to a reserved group or by setting promiscuous mode. This function
+ * won't try a different group if the given group is non-NULL, so the caller
+ * must explicitly share the default group when needed.
+ *
+ * Note that the primary MAC address is initialized at registration time, so
+ * adding it to the default group only requires activating it if its reference
+ * count is still zero. Also, some drivers may not have advertised the RINGS
+ * capability.
+ */
+int
+mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr)
+{
+	mac_address_t *map;
+	int err = 0;
+	boolean_t allocated_map = B_FALSE;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	map = mac_find_macaddr(mip, mac_addr);
+
+	/*
+	 * If the new MAC address has not been added yet, allocate a new
+	 * entry and set it up.
+	 */
+	if (map == NULL) {
+		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
+		map->ma_len = mip->mi_type->mt_addr_length;
+		bcopy(mac_addr, map->ma_addr, map->ma_len);
+		map->ma_nusers = 0;
+		map->ma_group = group;
+		map->ma_mip = mip;
+
+		/* add the new MAC address to the head of the address list */
+		map->ma_next = mip->mi_addresses;
+		mip->mi_addresses = map;
+
+		allocated_map = B_TRUE;
+	}
+
+	ASSERT(map->ma_group == group);
+
+	/*
+	 * If the MAC address is already in use, simply account for the
+	 * new client.
+	 */
+	if (map->ma_nusers++ > 0)
+		return (0);
+
+	/*
+	 * Activate this MAC address by adding it to the reserved group.
+	 */
+	if (group != NULL) {
+		err = mac_group_addmac(group, (const uint8_t *)mac_addr);
+		if (err == 0) {
+			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+			return (0);
+		}
+	}
+
+	/*
+	 * Try promiscuous mode. Note that rx_groups could be NULL, so we
+	 * need to handle drivers that don't advertise the RINGS capability.
+	 */
+	if (group == mip->mi_rx_groups) {
+		/*
+		 * For drivers that don't advertise the RINGS capability, do
+		 * nothing for the primary address.
+		 */
+		if ((group == NULL) &&
+		    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
+			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+			return (0);
+		}
+
+		/*
+		 * Enable promiscuous mode in order to receive traffic
+		 * to the new MAC address.
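+		 * On success the address is marked
+		 * MAC_ADDRESS_TYPE_UNICAST_PROMISC rather than
+		 * _CLASSIFIED, so that mac_remove_macaddr() below knows
+		 * to undo the promiscuous setting instead of calling
+		 * mac_group_remmac().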
+		 */
+		err = mac_add_macaddr_promisc(mip, group);
+		if (err == 0) {
+			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
+			return (0);
+		}
+	}
+
+	/*
+	 * Free the MAC address that could not be added. Don't free
+	 * a pre-existing address, it could have been the entry
+	 * for the primary MAC address which was pre-allocated by
+	 * mac_init_macaddr(), and which must remain on the list.
+	 */
+	map->ma_nusers--;
+	if (allocated_map)
+		mac_free_macaddr(map);
+	return (err);
+}
+
+/*
+ * Remove a reference to a MAC address. This may cause the MAC address to be
+ * removed from its associated group, or promiscuous mode to be turned off.
+ * The caller needs to handle any failure properly.
+ */
+int
+mac_remove_macaddr(mac_address_t *map)
+{
+	mac_impl_t *mip = map->ma_mip;
+	int err = 0;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
+
+	/*
+	 * If it's not the last client using this MAC address, only update
+	 * the MAC client count.
+	 */
+	if (--map->ma_nusers > 0)
+		return (0);
+
+	/*
+	 * The MAC address is no longer used by any MAC client, so remove
+	 * it from its associated group, or turn off promiscuous mode
+	 * if it was enabled for the MAC address.
+	 */
+	switch (map->ma_type) {
+	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
+		/*
+		 * Don't free the preset primary address for drivers that
+		 * don't advertise the RINGS capability.
+		 */
+		if (map->ma_group == NULL)
+			return (0);
+
+		err = mac_group_remmac(map->ma_group, map->ma_addr);
+		break;
+	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
+		err = mac_remove_macaddr_promisc(mip, map->ma_group);
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	if (err != 0)
+		return (err);
+
+	/*
+	 * We created the entry for the primary MAC address at registration,
+	 * so we don't free it here. mac_fini_macaddr() will take care of it.
+	 */
+	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
+		mac_free_macaddr(map);
+
+	return (0);
+}
+
+/*
+ * Update an existing MAC address. The caller needs to make sure that the new
+ * value is not currently in use.
+ */
+int
+mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
+{
+	mac_impl_t *mip = map->ma_mip;
+	int err = 0;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
+
+	switch (map->ma_type) {
+	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
+		/*
+		 * Update the primary address for drivers that are not
+		 * RINGS capable.
+		 */
+		if (map->ma_group == NULL) {
+			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
+			    mac_addr);
+			if (err != 0)
+				return (err);
+			break;
+		}
+
+		/*
+		 * If this MAC address is not currently in use,
+		 * simply break out and update the value.
+		 */
+		if (map->ma_nusers == 0)
+			break;
+
+		/*
+		 * Need to replace the MAC address associated with a group.
+		 */
+		err = mac_group_remmac(map->ma_group, map->ma_addr);
+		if (err != 0)
+			return (err);
+
+		err = mac_group_addmac(map->ma_group, mac_addr);
+
+		/*
+		 * Failure hints at a hardware error; the MAC layer needs
+		 * an error notification facility to handle this.
+		 * For now, simply try to restore the old value.
+		 */
+		if (err != 0)
+			(void) mac_group_addmac(map->ma_group, map->ma_addr);
+
+		break;
+	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
+		/*
+		 * Nothing more needs to be done in promiscuous mode.
+		 */
+		break;
+	default:
+		ASSERT(B_FALSE);
+	}
+
+	/*
+	 * Successfully replaced the MAC address.
+	 */
+	if (err == 0)
+		bcopy(mac_addr, map->ma_addr, map->ma_len);
+
+	return (err);
+}
+
+/*
+ * Freshen the MAC address with a new value. Its caller must have updated
+ * the hardware MAC address before calling this function.
+ * This function is meant to handle the MAC address change notification
+ * from the underlying drivers.
+ */
+void
+mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
+{
+	mac_impl_t *mip = map->ma_mip;
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
+
+	/*
+	 * Freshen the MAC address with the new value.
+	 */
+	bcopy(mac_addr, map->ma_addr, map->ma_len);
+	bcopy(mac_addr, mip->mi_addr, map->ma_len);
+
+	/*
+	 * Update all MAC clients that share this MAC address.
+	 */
+	mac_unicast_update_clients(mip, map);
+}
+
+/*
+ * Set up the primary MAC address.
+ */
+void
+mac_init_macaddr(mac_impl_t *mip)
+{
+	mac_address_t *map;
+
+	/*
+	 * The reference count is initialized to zero, until the address
+	 * is really activated.
+	 */
+	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
+	map->ma_len = mip->mi_type->mt_addr_length;
+	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
+
+	/*
+	 * If the driver advertises the RINGS capability, it shouldn't have
+	 * initialized its primary MAC address. For other drivers, including
+	 * VNIC, the primary address must work after registration.
+	 */
+	if (mip->mi_rx_groups == NULL)
+		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
+
+	/*
+	 * The primary MAC address is reserved for the default group
+	 * according to the current design.
+	 */
+	map->ma_group = mip->mi_rx_groups;
+	map->ma_mip = mip;
+
+	mip->mi_addresses = map;
+}
+
+/*
+ * Clean up the primary MAC address. Note, only one primary MAC address
+ * is allowed. All other MAC addresses must have been freed appropriately.
+ */
+void
+mac_fini_macaddr(mac_impl_t *mip)
+{
+	mac_address_t *map = mip->mi_addresses;
+
+	/* there should be exactly one entry left on the list */
+	ASSERT(map != NULL);
+	ASSERT(map->ma_nusers == 0);
+	ASSERT(map->ma_next == NULL);
+
+	kmem_free(map, sizeof (mac_address_t));
+	mip->mi_addresses = NULL;
+}
+
+/*
+ * Logging related functions.
+ */
+
+/* Write the flow description to the log file */
+int
+mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
+{
+	flow_desc_t *fdesc;
+	mac_resource_props_t *mrp;
+	net_desc_t ndesc;
+
+	bzero(&ndesc, sizeof (net_desc_t));
+
+	/*
+	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
+	 * Updates to the fe_flow_desc are done under the fe_lock.
+	 */
+	mutex_enter(&flent->fe_lock);
+	fdesc = &flent->fe_flow_desc;
+	mrp = &flent->fe_resource_props;
+
+	ndesc.nd_name = flent->fe_flow_name;
+	ndesc.nd_devname = mcip->mci_name;
+	bcopy(fdesc->fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
+	bcopy(fdesc->fd_dst_mac, ndesc.nd_edest, ETHERADDRL);
+	ndesc.nd_sap = htonl(fdesc->fd_sap);
+	ndesc.nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
+	ndesc.nd_bw_limit = mrp->mrp_maxbw;
+	if (ndesc.nd_isv4) {
+		ndesc.nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
+		ndesc.nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
+	} else {
+		bcopy(&fdesc->fd_local_addr, ndesc.nd_saddr, IPV6_ADDR_LEN);
+		bcopy(&fdesc->fd_remote_addr, ndesc.nd_daddr, IPV6_ADDR_LEN);
+	}
+	ndesc.nd_sport = htons(fdesc->fd_local_port);
+	ndesc.nd_dport = htons(fdesc->fd_remote_port);
+	ndesc.nd_protocol = (uint8_t)fdesc->fd_protocol;
+	mutex_exit(&flent->fe_lock);
+
+	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_FLDESC_REC));
+}
+
+/* Write the flow statistics to the log file */
+int
+mac_write_flow_stats(flow_entry_t *flent)
+{
+	flow_stats_t *fl_stats;
+	net_stat_t nstat;
+
+	fl_stats = &flent->fe_flowstats;
+	nstat.ns_name = flent->fe_flow_name;
+	nstat.ns_ibytes = fl_stats->fs_rbytes;
+	nstat.ns_obytes = fl_stats->fs_obytes;
+	nstat.ns_ipackets = fl_stats->fs_ipackets;
+	nstat.ns_opackets = fl_stats->fs_opackets;
+	nstat.ns_ierrors = fl_stats->fs_ierrors;
+	nstat.ns_oerrors = fl_stats->fs_oerrors;
+
+	return (exacct_commit_netinfo((void *)&nstat, EX_NET_FLSTAT_REC));
+}
+
+/* Write the link description to the log file */
+int
+mac_write_link_desc(mac_client_impl_t *mcip)
+{
+	net_desc_t ndesc;
+	flow_entry_t *flent = mcip->mci_flent;
+
+	bzero(&ndesc, sizeof (net_desc_t));
+
+	ndesc.nd_name = mcip->mci_name;
+	ndesc.nd_devname = mcip->mci_name;
+	ndesc.nd_isv4 = B_TRUE;
+	/*
+	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
+	 * Updates to the fe_flow_desc are done under the fe_lock
+	 * after removing the flent from the flow table.
+	 */
+	mutex_enter(&flent->fe_lock);
+	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
+	mutex_exit(&flent->fe_lock);
+
+	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_LNDESC_REC));
+}
+
+/* Write the link statistics to the log file */
+int
+mac_write_link_stats(mac_client_impl_t *mcip)
+{
+	net_stat_t nstat;
+
+	nstat.ns_name = mcip->mci_name;
+	nstat.ns_ibytes = mcip->mci_stat_ibytes;
+	nstat.ns_obytes = mcip->mci_stat_obytes;
+	nstat.ns_ipackets = mcip->mci_stat_ipackets;
+	nstat.ns_opackets = mcip->mci_stat_opackets;
+	nstat.ns_ierrors = mcip->mci_stat_ierrors;
+	nstat.ns_oerrors = mcip->mci_stat_oerrors;
+
+	return (exacct_commit_netinfo((void *)&nstat, EX_NET_LNSTAT_REC));
+}
+
+/*
+ * For a given flow, if the description has not been logged before, do it now.
+ * If it is a VNIC, then we have collected information about it from the MAC
+ * table, so skip it.
+ */
+/*ARGSUSED*/
+static int
+mac_log_flowinfo(flow_entry_t *flent, void *args)
+{
+	mac_client_impl_t *mcip = flent->fe_mcip;
+
+	if (mcip == NULL)
+		return (0);
+
+	/*
+	 * If the name starts with "vnic" and fe_user_generated is true (this
+	 * excludes the mcast and active flow entries created implicitly for
+	 * a vnic), it is a VNIC flow; i.e., vnic1 is a VNIC flow, while
+	 * vnic/bge1/mcast1 and vnic/bge1/active are not.
+     */
+    if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
+        (flent->fe_type & FLOW_USER) != 0) {
+        return (0);
+    }
+
+    if (!flent->fe_desc_logged) {
+        /*
+         * We don't return an error because we want to continue the
+         * walk in case this is the last walk, which means we
+         * need to reset fe_desc_logged in all the flows.
+         */
+        if (mac_write_flow_desc(flent, mcip) != 0)
+            return (0);
+        flent->fe_desc_logged = B_TRUE;
+    }
+
+    /*
+     * Regardless of the error, we want to proceed in case we have to
+     * reset fe_desc_logged.
+     */
+    (void) mac_write_flow_stats(flent);
+
+    if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
+        flent->fe_desc_logged = B_FALSE;
+
+    return (0);
+}
+
+typedef struct i_mac_log_state_s {
+    boolean_t mi_last;
+    int mi_fenable;
+    int mi_lenable;
+} i_mac_log_state_t;
+
+/*
+ * Walk the mac_impl_ts and log the description for each mac client of this
+ * mac, if it hasn't already been done. Additionally, log statistics for the
+ * link as well. Walk the flow table and log information for each flow as
+ * well.
+ * If it is the last walk (mi_last), then we turn off MCIS_DESC_LOGGED (and
+ * also fe_desc_logged, if flow logging is on) since we want to log the
+ * description if and when logging is restarted.
+ */
+/*ARGSUSED*/
+static uint_t
+i_mac_log_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+{
+    mac_impl_t *mip = (mac_impl_t *)val;
+    i_mac_log_state_t *lstate = (i_mac_log_state_t *)arg;
+    int ret;
+    mac_client_impl_t *mcip;
+
+    /*
+     * Only walk the client list for NIC and etherstub
+     */
+    if ((mip->mi_state_flags & MIS_DISABLED) ||
+        ((mip->mi_state_flags & MIS_IS_VNIC) &&
+        (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL)))
+        return (MH_WALK_CONTINUE);
+
+    for (mcip = mip->mi_clients_list; mcip != NULL;
+        mcip = mcip->mci_client_next) {
+        if (!MCIP_DATAPATH_SETUP(mcip))
+            continue;
+        if (lstate->mi_lenable) {
+            if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
+                ret = mac_write_link_desc(mcip);
+                if (ret != 0) {
+                    /*
+                     * We can't terminate it if this is the last
+                     * walk, else there might be some links with
+                     * MCIS_DESC_LOGGED still set, which means
+                     * their description won't be logged the next
+                     * time logging is started (similarly for the
+                     * flows within such links). We can continue
+                     * without walking the flow table (i.e. to
+                     * set fe_desc_logged to false) because we
+                     * won't have written any flow stuff for this
+                     * link as we haven't logged the link itself.
+                     */
+                    if (lstate->mi_last)
+                        return (MH_WALK_CONTINUE);
+                    else
+                        return (MH_WALK_TERMINATE);
+                }
+                mcip->mci_state_flags |= MCIS_DESC_LOGGED;
+            }
+        }
+
+        if (mac_write_link_stats(mcip) != 0 && !lstate->mi_last)
+            return (MH_WALK_TERMINATE);
+
+        if (lstate->mi_last)
+            mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
+
+        if (lstate->mi_fenable) {
+            if (mcip->mci_subflow_tab != NULL) {
+                (void) mac_flow_walk(mcip->mci_subflow_tab,
+                    mac_log_flowinfo, mip);
+            }
+        }
+    }
+    return (MH_WALK_CONTINUE);
+}
+
+/*
+ * The timer thread that runs every mac_logging_interval seconds and logs
+ * link and/or flow information.
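+ *
+ * A sketch of how logging is driven end to end, using only functions
+ * defined in this file (the 20 second interval is illustrative):
+ *
+ *     mac_start_logusage(MAC_LOGTYPE_LINK, 20);   first walk, arms timer
+ *     ...                                         periodic walks, rearmed
+ *                                                 via timeout(9F) below
+ *     mac_stop_logusage(MAC_LOGTYPE_LINK);        final walk with mi_last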
+ */
+/* ARGSUSED */
+void
+mac_log_linkinfo(void *arg)
+{
+    i_mac_log_state_t lstate;
+
+    rw_enter(&i_mac_impl_lock, RW_READER);
+    if (!mac_flow_log_enable && !mac_link_log_enable) {
+        rw_exit(&i_mac_impl_lock);
+        return;
+    }
+    lstate.mi_fenable = mac_flow_log_enable;
+    lstate.mi_lenable = mac_link_log_enable;
+    lstate.mi_last = B_FALSE;
+    rw_exit(&i_mac_impl_lock);
+
+    mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
+
+    rw_enter(&i_mac_impl_lock, RW_WRITER);
+    if (mac_flow_log_enable || mac_link_log_enable) {
+        mac_logging_timer = timeout(mac_log_linkinfo, NULL,
+            SEC_TO_TICK(mac_logging_interval));
+    }
+    rw_exit(&i_mac_impl_lock);
+}
+
+/*
+ * Start the logging timer.
+ */
+void
+mac_start_logusage(mac_logtype_t type, uint_t interval)
+{
+    rw_enter(&i_mac_impl_lock, RW_WRITER);
+    switch (type) {
+    case MAC_LOGTYPE_FLOW:
+        if (mac_flow_log_enable) {
+            rw_exit(&i_mac_impl_lock);
+            return;
+        }
+        mac_flow_log_enable = B_TRUE;
+        /* FALLTHRU */
+    case MAC_LOGTYPE_LINK:
+        if (mac_link_log_enable) {
+            rw_exit(&i_mac_impl_lock);
+            return;
+        }
+        mac_link_log_enable = B_TRUE;
+        break;
+    default:
+        ASSERT(0);
+    }
+    mac_logging_interval = interval;
+    rw_exit(&i_mac_impl_lock);
+    mac_log_linkinfo(NULL);
+}
+
+/*
+ * Stop the logging timer if both Link and Flow logging are turned off.
+ */
+void
+mac_stop_logusage(mac_logtype_t type)
+{
+    i_mac_log_state_t lstate;
+
+    rw_enter(&i_mac_impl_lock, RW_WRITER);
+    lstate.mi_fenable = mac_flow_log_enable;
+    lstate.mi_lenable = mac_link_log_enable;
+
+    /* Last walk */
+    lstate.mi_last = B_TRUE;
+
+    switch (type) {
+    case MAC_LOGTYPE_FLOW:
+        if (lstate.mi_fenable) {
+            ASSERT(mac_link_log_enable);
+            mac_flow_log_enable = B_FALSE;
+            mac_link_log_enable = B_FALSE;
+            break;
+        }
+        /* FALLTHRU */
+    case MAC_LOGTYPE_LINK:
+        if (!lstate.mi_lenable || mac_flow_log_enable) {
+            rw_exit(&i_mac_impl_lock);
+            return;
+        }
+        mac_link_log_enable = B_FALSE;
+        break;
+    default:
+        ASSERT(0);
+    }
+    rw_exit(&i_mac_impl_lock);
+    (void) untimeout(mac_logging_timer);
+    mac_logging_timer = 0;
+
+    /* Last walk */
+    mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
+}
+
+/*
+ * Walk the rx and tx SRS/SRs for a flow and update the priority value.
+ */
+void
+mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
+{
+    pri_t pri;
+    int count;
+    mac_soft_ring_set_t *mac_srs;
+
+    if (flent->fe_rx_srs_cnt <= 0)
+        return;
+
+    if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
+        SRST_FLOW) {
+        pri = FLOW_PRIORITY(mcip->mci_min_pri,
+            mcip->mci_max_pri,
+            flent->fe_resource_props.mrp_priority);
+    } else {
+        pri = mcip->mci_max_pri;
+    }
+
+    for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
+        mac_srs = flent->fe_rx_srs[count];
+        mac_update_srs_priority(mac_srs, pri);
+    }
+    /*
+     * If we have a Tx SRS, we need to modify all the threads associated
+     * with it.
+     */
+    if (flent->fe_tx_srs != NULL)
+        mac_update_srs_priority(flent->fe_tx_srs, pri);
+}
+
+/*
+ * RX and TX rings are reserved according to different semantics depending
+ * on the requests from the MAC clients and type of rings:
+ *
+ * On the Tx side, by default we reserve individual rings, independently from
+ * the groups.
+ *
+ * On the Rx side, the reservation is at the granularity of the group
+ * of rings, and used for v12n level 1 only. It has a special case for the
+ * primary client.
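+ * For example, the unicast flow of a non-primary client first tries a
+ * non-default group (MAC_RX_RESERVE_NONDEFAULT in mac_reserve_rx_group()
+ * below), while the primary client reserves the default group
+ * (MAC_RX_RESERVE_DEFAULT).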
+ *
+ * If a share is allocated to a MAC client, we allocate a TX group and an
+ * RX group to the client, and assign TX rings and RX rings to these
+ * groups according to information gathered from the driver through
+ * the share capability.
+ *
+ * The foreseeable evolution of Rx rings will handle v12n level 2 and higher
+ * to allocate individual rings out of a group and program the hw classifier
+ * based on IP address or higher level criteria.
+ */
+
+/*
+ * mac_reserve_tx_ring()
+ * Reserve an unused ring by marking it with the MR_INUSE state.
+ * Once reserved, the ring is ready to function.
+ *
+ * Notes for Hybrid I/O:
+ *
+ * If a specific ring is needed, it is specified through the desired_ring
+ * argument. Otherwise that argument is set to NULL.
+ * If the desired ring was previously allocated to another client, this
+ * function swaps it with a new ring from the group of unassigned rings.
+ */
+mac_ring_t *
+mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
+{
+    mac_group_t *group;
+    mac_ring_t *ring;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    if (mip->mi_tx_groups == NULL)
+        return (NULL);
+
+    /*
+     * Find an available ring and start it before changing its status.
+     * The unassigned rings are at the end of the mi_tx_groups
+     * array.
+     */
+    group = mip->mi_tx_groups + mip->mi_tx_group_count;
+
+    for (ring = group->mrg_rings; ring != NULL;
+        ring = ring->mr_next) {
+        if (desired_ring == NULL) {
+            if (ring->mr_state == MR_FREE)
+                /* wanted any free ring and found one */
+                break;
+        } else {
+            mac_ring_t *sring;
+            mac_client_impl_t *client;
+            mac_soft_ring_set_t *srs;
+
+            if (ring != desired_ring)
+                /* wants a desired ring but this one ain't it */
+                continue;
+
+            if (ring->mr_state == MR_FREE)
+                break;
+
+            /*
+             * Found the desired ring but it's already in use.
+             * Swap it with a new ring.
+             */
+
+            /* find the client which owns that ring */
+            for (client = mip->mi_clients_list; client != NULL;
+                client = client->mci_client_next) {
+                srs = MCIP_TX_SRS(client);
+                if (srs != NULL && mac_tx_srs_ring_present(srs,
+                    desired_ring)) {
+                    /* found our ring */
+                    break;
+                }
+            }
+            ASSERT(client != NULL);
+
+            /*
+             * Note that we cannot simply invoke the group
+             * add/rem routines since the client doesn't have a
+             * TX group. So we need to instead add/remove
+             * the rings from the SRS.
+             */
+            ASSERT(client->mci_share == NULL);
+
+            /* first quiesce the client */
+            mac_tx_client_quiesce(client, SRS_QUIESCE);
+
+            /* give a new ring to the client... */
+            sring = mac_reserve_tx_ring(mip, NULL);
+            /*
+             * If sring is NULL, there is no other available
+             * ring on that MAC instance, and the client will
+             * fall back to the shared TX ring.
+             *
+             * XXX if the user required the client
+             * to have a hardware transmit ring,
+             * we need to ensure we don't remove
+             * the last ring from the client.
+             * In that case look for a replacement
+             * ring from a client which does not
+             * require a hardware ring; we could
+             * add an argument to
+             * mac_reserve_tx_ring() which causes
+             * it to take a ring from such a client
+             * even if the desired ring is NULL.
+             * This will have to be done as part
+             * of the fix for CR 6758935. If that still
+             * fails, i.e. if all rings are allocated
+             * to clients which require rings, then
+             * cleanly fail the operation.
+             */
+            if (sring != NULL)
+                mac_tx_srs_add_ring(srs, sring);
+
+            /* ... in exchange for our desired ring */
+            mac_tx_srs_del_ring(srs, desired_ring);
+
+            /* restart the client */
+            mac_tx_client_restart(client);
+
+            break;
+        }
+    }
+
+    if (ring != NULL) {
+        if (mac_start_ring(ring) != 0)
+            return (NULL);
+        ring->mr_state = MR_INUSE;
+    }
+
+    return (ring);
+}
+
+/*
+ * Minimum number of rings to leave in the default RX group when allocating
+ * rings to new clients.
+ */
+static uint_t mac_min_rx_default_rings = 1;
+
+/*
+ * Populate a zero-ring group with rings. If the share is non-NULL,
+ * the rings are chosen according to that share.
+ * Invoked after allocating a new RX or TX group through
+ * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
+ * Returns zero on success, an errno otherwise.
+ */
+int
+i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
+    mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share)
+{
+    mac_ring_t **rings, *tmp_ring[1], *ring;
+    uint_t nrings;
+    int rv, i, j;
+
+    ASSERT(mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC &&
+        mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
+    ASSERT(new_group->mrg_cur_count == 0);
+
+    /*
+     * First find the rings to allocate to the group.
+     */
+    if (share != NULL) {
+        /* get rings through ms_squery() */
+        mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
+        ASSERT(nrings != 0);
+        rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
+            KM_SLEEP);
+        mip->mi_share_capab.ms_squery(share, ring_type,
+            (mac_ring_handle_t *)rings, &nrings);
+    } else {
+        /* this function is called for TX only with a share */
+        ASSERT(ring_type == MAC_RING_TYPE_RX);
+        /*
+         * Pick one ring from the default group.
+         *
+         * For now pick the second ring, which requires the first
+         * ring at index 0 to stay in the default group, since it
+         * is the ring which carries the multicast traffic.
+         * We need a better way for a driver to indicate this,
+         * for example a per-ring flag.
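+         * (Hypothetically, something like an MR_MCAST per-ring flag
+         * set by the driver; no such flag exists in this patch.)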
+         */
+        for (ring = src_group->mrg_rings; ring != NULL;
+            ring = ring->mr_next) {
+            if (ring->mr_index != 0)
+                break;
+        }
+        ASSERT(ring != NULL);
+        nrings = 1;
+        tmp_ring[0] = ring;
+        rings = tmp_ring;
+    }
+
+    switch (ring_type) {
+    case MAC_RING_TYPE_RX:
+        if (src_group->mrg_cur_count - nrings <
+            mac_min_rx_default_rings) {
+            /* we ran out of rings */
+            return (ENOSPC);
+        }
+
+        /* move receive rings to new group */
+        for (i = 0; i < nrings; i++) {
+            rv = mac_group_mov_ring(mip, new_group, rings[i]);
+            if (rv != 0) {
+                /* move rings back on failure */
+                for (j = 0; j < i; j++) {
+                    (void) mac_group_mov_ring(mip,
+                        src_group, rings[j]);
+                }
+                return (rv);
+            }
+        }
+        break;
+
+    case MAC_RING_TYPE_TX: {
+        mac_ring_t *tmp_ring;
+
+        /* move the TX rings to the new group */
+        ASSERT(src_group == NULL);
+        for (i = 0; i < nrings; i++) {
+            /* get the desired ring */
+            tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
+            ASSERT(tmp_ring == rings[i]);
+            rv = mac_group_mov_ring(mip, new_group, rings[i]);
+            if (rv != 0) {
+                /* cleanup on failure */
+                for (j = 0; j < i; j++) {
+                    (void) mac_group_mov_ring(mip,
+                        mip->mi_tx_groups +
+                        mip->mi_tx_group_count, rings[j]);
+                }
+            }
+        }
+        break;
+    }
+    }
+
+    if (share != NULL) {
+        /* add group to share */
+        mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
+        /* free temporary array of rings */
+        kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
+    }
+
+    return (0);
+}
+
+void
+mac_rx_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
+{
+    mac_grp_client_t *mgcp;
+
+    for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
+        if (mgcp->mgc_client == mcip)
+            break;
+    }
+
+    VERIFY(mgcp == NULL);
+
+    mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
+    mgcp->mgc_client = mcip;
+    mgcp->mgc_next = grp->mrg_clients;
+    grp->mrg_clients = mgcp;
+}
+
+void
+mac_rx_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
+{
+    mac_grp_client_t *mgcp, **pprev;
+
+    for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
+        pprev = &mgcp->mgc_next, mgcp = *pprev) {
+        if (mgcp->mgc_client == mcip)
+            break;
+    }
+
+    ASSERT(mgcp != NULL);
+
+    *pprev = mgcp->mgc_next;
+    kmem_free(mgcp, sizeof (mac_grp_client_t));
+}
+
+/*
+ * mac_reserve_rx_group()
+ *
+ * Finds an available group and exclusively reserves it for a client.
+ * The group is chosen to suit the flow's resource controls (bandwidth and
+ * fanout requirements) and the address type.
+ * If the requestor is the primary MAC then return the group with the
+ * largest number of rings, otherwise the default group when available.
+ */
+mac_group_t *
+mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr,
+    mac_rx_group_reserve_type_t rtype)
+{
+    mac_share_handle_t share = mcip->mci_share;
+    mac_impl_t *mip = mcip->mci_mip;
+    mac_group_t *grp = NULL;
+    int i, start, loopcount;
+    int err;
+    mac_address_t *map;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    /* Check if a group already has this mac address (case of VLANs) */
+    if ((map = mac_find_macaddr(mip, mac_addr)) != NULL)
+        return (map->ma_group);
+
+    if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0 ||
+        rtype == MAC_RX_NO_RESERVE)
+        return (NULL);
+
+    /*
+     * Try to exclusively reserve an RX group.
+     *
+     * Flows requiring SW_RING always go to the default group
+     * (until we can explicitly call out default groups (CR 6695600),
+     * we assume that the default group is always at position zero).
+     *
+     * For flows requiring HW_DEFAULT_RING (unicast flow of the primary
+     * client), try to reserve the default RX group only.
+     *
+     * For flows requiring HW_RING (unicast flow of other clients), try
+     * to reserve a non-default RX group first, then the default group.
+     */
+    switch (rtype) {
+    case MAC_RX_RESERVE_DEFAULT:
+        start = 0;
+        loopcount = 1;
+        break;
+    case MAC_RX_RESERVE_NONDEFAULT:
+        start = 1;
+        loopcount = mip->mi_rx_group_count;
+    }
+
+    for (i = start; i < start + loopcount; i++) {
+        grp = &mip->mi_rx_groups[i % mip->mi_rx_group_count];
+
+        DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
+            int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
+
+        /*
+         * Check to see whether this mac client is the only client
+         * on this RX group. If not, we cannot exclusively reserve
+         * this RX group.
+         */
+        if (!MAC_RX_GROUP_NO_CLIENT(grp) &&
+            (MAC_RX_GROUP_ONLY_CLIENT(grp) != mcip)) {
+            continue;
+        }
+
+        /*
+         * This group could already be SHARED by other multicast
+         * flows on this client. In that case, the group would
+         * be shared and has already been started.
+         */
+        ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
+
+        if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
+            (mac_start_group(grp) != 0)) {
+            continue;
+        }
+
+        if ((i % mip->mi_rx_group_count) == 0 ||
+            mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
+            break;
+        }
+
+        ASSERT(grp->mrg_cur_count == 0);
+
+        /*
+         * Populate the group. Rings should be taken
+         * from the default group at position 0 for now.
+         */
+
+        err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
+            &mip->mi_rx_groups[0], grp, share);
+        if (err == 0)
+            break;
+
+        DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
+            mip->mi_name, int, grp->mrg_index, int, err);
+
+        /*
+         * It's a dynamic group but the grouping operation failed.
+         */
+        mac_stop_group(grp);
+    }
+
+    if (i == start + loopcount)
+        return (NULL);
+
+    ASSERT(grp != NULL);
+
+    DTRACE_PROBE2(rx__group__reserved,
+        char *, mip->mi_name, int, grp->mrg_index);
+    return (grp);
+}
+
+/*
+ * mac_release_rx_group()
+ *
+ * This is called when there are no clients left for the group.
+ * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
+ * and if it is a non-default group, the shares are removed and
+ * all rings are assigned back to the default group.
+ */
+void
+mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
+{
+    mac_impl_t *mip = mcip->mci_mip;
+    mac_ring_t *ring;
+
+    ASSERT(group != &mip->mi_rx_groups[0]);
+
+    /*
+     * This is the case where there are no clients left. Any
+     * SRS etc. on this group have also been quiesced.
+     */
+    for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
+        if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
+            ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
+            /*
+             * Remove the SRS associated with the HW ring.
+             * As a result, polling will be disabled.
+             */
+            ring->mr_srs = NULL;
+        }
+        ASSERT(ring->mr_state == MR_INUSE);
+        mac_stop_ring(ring);
+        ring->mr_state = MR_FREE;
+        ring->mr_flag = 0;
+    }
+
+    /* remove group from share */
+    if (mcip->mci_share != NULL) {
+        mip->mi_share_capab.ms_sremove(mcip->mci_share,
+            group->mrg_driver);
+    }
+
+    if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
+        mac_ring_t *ring;
+
+        /*
+         * Rings were dynamically allocated to the group.
+         * Move the rings back to the default group.
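+         * Each mac_group_mov_ring() call below unlinks the ring from
+         * mrg_rings, so the while loop terminates once every ring has
+         * been returned.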
+         */
+        while ((ring = group->mrg_rings) != NULL) {
+            (void) mac_group_mov_ring(mip,
+                &mip->mi_rx_groups[0], ring);
+        }
+    }
+    mac_stop_group(group);
+    /*
+     * Possible improvement: see if we can assign the group just released
+     * to another client of the mip.
+     */
+}
+
+/*
+ * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
+ * when a share was allocated to the client.
+ */
+mac_group_t *
+mac_reserve_tx_group(mac_impl_t *mip, mac_share_handle_t share)
+{
+    mac_group_t *grp;
+    int rv, i;
+
+    /*
+     * TX groups are currently allocated only to MAC clients
+     * which are associated with a share. Since we have a fixed
+     * number of shares and groups, and we already successfully
+     * allocated a share, find an available TX group.
+     */
+    ASSERT(share != NULL);
+    ASSERT(mip->mi_tx_group_free > 0);
+
+    for (i = 0; i < mip->mi_tx_group_count; i++) {
+        grp = &mip->mi_tx_groups[i];
+
+        if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
+            (grp->mrg_state == MAC_GROUP_STATE_UNINIT))
+            continue;
+
+        rv = mac_start_group(grp);
+        ASSERT(rv == 0);
+
+        grp->mrg_state = MAC_GROUP_STATE_RESERVED;
+        break;
+    }
+
+    ASSERT(grp != NULL);
+
+    /*
+     * Populate the group. Rings should be taken from the group
+     * of unassigned rings, which is past the array of TX
+     * groups advertised by the driver.
+     */
+    rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, NULL,
+        grp, share);
+    if (rv != 0) {
+        DTRACE_PROBE3(tx__group__reserve__alloc__rings,
+            char *, mip->mi_name, int, grp->mrg_index, int, rv);
+
+        mac_stop_group(grp);
+        grp->mrg_state = MAC_GROUP_STATE_UNINIT;
+
+        return (NULL);
+    }
+
+    mip->mi_tx_group_free--;
+
+    return (grp);
+}
+
+void
+mac_release_tx_group(mac_impl_t *mip, mac_group_t *grp)
+{
+    mac_client_impl_t *mcip = grp->mrg_tx_client;
+    mac_share_handle_t share = mcip->mci_share;
+    mac_ring_t *ring;
+
+    ASSERT(mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
+    ASSERT(share != NULL);
+    ASSERT(grp->mrg_state == MAC_GROUP_STATE_RESERVED);
+
+    mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
+    while ((ring = grp->mrg_rings) != NULL) {
+        /* move the ring back to the pool */
+        (void) mac_group_mov_ring(mip, mip->mi_tx_groups +
+            mip->mi_tx_group_count, ring);
+    }
+    mac_stop_group(grp);
+    mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
+    grp->mrg_tx_client = NULL;
+    mip->mi_tx_group_free++;
+}
+
+/*
+ * This is a 1-time control path activity initiated by the client (IP).
+ * The mac perimeter protects against other simultaneous control activities,
+ * for example an ioctl that attempts to change the degree of fanout and
+ * increase or decrease the number of softrings associated with this Tx SRS.
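+ *
+ * A minimal caller sketch (my_tx_blocked_cb and my_arg are hypothetical
+ * names; mac_client_tx_notify() itself is defined further below):
+ *
+ *     mac_tx_notify_handle_t h;
+ *     h = mac_client_tx_notify(mch, my_tx_blocked_cb, my_arg);
+ *     ...
+ *     (void) mac_client_tx_notify(mch, NULL, (void *)h);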
+ */
+static mac_tx_notify_cb_t *
+mac_client_tx_notify_add(mac_client_impl_t *mcip,
+    mac_tx_notify_t notify, void *arg)
+{
+    mac_cb_info_t *mcbi;
+    mac_tx_notify_cb_t *mtnfp;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+    mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
+    mtnfp->mtnf_fn = notify;
+    mtnfp->mtnf_arg = arg;
+    mtnfp->mtnf_link.mcb_objp = mtnfp;
+    mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
+    mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
+
+    mcbi = &mcip->mci_tx_notify_cb_info;
+    mutex_enter(mcbi->mcbi_lockp);
+    mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
+    mutex_exit(mcbi->mcbi_lockp);
+    return (mtnfp);
+}
+
+static void
+mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
+{
+    mac_cb_info_t *mcbi;
+    mac_cb_t **cblist;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+
+    if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
+        &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
+        cmn_err(CE_WARN,
+            "mac_client_tx_notify_remove: callback not "
+            "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
+        return;
+    }
+
+    mcbi = &mcip->mci_tx_notify_cb_info;
+    cblist = &mcip->mci_tx_notify_cb_list;
+    mutex_enter(mcbi->mcbi_lockp);
+    if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
+        kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
+    else
+        mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
+    mutex_exit(mcbi->mcbi_lockp);
+}
+
+/*
+ * mac_client_tx_notify():
+ * Called to add or remove a flow-control callback routine.
+ */
+mac_tx_notify_handle_t
+mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
+    void *ptr)
+{
+    mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+    mac_tx_notify_cb_t *mtnfp = NULL;
+
+    i_mac_perim_enter(mcip->mci_mip);
+
+    if (callb_func != NULL) {
+        /* Add a notify callback */
+        mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
+    } else {
+        mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
+    }
+    i_mac_perim_exit(mcip->mci_mip);
+
+    return ((mac_tx_notify_handle_t)mtnfp);
+}
diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c
new file mode 100644
index 0000000000..5fd2a6ef55
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_bcast.c
@@ -0,0 +1,668 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/list.h>
+#include <sys/kmem.h>
+#include <sys/stream.h>
+#include <sys/modctl.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/atomic.h>
+#include <sys/stat.h>
+#include <sys/modhash.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/sdt.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_flow_impl.h>
+
+/*
+ * Broadcast and multicast traffic must be distributed to the MAC clients
+ * that are defined on top of the same MAC. The set of
+ * destinations to which a multicast packet must be sent is a subset
+ * of all MAC clients defined on top of the MAC. A MAC client can be a
+ * member of more than one such subset.
+ *
+ * To accommodate these requirements, we introduce broadcast groups.
+ * A broadcast group is associated with a broadcast or multicast
+ * address. The members of a broadcast group consist of the MAC clients
+ * that should receive copies of packets sent to the address
+ * associated with the group, and are defined on top of the
+ * same MAC.
+ *
+ * The broadcast groups defined on top of a MAC are chained,
+ * hanging off the mac_impl_t. The broadcast group IDs are
+ * globally unique (tracked by mac_bcast_id).
+ */
+
+/*
+ * The same MAC client may be added for different <addr,vid> tuples;
+ * we maintain a ref count for the number of times it has been added,
+ * to account for deleting the MAC client from the group.
+ */
+typedef struct mac_bcast_grp_mcip_s {
+    mac_client_impl_t *mgb_client;
+    int mgb_client_ref;
+} mac_bcast_grp_mcip_t;
+
+typedef struct mac_bcast_grp_s {                /* Protected by */
+    struct mac_bcast_grp_s *mbg_next;           /* SL */
+    void *mbg_addr;                             /* SL */
+    uint16_t mbg_vid;                           /* SL */
+    mac_impl_t *mbg_mac_impl;                   /* WO */
+    mac_addrtype_t mbg_addrtype;                /* WO */
+    flow_entry_t *mbg_flow_ent;                 /* WO */
+    mac_bcast_grp_mcip_t *mbg_clients;          /* mi_rw_lock */
+    uint_t mbg_nclients;                        /* mi_rw_lock */
+    uint_t mbg_nclients_alloc;                  /* SL */
+    uint64_t mbg_clients_gen;                   /* mi_rw_lock */
+    uint32_t mbg_id;                            /* atomic */
+} mac_bcast_grp_t;
+
+static kmem_cache_t *mac_bcast_grp_cache;
+static uint32_t mac_bcast_id = 0;
+
+void
+mac_bcast_init(void)
+{
+    mac_bcast_grp_cache = kmem_cache_create("mac_bcast_grp_cache",
+        sizeof (mac_bcast_grp_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+mac_bcast_fini(void)
+{
+    kmem_cache_destroy(mac_bcast_grp_cache);
+}
+
+mac_impl_t *
+mac_bcast_grp_mip(void *grp)
+{
+    mac_bcast_grp_t *bcast_grp = grp;
+
+    return (bcast_grp->mbg_mac_impl);
+}
+
+/*
+ * Free the specified broadcast group. Invoked when the last reference
+ * to the group is released.
+ */
+void
+mac_bcast_grp_free(void *bcast_grp)
+{
+    mac_bcast_grp_t *grp = bcast_grp;
+    mac_impl_t *mip = grp->mbg_mac_impl;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
+        /*
+         * The address is a multicast address, have the
+         * underlying NIC leave the multicast group.
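+         * (mi_multicst(driver, B_FALSE, addr) below is the leave
+         * operation; the matching B_TRUE join is issued in
+         * mac_bcast_add().)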
+         */
+        (void) mip->mi_multicst(mip->mi_driver, B_FALSE, grp->mbg_addr);
+    }
+
+    ASSERT(grp->mbg_addr != NULL);
+    kmem_free(grp->mbg_addr, mip->mi_type->mt_addr_length);
+    kmem_free(grp->mbg_clients,
+        grp->mbg_nclients_alloc * sizeof (mac_bcast_grp_mcip_t));
+    mip->mi_bcast_ngrps--;
+    kmem_cache_free(mac_bcast_grp_cache, grp);
+}
+
+/*
+ * arg1: broadcast group
+ * arg2: sender MAC client if it is being sent by a MAC client,
+ * NULL if it was received from the wire.
+ */
+void
+mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback)
+{
+    mac_bcast_grp_t *grp = arg1;
+    mac_client_impl_t *src_mcip = arg2, *dst_mcip;
+    mac_impl_t *mip = grp->mbg_mac_impl;
+    uint64_t gen;
+    uint_t i;
+    mblk_t *mp_chain1;
+    flow_entry_t *flent;
+    int err;
+
+    rw_enter(&mip->mi_rw_lock, RW_READER);
+
+    /*
+     * Pass a copy of the mp chain to every MAC client except the sender
+     * MAC client, if the packet was not received from the underlying NIC.
+     *
+     * The broadcast group lock should not be held across calls to
+     * the flow's callback function, since the same group could
+     * potentially be accessed from the same context. When the lock
+     * is reacquired, changes to the broadcast group while the lock
+     * was released are caught using a generation counter incremented
+     * each time the list of MAC clients associated with the broadcast
+     * group is changed.
+     */
+    for (i = 0; i < grp->mbg_nclients_alloc; i++) {
+        dst_mcip = grp->mbg_clients[i].mgb_client;
+        if (dst_mcip == NULL)
+            continue;
+        flent = dst_mcip->mci_flent;
+        if (flent == NULL || dst_mcip == src_mcip) {
+            /*
+             * Don't send a copy of the packet back to
+             * its sender.
+             */
+            continue;
+        }
+
+        /*
+         * It is important to hold a reference on the
+         * flow_ent here.
+         */
+        if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL)
+            break;
+        /*
+         * Fix the checksum for packets originating
+         * from the local machine.
+         */
+        if ((src_mcip != NULL) &&
+            (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL)
+            break;
+
+        FLOW_TRY_REFHOLD(flent, err);
+        if (err != 0) {
+            freemsgchain(mp_chain1);
+            continue;
+        }
+
+        gen = grp->mbg_clients_gen;
+
+        rw_exit(&mip->mi_rw_lock);
+
+        DTRACE_PROBE4(mac__bcast__send__to, mac_client_impl_t *,
+            src_mcip, flow_fn_t, dst_mcip->mci_flent->fe_cb_fn,
+            void *, dst_mcip->mci_flent->fe_cb_arg1,
+            void *, dst_mcip->mci_flent->fe_cb_arg2);
+
+        (dst_mcip->mci_flent->fe_cb_fn)(dst_mcip->mci_flent->fe_cb_arg1,
+            dst_mcip->mci_flent->fe_cb_arg2, mp_chain1, is_loopback);
+        FLOW_REFRELE(flent);
+
+        rw_enter(&mip->mi_rw_lock, RW_READER);
+
+        /* update stats */
+        if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST)
+            dst_mcip->mci_stat_multircv++;
+        else
+            dst_mcip->mci_stat_brdcstrcv++;
+
+        if (grp->mbg_clients_gen != gen) {
+            /*
+             * The list of MAC clients associated with the group
+             * was changed while the lock was released.
+             * Give up on the current packet.
+             */
+            rw_exit(&mip->mi_rw_lock);
+            freemsgchain(mp_chain);
+            return;
+        }
+    }
+    rw_exit(&mip->mi_rw_lock);
+
+    if (src_mcip != NULL) {
+        /*
+         * The packet was sent from one of the MAC clients,
+         * so we need to send a copy of the packet to the
+         * underlying NIC so that it can be sent on the wire.
+         */
+        mblk_t *rest;
+
+        src_mcip->mci_stat_multixmt++;
+        src_mcip->mci_stat_brdcstxmt++;
+
+        rest = MAC_RING_TX_DEFAULT(mip, mp_chain);
+        if (rest != NULL)
+            freemsgchain(rest);
+    } else {
+        freemsgchain(mp_chain);
+    }
+}
+
+/*
+ * Add the specified MAC client to the group corresponding to the specified
+ * broadcast or multicast address.
+ * Return 0 on success, or an errno value on failure.
+ */
+int
+mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid,
+    mac_addrtype_t addrtype)
+{
+    mac_impl_t *mip = mcip->mci_mip;
+    mac_bcast_grp_t *grp = NULL, **last_grp;
+    size_t addr_len = mip->mi_type->mt_addr_length;
+    int rc = 0;
+    int i, index = -1;
+    mac_mcast_addrs_t *mci_maddr = NULL;
+    mac_mcast_addrs_t *mi_maddr = NULL;
+    mac_mcast_addrs_t **last_maddr;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    ASSERT(addrtype == MAC_ADDRTYPE_MULTICAST ||
+        addrtype == MAC_ADDRTYPE_BROADCAST);
+
+    /* The list is protected by the perimeter */
+    last_grp = &mip->mi_bcast_grp;
+    for (grp = *last_grp; grp != NULL;
+        last_grp = &grp->mbg_next, grp = grp->mbg_next) {
+        if (bcmp(grp->mbg_addr, addr, addr_len) == 0 &&
+            grp->mbg_vid == vid)
+            break;
+    }
+
+    if (grp == NULL) {
+        /*
+         * The group does not yet exist, create it.
+         */
+        flow_desc_t flow_desc;
+        char flow_name[MAXFLOWNAME];
+
+        grp = kmem_cache_alloc(mac_bcast_grp_cache, KM_SLEEP);
+        bzero(grp, sizeof (mac_bcast_grp_t));
+        grp->mbg_next = NULL;
+        grp->mbg_mac_impl = mip;
+
+        DTRACE_PROBE1(mac__bcast__add__new__group, mac_bcast_grp_t *,
+            grp);
+
+        grp->mbg_addr = kmem_zalloc(addr_len, KM_SLEEP);
+        bcopy(addr, grp->mbg_addr, addr_len);
+        grp->mbg_addrtype = addrtype;
+        grp->mbg_vid = vid;
+
+        /*
+         * Add a new flow to the underlying MAC.
+         */
+        bzero(&flow_desc, sizeof (flow_desc));
+        bcopy(addr, &flow_desc.fd_dst_mac, addr_len);
+        flow_desc.fd_mac_len = (uint32_t)addr_len;
+
+        flow_desc.fd_mask = FLOW_LINK_DST;
+        if (vid != 0) {
+            flow_desc.fd_vid = vid;
+            flow_desc.fd_mask |= FLOW_LINK_VID;
+        }
+
+        grp->mbg_id = atomic_add_32_nv(&mac_bcast_id, 1);
+        (void) sprintf(flow_name,
+            "mac/%s/mcast%d", mip->mi_name, grp->mbg_id);
+
+        rc = mac_flow_create(&flow_desc, NULL, flow_name,
+            grp, FLOW_MCAST, &grp->mbg_flow_ent);
+        if (rc != 0) {
+            kmem_free(grp->mbg_addr, addr_len);
+            kmem_cache_free(mac_bcast_grp_cache, grp);
+            return (rc);
+        }
+        grp->mbg_flow_ent->fe_mbg = grp;
+        mip->mi_bcast_ngrps++;
+
+        /*
+         * Initial creation reference on the flow. This is released
+         * in the corresponding delete action, mac_bcast_delete().
+         */
+        FLOW_REFHOLD(grp->mbg_flow_ent);
+
+        /*
+         * When a multicast or broadcast packet is received
+         * by the underlying NIC, mac_rx_classify() will invoke
+         * mac_bcast_send() with arg2=NULL, which will cause
+         * mac_bcast_send() to send a copy of the packet(s)
+         * to every MAC client opened on top of the underlying MAC.
+         *
+         * When the mac_bcast_send() function is invoked from
+         * the transmit path of a MAC client, it will specify the
+         * transmitting MAC client as the arg2 value, which will
+         * allow mac_bcast_send() to skip that MAC client and not
+         * send it a copy of the packet.
+         *
+         * We program the classifier to dispatch matching broadcast
+         * packets to mac_bcast_send().
+         */
+
+        grp->mbg_flow_ent->fe_cb_fn = mac_bcast_send;
+        grp->mbg_flow_ent->fe_cb_arg1 = grp;
+        grp->mbg_flow_ent->fe_cb_arg2 = NULL;
+
+        rc = mac_flow_add(mip->mi_flow_tab, grp->mbg_flow_ent);
+        if (rc != 0) {
+            FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
+            return (rc);
+        }
+
+        /*
+         * For multicast addresses, have the underlying MAC
+         * join the corresponding multicast group.
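+         * Note that the new group is linked onto mi_bcast_grp
+         * (*last_grp = grp) only after both the flow addition and
+         * the driver join succeed, so the error paths above and
+         * below leave the group list untouched.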
+         */
+        if (addrtype == MAC_ADDRTYPE_MULTICAST) {
+            rc = mip->mi_multicst(mip->mi_driver, B_TRUE, addr);
+            if (rc != 0) {
+                mac_flow_remove(mip->mi_flow_tab,
+                    grp->mbg_flow_ent, B_FALSE);
+                mac_flow_wait(grp->mbg_flow_ent,
+                    FLOW_DRIVER_UPCALL);
+                FLOW_FINAL_REFRELE(grp->mbg_flow_ent);
+                return (rc);
+            }
+        }
+
+        *last_grp = grp;
+    }
+
+    ASSERT(grp->mbg_addrtype == addrtype);
+
+    /*
+     * Add the MAC client to the list of MAC clients associated
+     * with the group.
+     */
+    rw_enter(&mip->mi_rw_lock, RW_WRITER);
+    if (addrtype == MAC_ADDRTYPE_MULTICAST) {
+        /*
+         * We maintain a separate list for each MAC client. Get
+         * the entry, or add it if it is not present.
+         */
+        last_maddr = &mcip->mci_mcast_addrs;
+        for (mci_maddr = *last_maddr; mci_maddr != NULL;
+            last_maddr = &mci_maddr->mma_next,
+            mci_maddr = mci_maddr->mma_next) {
+            if (bcmp(mci_maddr->mma_addr, addr, addr_len) == 0)
+                break;
+        }
+        if (mci_maddr == NULL) {
+            mci_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+                KM_SLEEP);
+            bcopy(addr, mci_maddr->mma_addr, addr_len);
+            *last_maddr = mci_maddr;
+        }
+        mci_maddr->mma_ref++;
+
+        /*
+         * In case of a driver (say aggr), we also need this
+         * information on a per-MAC-instance basis.
+         */
+        last_maddr = &mip->mi_mcast_addrs;
+        for (mi_maddr = *last_maddr; mi_maddr != NULL;
+            last_maddr = &mi_maddr->mma_next,
+            mi_maddr = mi_maddr->mma_next) {
+            if (bcmp(mi_maddr->mma_addr, addr, addr_len) == 0)
+                break;
+        }
+        if (mi_maddr == NULL) {
+            mi_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t),
+                KM_SLEEP);
+            bcopy(addr, mi_maddr->mma_addr, addr_len);
+            *last_maddr = mi_maddr;
+        }
+        mi_maddr->mma_ref++;
+    }
+    for (i = 0; i < grp->mbg_nclients_alloc; i++) {
+        /*
+         * The MAC client was already added, say when we have
+         * different unicast addresses with the same vid.
+         * Just increment the ref and we are done.
+         */
+        if (grp->mbg_clients[i].mgb_client == mcip) {
+            grp->mbg_clients[i].mgb_client_ref++;
+            goto add_done;
+        } else if (grp->mbg_clients[i].mgb_client == NULL &&
+            index == -1) {
+            index = i;
+        }
+    }
+    if (grp->mbg_nclients_alloc == grp->mbg_nclients) {
+        mac_bcast_grp_mcip_t *new_clients;
+        uint_t new_size = grp->mbg_nclients + 1;
+
+        new_clients = kmem_zalloc(new_size *
+            sizeof (mac_bcast_grp_mcip_t), KM_SLEEP);
+
+        if (grp->mbg_nclients > 0) {
+            ASSERT(grp->mbg_clients != NULL);
+            bcopy(grp->mbg_clients, new_clients, grp->mbg_nclients *
+                sizeof (mac_bcast_grp_mcip_t));
+            kmem_free(grp->mbg_clients, grp->mbg_nclients *
+                sizeof (mac_bcast_grp_mcip_t));
+        }
+
+        grp->mbg_clients = new_clients;
+        grp->mbg_nclients_alloc = new_size;
+        index = new_size - 1;
+    }
+
+    ASSERT(index != -1);
+    grp->mbg_clients[index].mgb_client = mcip;
+    grp->mbg_clients[index].mgb_client_ref = 1;
+    grp->mbg_nclients++;
+    /*
+     * Since we're adding to the list of MAC clients using that group,
+     * kick the generation count, which will allow mac_bcast_send()
+     * to detect that condition after re-acquiring the lock.
+     */
+    grp->mbg_clients_gen++;
+add_done:
+    rw_exit(&mip->mi_rw_lock);
+
+    return (0);
+}
+
+/*
+ * Remove the specified MAC client from the group corresponding to
+ * the specified broadcast or multicast address.
+ *
+ * Note: mac_bcast_delete() calls mac_flow_remove() and then
+ * mac_flow_wait(), which cv_waits for fe_refcnt to drop to 0. So this
+ * function should not be called from interrupt or STREAMS context.
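+ *
+ * Usage pairs with mac_bcast_add(); a minimal sketch under the mac
+ * perimeter (error handling elided, arguments as in the add call):
+ *
+ *     (void) mac_bcast_add(mcip, addr, vid, MAC_ADDRTYPE_MULTICAST);
+ *     ...
+ *     mac_bcast_delete(mcip, addr, vid);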
+ */
+void
+mac_bcast_delete(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid)
+{
+    mac_impl_t *mip = mcip->mci_mip;
+    mac_bcast_grp_t *grp = NULL, **prev;
+    size_t addr_len = mip->mi_type->mt_addr_length;
+    flow_entry_t *flent;
+    uint_t i;
+    mac_mcast_addrs_t *maddr = NULL;
+    mac_mcast_addrs_t **mprev;
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    /* find the broadcast group. The list is protected by the perimeter */
+    prev = &mip->mi_bcast_grp;
+    for (grp = mip->mi_bcast_grp; grp != NULL; prev = &grp->mbg_next,
+        grp = grp->mbg_next) {
+        if (bcmp(grp->mbg_addr, addr, addr_len) == 0 &&
+            grp->mbg_vid == vid)
+            break;
+    }
+    ASSERT(grp != NULL);
+
+    /*
+     * Remove the MAC client from the list of MAC clients associated
+     * with that broadcast group.
+     *
+     * We mark the mbg_clients[] location corresponding to the removed MAC
+     * client NULL and reuse that location when we add a new MAC client.
+     */
+
+    rw_enter(&mip->mi_rw_lock, RW_WRITER);
+
+    for (i = 0; i < grp->mbg_nclients_alloc; i++) {
+        if (grp->mbg_clients[i].mgb_client == mcip)
+            break;
+    }
+
+    ASSERT(i < grp->mbg_nclients_alloc);
+    /*
+     * If there are more references to this MAC client, then we let
+     * it remain till it goes to 0.
+     */
+    if (--grp->mbg_clients[i].mgb_client_ref > 0)
+        goto update_maddr;
+
+    grp->mbg_clients[i].mgb_client = NULL;
+    grp->mbg_clients[i].mgb_client_ref = 0;
+
+    /*
+     * Since we're removing from the list of MAC clients using that group,
+     * kick the generation count, which will allow mac_bcast_send()
+     * to detect that condition.
+     */
+    grp->mbg_clients_gen++;
+
+    if (--grp->mbg_nclients == 0) {
+        /*
+         * The last MAC client of the group was just removed.
+         * Unlink the current group from the list of groups
+         * defined on top of the underlying NIC. The group
+         * structure will stay around until the last reference
+         * is dropped.
+         */
+        *prev = grp->mbg_next;
+    }
+update_maddr:
+    if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) {
+        mprev = &mcip->mci_mcast_addrs;
+        for (maddr = mcip->mci_mcast_addrs; maddr != NULL;
+            mprev = &maddr->mma_next, maddr = maddr->mma_next) {
+            if (bcmp(grp->mbg_addr, maddr->mma_addr,
+                mip->mi_type->mt_addr_length) == 0)
+                break;
+        }
+        ASSERT(maddr != NULL);
+        if (--maddr->mma_ref == 0) {
+            *mprev = maddr->mma_next;
+            maddr->mma_next = NULL;
+            kmem_free(maddr, sizeof (mac_mcast_addrs_t));
+        }
+
+        mprev = &mip->mi_mcast_addrs;
+        for (maddr = mip->mi_mcast_addrs; maddr != NULL;
+            mprev = &maddr->mma_next, maddr = maddr->mma_next) {
+            if (bcmp(grp->mbg_addr, maddr->mma_addr,
+                mip->mi_type->mt_addr_length) == 0)
+                break;
+        }
+        ASSERT(maddr != NULL);
+        if (--maddr->mma_ref == 0) {
+            *mprev = maddr->mma_next;
+            maddr->mma_next = NULL;
+            kmem_free(maddr, sizeof (mac_mcast_addrs_t));
+        }
+    }
+    rw_exit(&mip->mi_rw_lock);
+
+    /*
+     * If the group itself is being removed, remove the
+     * corresponding flow from the underlying NIC.
+     */
+    flent = grp->mbg_flow_ent;
+    if (grp->mbg_nclients == 0) {
+        mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
+        mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+        FLOW_FINAL_REFRELE(flent);
+    }
+}
+
+/*
+ * This will be called by a driver, such as aggr, when a port is added or
+ * removed, to add/remove the port to/from all the multicast addresses for
+ * that aggr.
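+ *
+ * A hypothetical aggr-side call when a port joins (aggr_port_multicst_fn
+ * and port are illustrative names, not defined here):
+ *
+ *     mac_bcast_refresh(mip, aggr_port_multicst_fn, port, B_TRUE);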
+ */
+void
+mac_bcast_refresh(mac_impl_t *mip, mac_multicst_t refresh_fn, void *arg,
+    boolean_t add)
+{
+    mac_mcast_addrs_t *grp, *next;
+
+    ASSERT(refresh_fn != NULL);
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    /*
+     * Walk the multicast address list and call the refresh function for
+     * each address.
+     */
+
+    for (grp = mip->mi_mcast_addrs; grp != NULL; grp = next) {
+        /*
+         * Save the next pointer just in case the refresh
+         * function's action causes the group entry to be
+         * freed.
+         * We won't be adding to this list as part of the
+         * refresh.
+         */
+        next = grp->mma_next;
+        refresh_fn(arg, add, grp->mma_addr);
+    }
+}
+
+/*
+ * Walk the MAC client's multicast address list and add/remove the addr/vid
+ * ('arg' is 'flent') to all the addresses.
+ */
+void
+mac_client_bcast_refresh(mac_client_impl_t *mcip, mac_multicst_t refresh_fn,
+    void *arg, boolean_t add)
+{
+    mac_mcast_addrs_t *grp, *next;
+    mac_impl_t *mip = mcip->mci_mip;
+
+    ASSERT(refresh_fn != NULL);
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+    /*
+     * Walk the multicast address list and call the refresh function for
+     * each address.
+     * Broadcast addresses are not added or removed through the multicast
+     * entry points, so don't include them as part of the refresh.
+     */
+    for (grp = mcip->mci_mcast_addrs; grp != NULL; grp = next) {
+        /*
+         * Save the next pointer just in case the refresh
+         * function's action causes the group entry to be
+         * freed.
+         * We won't be adding to this list as part of the
+         * refresh.
+         */
+        next = grp->mma_next;
+        refresh_fn(arg, add, grp->mma_addr);
+    }
+}
diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c
new file mode 100644
index 0000000000..bd6b552e67
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_client.c
@@ -0,0 +1,3763 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * - General Introduction:
+ *
+ * This file contains the implementation of the MAC client kernel
+ * API and related code. The MAC client API allows a kernel module
+ * to gain access to a MAC instance (physical NIC, link aggregation, etc).
+ * It allows a MAC client to associate itself with a MAC address,
+ * VLANs, and callback functions for data traffic and for promiscuous mode.
+ * The MAC client API is also used to specify the properties associated
+ * with a MAC client, such as bandwidth limits, priority, CPUs, etc.
+ * These properties are further used to determine the hardware resources
+ * to allocate to the various MAC clients.
+ *
+ * - Primary MAC clients:
+ *
+ * The MAC client API refers to "primary MAC clients".
+ * A primary MAC client is a client which "owns" the primary MAC address of
+ * the underlying MAC instance. The primary MAC address is called out
+ * since it is associated with specific semantics: the primary MAC
+ * address is the MAC address which is assigned to the IP interface
+ * when it is plumbed, and the primary MAC address is assigned
+ * to VLAN data-links. The primary address of a MAC instance can
+ * also change dynamically from under the MAC client, for example
+ * as a result of a change of state of a link aggregation. In that
+ * case the MAC layer automatically updates all data-structures which
+ * refer to the current value of the primary MAC address. Typical
+ * primary MAC clients are dls, aggr, and xnb. A typical non-primary
+ * MAC client is the vnic driver.
+ *
+ * - Virtual Switching:
+ *
+ * The MAC layer implements a virtual switch between the MAC clients
+ * (primary and non-primary) defined on top of the same underlying
+ * NIC (physical, link aggregation, etc). The virtual switch is
+ * VLAN-aware, i.e. it allows multiple MAC clients to be members
+ * of one or more VLANs, and the virtual switch will distribute
+ * multicast tagged packets only to the members of the corresponding
+ * VLANs.
+ *
+ * - Upper vs Lower MAC:
+ *
+ * Creating a VNIC on top of a MAC instance effectively causes
+ * two MAC instances to be layered on top of each other, one for
+ * the VNIC(s), one for the underlying MAC instance (physical NIC,
+ * link aggregation, etc). In the code below we refer to the
+ * underlying NIC as the "lower MAC", and we refer to VNICs as
+ * the "upper MAC".
+ *
+ * - Pass-through for VNICs:
+ *
+ * When VNICs are created on top of an underlying MAC, this causes
+ * a layering of two MAC instances. Since the lower MAC already
+ * does the switching and demultiplexing to its MAC clients, the
+ * upper MAC would simply have to pass packets to the layer below
+ * or above it, which would introduce overhead. In order to avoid
+ * this overhead, the MAC layer implements a pass-through mechanism
+ * for VNICs. When a VNIC opens the lower MAC instance, it saves
+ * the MAC client handle it obtains from the MAC layer. When a MAC
+ * client opens a VNIC (upper MAC), the MAC layer detects that
+ * the MAC being opened is a VNIC, and gets the MAC client handle
+ * that the VNIC driver obtained from the lower MAC. This exchange
+ * is done through a private capability between the MAC layer
+ * and the VNIC driver. The upper MAC then returns that handle
+ * directly to its MAC client. Any operation done by the upper
+ * MAC client is now done on the lower MAC client handle, which
+ * allows the VNIC driver to be completely bypassed for the
+ * performance sensitive data-path.
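+ *
+ * The handle exchange itself is small; from mac_vnic_lower() below:
+ *
+ *     VERIFY(i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, &cap));
+ *     mcip = cap.mcv_mac_client_handle(cap.mcv_arg);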
+ * + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/id_space.h> +#include <sys/esunddi.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/dlpi.h> +#include <sys/modhash.h> +#include <sys/mac_impl.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_soft_ring.h> +#include <sys/dls.h> +#include <sys/dld.h> +#include <sys/modctl.h> +#include <sys/fs/dv_node.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/callb.h> +#include <sys/cpuvar.h> +#include <sys/atomic.h> +#include <sys/sdt.h> +#include <sys/mac_flow.h> +#include <sys/ddi_intr_impl.h> +#include <sys/disp.h> +#include <sys/sdt.h> +#include <sys/vnic.h> +#include <sys/vnic_impl.h> +#include <sys/vlan.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <sys/exacct.h> +#include <sys/exacct_impl.h> +#include <inet/nd.h> +#include <sys/ethernet.h> + +kmem_cache_t *mac_client_impl_cache; +kmem_cache_t *mac_promisc_impl_cache; + +static boolean_t mac_client_single_rcvr(mac_client_impl_t *); +static flow_entry_t *mac_client_swap_mciflent(mac_client_impl_t *); +static flow_entry_t *mac_client_get_flow(mac_client_impl_t *, + mac_unicast_impl_t *); +static void mac_client_remove_flow_from_list(mac_client_impl_t *, + flow_entry_t *); +static void mac_client_add_to_flow_list(mac_client_impl_t *, flow_entry_t *); +static void mac_rename_flow_names(mac_client_impl_t *, const char *); +static void mac_virtual_link_update(mac_impl_t *); + +/* ARGSUSED */ +static int +i_mac_client_impl_ctor(void *buf, void *arg, int kmflag) +{ + int i; + mac_client_impl_t *mcip = buf; + + bzero(buf, MAC_CLIENT_IMPL_SIZE); + mutex_init(&mcip->mci_tx_cb_lock, NULL, MUTEX_DRIVER, NULL); + mcip->mci_tx_notify_cb_info.mcbi_lockp = &mcip->mci_tx_cb_lock; + + ASSERT(mac_tx_percpu_cnt >= 0); + for (i = 0; i <= mac_tx_percpu_cnt; i++) { + mutex_init(&mcip->mci_tx_pcpu[i].pcpu_tx_lock, NULL, + MUTEX_DRIVER, NULL); + } + cv_init(&mcip->mci_tx_cv, NULL, CV_DRIVER, NULL); + + return (0); +} + +/* ARGSUSED */ +static void +i_mac_client_impl_dtor(void *buf, void *arg) +{ + int i; + mac_client_impl_t *mcip = buf; + + ASSERT(mcip->mci_promisc_list == NULL); + ASSERT(mcip->mci_unicast_list == NULL); + ASSERT(mcip->mci_state_flags == 0); + ASSERT(mcip->mci_tx_flag == 0); + + mutex_destroy(&mcip->mci_tx_cb_lock); + + ASSERT(mac_tx_percpu_cnt >= 0); + for (i = 0; i <= mac_tx_percpu_cnt; i++) { + ASSERT(mcip->mci_tx_pcpu[i].pcpu_tx_refcnt == 0); + mutex_destroy(&mcip->mci_tx_pcpu[i].pcpu_tx_lock); + } + cv_destroy(&mcip->mci_tx_cv); +} + +/* ARGSUSED */ +static int +i_mac_promisc_impl_ctor(void *buf, void *arg, int kmflag) +{ + mac_promisc_impl_t *mpip = buf; + + bzero(buf, sizeof (mac_promisc_impl_t)); + mpip->mpi_mci_link.mcb_objp = buf; + mpip->mpi_mci_link.mcb_objsize = sizeof (mac_promisc_impl_t); + mpip->mpi_mi_link.mcb_objp = buf; + mpip->mpi_mi_link.mcb_objsize = sizeof (mac_promisc_impl_t); + return (0); +} + +/* ARGSUSED */ +static void +i_mac_promisc_impl_dtor(void *buf, void *arg) +{ + mac_promisc_impl_t *mpip = buf; + + ASSERT(mpip->mpi_mci_link.mcb_objp != NULL); + ASSERT(mpip->mpi_mci_link.mcb_objsize == sizeof (mac_promisc_impl_t)); + ASSERT(mpip->mpi_mi_link.mcb_objp == mpip->mpi_mci_link.mcb_objp); + ASSERT(mpip->mpi_mi_link.mcb_objsize == sizeof (mac_promisc_impl_t)); + + mpip->mpi_mci_link.mcb_objp = NULL; + mpip->mpi_mci_link.mcb_objsize = 0; + mpip->mpi_mi_link.mcb_objp = NULL; + mpip->mpi_mi_link.mcb_objsize = 0; + + 
+    ASSERT(mpip->mpi_mci_link.mcb_flags == 0);
+    mpip->mpi_mci_link.mcb_objsize = 0;
+}
+
+void
+mac_client_init(void)
+{
+    ASSERT(mac_tx_percpu_cnt >= 0);
+
+    mac_client_impl_cache = kmem_cache_create("mac_client_impl_cache",
+        MAC_CLIENT_IMPL_SIZE, 0, i_mac_client_impl_ctor,
+        i_mac_client_impl_dtor, NULL, NULL, NULL, 0);
+    ASSERT(mac_client_impl_cache != NULL);
+
+    mac_promisc_impl_cache = kmem_cache_create("mac_promisc_impl_cache",
+        sizeof (mac_promisc_impl_t), 0, i_mac_promisc_impl_ctor,
+        i_mac_promisc_impl_dtor, NULL, NULL, NULL, 0);
+    ASSERT(mac_promisc_impl_cache != NULL);
+}
+
+void
+mac_client_fini(void)
+{
+    kmem_cache_destroy(mac_client_impl_cache);
+    kmem_cache_destroy(mac_promisc_impl_cache);
+}
+
+/*
+ * Return the lower MAC client handle from the VNIC driver for the
+ * specified VNIC MAC instance.
+ */
+mac_client_impl_t *
+mac_vnic_lower(mac_impl_t *mip)
+{
+    mac_capab_vnic_t cap;
+    mac_client_impl_t *mcip;
+
+    VERIFY(i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, &cap));
+    mcip = cap.mcv_mac_client_handle(cap.mcv_arg);
+
+    return (mcip);
+}
+
+/*
+ * Return the MAC client handle of the primary MAC client for the
+ * specified MAC instance, or NULL otherwise.
+ */
+mac_client_impl_t *
+mac_primary_client_handle(mac_impl_t *mip)
+{
+    mac_client_impl_t *mcip;
+
+    if (mip->mi_state_flags & MIS_IS_VNIC)
+        return (mac_vnic_lower(mip));
+
+    ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+    for (mcip = mip->mi_clients_list; mcip != NULL;
+        mcip = mcip->mci_client_next) {
+        if (MCIP_DATAPATH_SETUP(mcip) && mac_is_primary_client(mcip))
+            return (mcip);
+    }
+    return (NULL);
+}
+
+/*
+ * Open a MAC specified by its MAC name.
+ */
+int
+mac_open(const char *macname, mac_handle_t *mhp)
+{
+    mac_impl_t *mip;
+    int err;
+
+    /*
+     * Look up its entry in the global hash table.
+     */
+    if ((err = mac_hold(macname, &mip)) != 0)
+        return (err);
+
+    /*
+     * Hold the dip associated with the MAC to prevent it from being
+     * detached. For a softmac, its underlying dip is held by the
+     * mi_open() callback.
+     *
+     * This is done to be more tolerant of some defective drivers,
+     * which incorrectly handle mac_unregister() failure in their
+     * xxx_detach() routine. For example, some drivers ignore the
+     * failure of mac_unregister() and free resources that
+     * are needed for data transmission.
+     */
+    e_ddi_hold_devi(mip->mi_dip);
+
+    if (!(mip->mi_callbacks->mc_callbacks & MC_OPEN)) {
+        *mhp = (mac_handle_t)mip;
+        return (0);
+    }
+
+    /*
+     * The mac perimeter is used in both mac_open and mac_close by the
+     * framework to single thread the MC_OPEN/MC_CLOSE of drivers.
+     */
+    i_mac_perim_enter(mip);
+    mip->mi_oref++;
+    if (mip->mi_oref != 1 || ((err = mip->mi_open(mip->mi_driver)) == 0)) {
+        *mhp = (mac_handle_t)mip;
+        i_mac_perim_exit(mip);
+        return (0);
+    }
+    mip->mi_oref--;
+    ddi_release_devi(mip->mi_dip);
+    mac_rele(mip);
+    i_mac_perim_exit(mip);
+    return (err);
+}
+
+/*
+ * Open a MAC specified by its linkid.
+ */
+int
+mac_open_by_linkid(datalink_id_t linkid, mac_handle_t *mhp)
+{
+    dls_dl_handle_t dlh;
+    int err;
+
+    if ((err = dls_devnet_hold_tmp(linkid, &dlh)) != 0)
+        return (err);
+
+    dls_devnet_prop_task_wait(dlh);
+
+    err = mac_open(dls_devnet_mac(dlh), mhp);
+
+    dls_devnet_rele_tmp(dlh);
+    return (err);
+}
+
+/*
+ * Open a MAC specified by its link name.
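+ * The resolution chain, per this function and mac_open_by_linkid() above:
+ * link name -> dls_mgmt_get_linkid() -> linkid -> dls_devnet_hold_tmp()
+ * -> mac_open(dls_devnet_mac(dlh)).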
+ */
+int
+mac_open_by_linkname(const char *link, mac_handle_t *mhp)
+{
+    datalink_id_t linkid;
+    int err;
+
+    if ((err = dls_mgmt_get_linkid(link, &linkid)) != 0)
+        return (err);
+    return (mac_open_by_linkid(linkid, mhp));
+}
+
+/*
+ * Close the specified MAC.
+ */
+void
+mac_close(mac_handle_t mh)
+{
+    mac_impl_t *mip = (mac_impl_t *)mh;
+
+    i_mac_perim_enter(mip);
+    /*
+     * The mac perimeter is used in both mac_open and mac_close by the
+     * framework to single thread the MC_OPEN/MC_CLOSE of drivers.
+     */
+    if (mip->mi_callbacks->mc_callbacks & MC_OPEN) {
+        ASSERT(mip->mi_oref != 0);
+        if (--mip->mi_oref == 0) {
+            if ((mip->mi_callbacks->mc_callbacks & MC_CLOSE))
+                mip->mi_close(mip->mi_driver);
+        }
+    }
+    i_mac_perim_exit(mip);
+    ddi_release_devi(mip->mi_dip);
+    mac_rele(mip);
+}
+
+/*
+ * Misc utility functions to retrieve various information about a MAC
+ * instance or a MAC client.
+ */
+
+const mac_info_t *
+mac_info(mac_handle_t mh)
+{
+    return (&((mac_impl_t *)mh)->mi_info);
+}
+
+dev_info_t *
+mac_devinfo_get(mac_handle_t mh)
+{
+    return (((mac_impl_t *)mh)->mi_dip);
+}
+
+const char *
+mac_name(mac_handle_t mh)
+{
+    return (((mac_impl_t *)mh)->mi_name);
+}
+
+char *
+mac_client_name(mac_client_handle_t mch)
+{
+    return (((mac_client_impl_t *)mch)->mci_name);
+}
+
+minor_t
+mac_minor(mac_handle_t mh)
+{
+    return (((mac_impl_t *)mh)->mi_minor);
+}
+
+/*
+ * Return the VID associated with a MAC client. This function should
+ * be called for clients which are associated with only one VID.
+ */
+uint16_t
+mac_client_vid(mac_client_handle_t mch)
+{
+    uint16_t vid = VLAN_ID_NONE;
+    mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+    flow_desc_t flow_desc;
+
+    if (mcip->mci_nflents == 0)
+        return (vid);
+
+    ASSERT(MCIP_DATAPATH_SETUP(mcip) && mac_client_single_rcvr(mcip));
+
+    mac_flow_get_desc(mcip->mci_flent, &flow_desc);
+    if ((flow_desc.fd_mask & FLOW_LINK_VID) != 0)
+        vid = flow_desc.fd_vid;
+
+    return (vid);
+}
+
+/*
+ * Return the link speed associated with the specified MAC client.
+ *
+ * The link speed of a MAC client is equal to the smallest value of
+ * 1) the current link speed of the underlying NIC, or
+ * 2) the bandwidth limit set for the MAC client.
+ *
+ * Note that the bandwidth limit can be higher than the speed
+ * of the underlying NIC. This is allowed to avoid spurious
+ * administrative action failures or artificially lowering the
+ * bandwidth limit of a link that may have temporarily lowered
+ * its link speed due to a hardware problem or administrator action.
+ */
+static uint64_t
+mac_client_ifspeed(mac_client_impl_t *mcip)
+{
+    mac_impl_t *mip = mcip->mci_mip;
+    uint64_t nic_speed;
+
+    nic_speed = mac_stat_get((mac_handle_t)mip, MAC_STAT_IFSPEED);
+
+    if (nic_speed == 0) {
+        return (0);
+    } else {
+        uint64_t policy_limit = (uint64_t)-1;
+
+        if (MCIP_RESOURCE_PROPS_MASK(mcip) & MRP_MAXBW)
+            policy_limit = MCIP_RESOURCE_PROPS_MAXBW(mcip);
+
+        return (MIN(policy_limit, nic_speed));
+    }
+}
+
+/*
+ * Return the link state of the specified client. If there is more
+ * than one client of the underlying mac_impl_t, the link state
+ * will always be UP regardless of the link state of the underlying
+ * mac_impl_t. This is needed to allow the MAC clients to continue
+ * to communicate with each other even when the physical link of
+ * their mac_impl_t is down.
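+ * (Untagged clients would also compare equal under this rule if their
+ * mui_vid is VLAN_ID_NONE.)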
+ */ +static uint64_t +mac_client_link_state(mac_client_impl_t *mcip) +{ + mac_impl_t *mip = mcip->mci_mip; + uint16_t vid; + mac_client_impl_t *mci_list; + mac_unicast_impl_t *mui_list, *oth_mui_list; + + /* + * Returns LINK_STATE_UP if there are other MAC clients defined on + * mac_impl_t which share the same VLAN ID as that of mcip. Note that + * if 'mcip' has more than one VID then we match ANY one of its + * VIDs with the other MAC clients' VIDs and return LINK_STATE_UP. + */ + rw_enter(&mcip->mci_rw_lock, RW_READER); + for (mui_list = mcip->mci_unicast_list; mui_list != NULL; + mui_list = mui_list->mui_next) { + vid = mui_list->mui_vid; + for (mci_list = mip->mi_clients_list; mci_list != NULL; + mci_list = mci_list->mci_client_next) { + if (mci_list == mcip) + continue; + for (oth_mui_list = mci_list->mci_unicast_list; + oth_mui_list != NULL; oth_mui_list = oth_mui_list-> + mui_next) { + if (vid == oth_mui_list->mui_vid) { + rw_exit(&mcip->mci_rw_lock); + return (LINK_STATE_UP); + } + } + } + } + rw_exit(&mcip->mci_rw_lock); + + return (mac_stat_get((mac_handle_t)mip, MAC_STAT_LINK_STATE)); +} + +/* + * Return the statistics of a MAC client. These statistics are different + * from the statistics of the underlying MAC which are returned by + * mac_stat_get(). + */ +uint64_t +mac_client_stat_get(mac_client_handle_t mch, uint_t stat) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + uint64_t val; + + switch (stat) { + case MAC_STAT_LINK_STATE: + val = mac_client_link_state(mcip); + break; + case MAC_STAT_LINK_UP: + val = (mac_client_link_state(mcip) == LINK_STATE_UP); + break; + case MAC_STAT_PROMISC: + val = mac_stat_get((mac_handle_t)mip, MAC_STAT_PROMISC); + break; + case MAC_STAT_IFSPEED: + val = mac_client_ifspeed(mcip); + break; + case MAC_STAT_MULTIRCV: + val = mcip->mci_stat_multircv; + break; + case MAC_STAT_BRDCSTRCV: + val = mcip->mci_stat_brdcstrcv; + break; + case MAC_STAT_MULTIXMT: + val = mcip->mci_stat_multixmt; + break; + case MAC_STAT_BRDCSTXMT: + val = mcip->mci_stat_brdcstxmt; + break; + case MAC_STAT_OBYTES: + val = mcip->mci_stat_obytes; + break; + case MAC_STAT_OPACKETS: + val = mcip->mci_stat_opackets; + break; + case MAC_STAT_OERRORS: + val = mcip->mci_stat_oerrors; + break; + case MAC_STAT_IPACKETS: + val = mcip->mci_stat_ipackets; + break; + case MAC_STAT_RBYTES: + val = mcip->mci_stat_ibytes; + break; + case MAC_STAT_IERRORS: + val = mcip->mci_stat_ierrors; + break; + default: + val = mac_stat_default(mip, stat); + break; + } + + return (val); +} + +/* + * Return the statistics of the specified MAC instance. + */ +uint64_t +mac_stat_get(mac_handle_t mh, uint_t stat) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + uint64_t val; + int ret; + + /* + * The range of stat determines where it is maintained. Stat + * values from 0 up to (but not including) MAC_STAT_MIN are + * maintained by the mac module itself. Everything else is + * maintained by the driver. + * + * If the mac_impl_t being queried corresponds to a VNIC, + * the stats need to be queried from the lower MAC client + * corresponding to the VNIC. (The mac_link_update() + * invoked by the driver to the lower MAC causes the *lower + * MAC* to update its mi_linkstate, and send a notification + * to its MAC clients. Due to the VNIC passthrough, + * these notifications are sent to the upper MAC clients + * of the VNIC directly, and the upper mac_impl_t of the VNIC + * does not have a valid mi_linkstate.)
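 *
 * For illustration (editor's sketch): both kinds of statistic are
 * fetched through the same call, e.g.
 *
 *	link = mac_stat_get(mh, MAC_STAT_LINK_STATE);	mac module
 *	speed = mac_stat_get(mh, MAC_STAT_IFSPEED);	driver, mi_getstat()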
+ */ + if (stat < MAC_STAT_MIN && !(mip->mi_state_flags & MIS_IS_VNIC)) { + /* these stats are maintained by the mac module itself */ + switch (stat) { + case MAC_STAT_LINK_STATE: + return (mip->mi_linkstate); + case MAC_STAT_LINK_UP: + return (mip->mi_linkstate == LINK_STATE_UP); + case MAC_STAT_PROMISC: + return (mip->mi_devpromisc != 0); + default: + ASSERT(B_FALSE); + } + } + + /* + * Call the driver to get the given statistic. + */ + ret = mip->mi_getstat(mip->mi_driver, stat, &val); + if (ret != 0) { + /* + * The driver doesn't support this statistic. Get the + * statistic's default value. + */ + val = mac_stat_default(mip, stat); + } + return (val); +} + +/* + * Utility function which returns the VID associated with a flow entry. + */ +uint16_t +i_mac_flow_vid(flow_entry_t *flent) +{ + flow_desc_t flow_desc; + + mac_flow_get_desc(flent, &flow_desc); + + if ((flow_desc.fd_mask & FLOW_LINK_VID) != 0) + return (flow_desc.fd_vid); + return (VLAN_ID_NONE); +} + +/* + * Verify the validity of the specified unicast MAC address. Returns B_TRUE + * if the address is valid, B_FALSE otherwise (multicast address, or incorrect + * length). + */ +boolean_t +mac_unicst_verify(mac_handle_t mh, const uint8_t *addr, uint_t len) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + /* + * Verify the address. No lock is needed since mi_type and plugin + * details don't change after mac_register(). + */ + if ((len != mip->mi_type->mt_addr_length) || + (mip->mi_type->mt_ops.mtops_unicst_verify(addr, + mip->mi_pdata)) != 0) { + return (B_FALSE); + } else { + return (B_TRUE); + } +} + +void +mac_sdu_get(mac_handle_t mh, uint_t *min_sdu, uint_t *max_sdu) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (min_sdu != NULL) + *min_sdu = mip->mi_sdu_min; + if (max_sdu != NULL) + *max_sdu = mip->mi_sdu_max; +} + +/* + * Update the MAC unicast address of the specified client's flows. Currently + * only one unicast MAC address is allowed per client. + */ +static void +mac_unicast_update_client_flow(mac_client_impl_t *mcip) +{ + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent = mcip->mci_flent; + mac_address_t *map = mcip->mci_unicast; + flow_desc_t flow_desc; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + ASSERT(flent != NULL); + + mac_flow_get_desc(flent, &flow_desc); + ASSERT(flow_desc.fd_mask & FLOW_LINK_DST); + + bcopy(map->ma_addr, flow_desc.fd_dst_mac, map->ma_len); + mac_flow_set_desc(flent, &flow_desc); + + /* + * A MAC client could have one MAC address but multiple + * VLANs. In that case update the flow entries corresponding + * to all VLANs of the MAC client. + */ + for (flent = mcip->mci_flent_list; flent != NULL; + flent = flent->fe_client_next) { + mac_flow_get_desc(flent, &flow_desc); + if (!(flent->fe_type & FLOW_PRIMARY_MAC || + flent->fe_type & FLOW_VNIC_MAC)) + continue; + + bcopy(map->ma_addr, flow_desc.fd_dst_mac, map->ma_len); + mac_flow_set_desc(flent, &flow_desc); + } +} + +/* + * Update all clients that share the same unicast address. + */ +void +mac_unicast_update_clients(mac_impl_t *mip, mac_address_t *map) +{ + mac_client_impl_t *mcip; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + /* + * Find all clients that share the same unicast MAC address and update + * them appropriately. + */ + for (mcip = mip->mi_clients_list; mcip != NULL; + mcip = mcip->mci_client_next) { + /* + * Ignore clients that don't share this MAC address. + */ + if (map != mcip->mci_unicast) + continue; + + /* + * Update those clients with the same old unicast MAC address.
+ */ + mac_unicast_update_client_flow(mcip); + } +} + +/* + * Update the unicast MAC address of the specified VNIC MAC client. + * + * Check whether the operation is valid. Any of the following cases should fail: + * + * 1. It's a VLAN type of VNIC. + * 2. The new value is the current "primary" MAC address. + * 3. The current MAC address is shared with other clients. + * 4. The new MAC address has been used. This case will become valid when + * client migration is fully supported. + */ +int +mac_vnic_unicast_set(mac_client_handle_t mch, const uint8_t *addr) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + mac_address_t *map = mcip->mci_unicast; + int err; + + ASSERT(!(mip->mi_state_flags & MIS_IS_VNIC)); + ASSERT(mcip->mci_state_flags & MCIS_IS_VNIC); + ASSERT(mcip->mci_flags != MAC_CLIENT_FLAGS_PRIMARY); + + i_mac_perim_enter(mip); + + /* + * If this is a VLAN type of VNIC, it's using the "primary" MAC address + * of the underlying interface. Must fail here. Refer to case 1 above. + */ + if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0) { + i_mac_perim_exit(mip); + return (ENOTSUP); + } + + /* + * If the new address is the "primary" one, must fail. Refer to + * case 2 above. + */ + if (bcmp(addr, mip->mi_addr, map->ma_len) == 0) { + i_mac_perim_exit(mip); + return (EACCES); + } + + /* + * If the address is shared by multiple clients, must fail. Refer + * to case 3 above. + */ + if (mac_check_macaddr_shared(map)) { + i_mac_perim_exit(mip); + return (EBUSY); + } + + /* + * If the new address has been used, must fail for now. Refer to + * case 4 above. + */ + if (mac_find_macaddr(mip, (uint8_t *)addr) != NULL) { + i_mac_perim_exit(mip); + return (ENOTSUP); + } + + /* + * Update the MAC address. + */ + err = mac_update_macaddr(map, (uint8_t *)addr); + + if (err != 0) { + i_mac_perim_exit(mip); + return (err); + } + + /* + * Update all flows of this MAC client. + */ + mac_unicast_update_client_flow(mcip); + + i_mac_perim_exit(mip); + return (0); +} + +/* + * Program the new primary unicast address of the specified MAC. + * + * Function mac_update_macaddr() takes care of different types of underlying + * MAC. If the underlying MAC is a VNIC, the VNIC driver must have registered + * the mi_unicst() entry point, which indirectly calls mac_vnic_unicast_set(), + * which will take care of updating the MAC address of the corresponding + * MAC client. + * + * This is the only interface that allows the client to update the "primary" + * MAC address of the underlying MAC. The new value must not have been + * used by other clients. + */ +int +mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_address_t *map; + int err; + + /* verify the address validity */ + if (!mac_unicst_verify(mh, addr, mip->mi_type->mt_addr_length)) + return (EINVAL); + + i_mac_perim_enter(mip); + + /* + * If the new value is the same as the current primary address value, + * there's nothing to do. + */ + if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) { + i_mac_perim_exit(mip); + return (0); + } + + if (mac_find_macaddr(mip, (uint8_t *)addr) != 0) { + i_mac_perim_exit(mip); + return (EBUSY); + } + + map = mac_find_macaddr(mip, mip->mi_addr); + ASSERT(map != NULL); + + /* + * Update the MAC address.
+ */ + if (mip->mi_state_flags & MIS_IS_AGGR) { + mac_capab_aggr_t aggr_cap; + + /* + * If the mac is an aggregation, other than the unicast + * addresses programming, aggr must be informed about this + * primary unicst address change to change its mac address + * policy to be user-specified. + */ + ASSERT(map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED); + VERIFY(i_mac_capab_get(mh, MAC_CAPAB_AGGR, &aggr_cap)); + err = aggr_cap.mca_unicst(mip->mi_driver, addr); + if (err == 0) + bcopy(addr, map->ma_addr, map->ma_len); + } else { + err = mac_update_macaddr(map, (uint8_t *)addr); + } + + if (err != 0) { + i_mac_perim_exit(mip); + return (err); + } + + mac_unicast_update_clients(mip, map); + + /* + * Save the new primary MAC address in mac_impl_t. + */ + bcopy(addr, mip->mi_addr, mip->mi_type->mt_addr_length); + + i_mac_perim_exit(mip); + + if (err == 0) + i_mac_notify(mip, MAC_NOTE_UNICST); + + return (err); +} + +/* + * Return the current primary MAC address of the specified MAC. + */ +void +mac_unicast_primary_get(mac_handle_t mh, uint8_t *addr) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + rw_enter(&mip->mi_rw_lock, RW_READER); + bcopy(mip->mi_addr, addr, mip->mi_type->mt_addr_length); + rw_exit(&mip->mi_rw_lock); +} + +/* + * Return information about the use of the primary MAC address of the + * specified MAC instance: + * + * - if client_name is non-NULL, it must point to a string of at + * least MAXNAMELEN bytes, and will be set to the name of the MAC + * client which uses the primary MAC address. + * + * - if in_use is non-NULL, used to return whether the primary MAC + * address is currently in use. + */ +void +mac_unicast_primary_info(mac_handle_t mh, char *client_name, boolean_t *in_use) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *cur_client; + + if (in_use != NULL) + *in_use = B_FALSE; + if (client_name != NULL) + bzero(client_name, MAXNAMELEN); + + /* + * The mi_rw_lock is used to protect threads that don't hold the + * mac perimeter to get a consistent view of the mi_clients_list. + * Threads that modify the list must hold both the mac perimeter and + * mi_rw_lock(RW_WRITER) + */ + rw_enter(&mip->mi_rw_lock, RW_READER); + for (cur_client = mip->mi_clients_list; cur_client != NULL; + cur_client = cur_client->mci_client_next) { + if (mac_is_primary_client(cur_client) || + (mip->mi_state_flags & MIS_IS_VNIC)) { + rw_exit(&mip->mi_rw_lock); + if (in_use != NULL) + *in_use = B_TRUE; + if (client_name != NULL) { + bcopy(cur_client->mci_name, client_name, + MAXNAMELEN); + } + return; + } + } + rw_exit(&mip->mi_rw_lock); +} + +/* + * Add the specified MAC client to the list of clients which opened + * the specified MAC. + */ +static void +mac_client_add(mac_client_impl_t *mcip) +{ + mac_impl_t *mip = mcip->mci_mip; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + /* add VNIC to the front of the list */ + rw_enter(&mip->mi_rw_lock, RW_WRITER); + mcip->mci_client_next = mip->mi_clients_list; + mip->mi_clients_list = mcip; + mip->mi_nclients++; + rw_exit(&mip->mi_rw_lock); +} + +/* + * Remove the specified MAC client from the list of clients which opened + * the specified MAC. 
+ */ +static void +mac_client_remove(mac_client_impl_t *mcip) +{ + mac_impl_t *mip = mcip->mci_mip; + mac_client_impl_t **prev, *cclient; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + rw_enter(&mip->mi_rw_lock, RW_WRITER); + prev = &mip->mi_clients_list; + cclient = *prev; + while (cclient != NULL && cclient != mcip) { + prev = &cclient->mci_client_next; + cclient = *prev; + } + ASSERT(cclient != NULL); + *prev = cclient->mci_client_next; + mip->mi_nclients--; + rw_exit(&mip->mi_rw_lock); +} + +static mac_unicast_impl_t * +mac_client_find_vid(mac_client_impl_t *mcip, uint16_t vid) +{ + mac_unicast_impl_t *muip = mcip->mci_unicast_list; + + while ((muip != NULL) && (muip->mui_vid != vid)) + muip = muip->mui_next; + + return (muip); +} + +/* + * Return whether the specified (MAC address, VID) tuple is already used by + * one of the MAC clients associated with the specified MAC. + */ +static boolean_t +mac_addr_in_use(mac_impl_t *mip, uint8_t *mac_addr, uint16_t vid) +{ + mac_client_impl_t *client; + mac_address_t *map; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + for (client = mip->mi_clients_list; client != NULL; + client = client->mci_client_next) { + + /* + * Ignore clients that don't have unicast address. + */ + if (client->mci_unicast_list == NULL) + continue; + + map = client->mci_unicast; + + if ((bcmp(mac_addr, map->ma_addr, map->ma_len) == 0) && + (mac_client_find_vid(client, vid) != NULL)) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + +/* + * Generate a random MAC address. The MAC address prefix is + * stored in the array pointed to by mac_addr, and its length, in bytes, + * is specified by prefix_len. The least significant bits + * after prefix_len bytes are generated, and stored after the prefix + * in the mac_addr array. + */ +int +mac_addr_random(mac_client_handle_t mch, uint_t prefix_len, + uint8_t *mac_addr, mac_diag_t *diag) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + size_t addr_len = mip->mi_type->mt_addr_length; + + if (prefix_len >= addr_len) { + *diag = MAC_DIAG_MACPREFIXLEN_INVALID; + return (EINVAL); + } + + /* check the prefix value */ + if (prefix_len > 0) { + bzero(mac_addr + prefix_len, addr_len - prefix_len); + if (!mac_unicst_verify((mac_handle_t)mip, mac_addr, + addr_len)) { + *diag = MAC_DIAG_MACPREFIX_INVALID; + return (EINVAL); + } + } + + /* generate the MAC address */ + if (prefix_len < addr_len) { + (void) random_get_pseudo_bytes(mac_addr + + prefix_len, addr_len - prefix_len); + } + + *diag = 0; + return (0); +} + +/* + * Set the priority range for this MAC client. This will be used to + * determine the absolute priority for the threads created for this + * MAC client using the specified "low", "medium" and "high" level. + * This will also be used for any subflows on this MAC client. + */ +#define MAC_CLIENT_SET_PRIORITY_RANGE(mcip, pri) { \ + (mcip)->mci_min_pri = FLOW_MIN_PRIORITY(MINCLSYSPRI, \ + MAXCLSYSPRI, (pri)); \ + (mcip)->mci_max_pri = FLOW_MAX_PRIORITY(MINCLSYSPRI, \ + MAXCLSYSPRI, (mcip)->mci_min_pri); \ + } + +/* + * MAC client open entry point. Return a new MAC client handle. Each + * MAC client is associated with a name, specified through the 'name' + * argument. 
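 *
 * Minimal usage sketch (editor's example; "myclient" is a hypothetical
 * client name):
 *
 *	mac_client_handle_t mch;
 *	int err;
 *
 *	if ((err = mac_client_open(mh, &mch, "myclient", 0)) == 0) {
 *		... add a unicast address, set an rx callback, etc ...
 *		mac_client_close(mch, 0);
 *	}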
+ */ +int +mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, + uint16_t flags) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip; + int err = 0; + boolean_t share_desired = + ((flags & MAC_OPEN_FLAGS_SHARES_DESIRED) != 0); + boolean_t no_hwrings = ((flags & MAC_OPEN_FLAGS_NO_HWRINGS) != 0); + boolean_t req_hwrings = ((flags & MAC_OPEN_FLAGS_REQ_HWRINGS) != 0); + flow_entry_t *flent = NULL; + + *mchp = NULL; + if (share_desired && no_hwrings) { + /* can't have shares but no hardware rings */ + return (EINVAL); + } + + i_mac_perim_enter(mip); + + if (mip->mi_state_flags & MIS_IS_VNIC) { + /* + * The underlying MAC is a VNIC. Return the MAC client + * handle of the lower MAC which was obtained by + * the VNIC driver when it did its mac_client_open(). + */ + + mcip = mac_vnic_lower(mip); + /* + * If there are multiple MAC clients of the VNIC, they + * all share the same underlying MAC client handle. + */ + if ((flags & MAC_OPEN_FLAGS_TAG_DISABLE) != 0) + mcip->mci_state_flags |= MCIS_TAG_DISABLE; + + if ((flags & MAC_OPEN_FLAGS_STRIP_DISABLE) != 0) + mcip->mci_state_flags |= MCIS_STRIP_DISABLE; + + if ((flags & MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK) != 0) + mcip->mci_state_flags |= MCIS_DISABLE_TX_VID_CHECK; + + /* + * Note that multiple mac clients share the same mcip in + * this case. + */ + if (flags & MAC_OPEN_FLAGS_EXCLUSIVE) + mcip->mci_state_flags |= MCIS_EXCLUSIVE; + + mip->mi_clients_list = mcip; + i_mac_perim_exit(mip); + *mchp = (mac_client_handle_t)mcip; + return (err); + } + + mcip = kmem_cache_alloc(mac_client_impl_cache, KM_SLEEP); + + mcip->mci_mip = mip; + mcip->mci_upper_mip = NULL; + mcip->mci_rx_fn = mac_pkt_drop; + mcip->mci_rx_arg = NULL; + mcip->mci_direct_rx_fn = NULL; + mcip->mci_direct_rx_arg = NULL; + + if ((flags & MAC_OPEN_FLAGS_IS_VNIC) != 0) + mcip->mci_state_flags |= MCIS_IS_VNIC; + + if ((flags & MAC_OPEN_FLAGS_EXCLUSIVE) != 0) + mcip->mci_state_flags |= MCIS_EXCLUSIVE; + + if ((flags & MAC_OPEN_FLAGS_IS_AGGR_PORT) != 0) + mcip->mci_state_flags |= MCIS_IS_AGGR_PORT; + + if ((flags & MAC_OPEN_FLAGS_TAG_DISABLE) != 0) + mcip->mci_state_flags |= MCIS_TAG_DISABLE; + + if ((flags & MAC_OPEN_FLAGS_STRIP_DISABLE) != 0) + mcip->mci_state_flags |= MCIS_STRIP_DISABLE; + + if ((flags & MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK) != 0) + mcip->mci_state_flags |= MCIS_DISABLE_TX_VID_CHECK; + + if ((flags & MAC_OPEN_FLAGS_USE_DATALINK_NAME) != 0) { + datalink_id_t linkid; + + ASSERT(name == NULL); + if ((err = dls_devnet_macname2linkid(mip->mi_name, + &linkid)) != 0) { + goto done; + } + if ((err = dls_mgmt_get_linkinfo(linkid, mcip->mci_name, NULL, + NULL, NULL)) != 0) { + /* + * Use mac name if dlmgmtd is not available. 
+ */ + if (err == EBADF) { + (void) strlcpy(mcip->mci_name, mip->mi_name, + sizeof (mcip->mci_name)); + err = 0; + } else { + goto done; + } + } + mcip->mci_state_flags |= MCIS_USE_DATALINK_NAME; + } else { + ASSERT(name != NULL); + if (strlen(name) > MAXNAMELEN) { + err = EINVAL; + goto done; + } + (void) strlcpy(mcip->mci_name, name, sizeof (mcip->mci_name)); + } + /* the subflow table will be created dynamically */ + mcip->mci_subflow_tab = NULL; + mcip->mci_stat_multircv = 0; + mcip->mci_stat_brdcstrcv = 0; + mcip->mci_stat_multixmt = 0; + mcip->mci_stat_brdcstxmt = 0; + + mcip->mci_stat_obytes = 0; + mcip->mci_stat_opackets = 0; + mcip->mci_stat_oerrors = 0; + mcip->mci_stat_ibytes = 0; + mcip->mci_stat_ipackets = 0; + mcip->mci_stat_ierrors = 0; + + /* Create an initial flow */ + + err = mac_flow_create(NULL, NULL, mcip->mci_name, NULL, + mcip->mci_state_flags & MCIS_IS_VNIC ? FLOW_VNIC_MAC : + FLOW_PRIMARY_MAC, &flent); + if (err != 0) + goto done; + mcip->mci_flent = flent; + FLOW_MARK(flent, FE_MC_NO_DATAPATH); + flent->fe_mcip = mcip; + /* + * Place initial creation reference on the flow. This reference + * is released in the corresponding delete action viz. + * mac_unicast_remove after waiting for all transient refs to + * to go away. The wait happens in mac_flow_wait. + */ + FLOW_REFHOLD(flent); + + /* + * Do this ahead of the mac_bcast_add() below so that the mi_nclients + * will have the right value for mac_rx_srs_setup(). + */ + mac_client_add(mcip); + + mcip->mci_no_hwrings = no_hwrings; + mcip->mci_req_hwrings = req_hwrings; + mcip->mci_share = NULL; + if (share_desired) { + ASSERT(!no_hwrings); + i_mac_share_alloc(mcip); + } + + DTRACE_PROBE2(mac__client__open__allocated, mac_impl_t *, + mcip->mci_mip, mac_client_impl_t *, mcip); + *mchp = (mac_client_handle_t)mcip; + + i_mac_perim_exit(mip); + return (0); + +done: + i_mac_perim_exit(mip); + mcip->mci_state_flags = 0; + mcip->mci_tx_flag = 0; + kmem_cache_free(mac_client_impl_cache, mcip); + return (err); +} + +/* + * Close the specified MAC client handle. + */ +void +mac_client_close(mac_client_handle_t mch, uint16_t flags) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent; + + i_mac_perim_enter(mip); + + if (flags & MAC_CLOSE_FLAGS_EXCLUSIVE) + mcip->mci_state_flags &= ~MCIS_EXCLUSIVE; + + if ((mcip->mci_state_flags & MCIS_IS_VNIC) && + !(flags & MAC_CLOSE_FLAGS_IS_VNIC)) { + /* + * This is an upper VNIC client initiated operation. + * The lower MAC client will be closed by the VNIC driver + * when the VNIC is deleted. + */ + + /* + * Clear the flags set when the upper client initiated + * open. + */ + mcip->mci_state_flags &= ~(MCIS_TAG_DISABLE | + MCIS_STRIP_DISABLE | MCIS_DISABLE_TX_VID_CHECK); + + i_mac_perim_exit(mip); + return; + } + + /* + * Remove the flent associated with the MAC client + */ + flent = mcip->mci_flent; + mcip->mci_flent = NULL; + FLOW_FINAL_REFRELE(flent); + + /* + * MAC clients must remove the unicast addresses and promisc callbacks + * they added before issuing a mac_client_close(). + */ + ASSERT(mcip->mci_unicast_list == NULL); + ASSERT(mcip->mci_promisc_list == NULL); + ASSERT(mcip->mci_tx_notify_cb_list == NULL); + + i_mac_share_free(mcip); + + mac_client_remove(mcip); + + i_mac_perim_exit(mip); + mcip->mci_subflow_tab = NULL; + mcip->mci_state_flags = 0; + mcip->mci_tx_flag = 0; + kmem_cache_free(mac_client_impl_cache, mch); +} + +/* + * Enable bypass for the specified MAC client. 
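 *
 * Illustrative sketch (editor's example; my_direct_rx and my_arg are
 * hypothetical): a client such as an IP squeue can request bypass and
 * fall back to its regular delivery path when bypass is refused:
 *
 *	if (!mac_rx_bypass_set(mch, my_direct_rx, my_arg))
 *		... keep receiving through the mac_rx_set() callback ...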
+ */ +boolean_t +mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + /* + * If the mac_client is a VLAN or native media is non ethernet, we + * should not do DLS bypass and instead let the packets go via the + * default mac_rx_deliver route so vlan header can be stripped etc. + */ + if (mcip->mci_nvids > 0 || + mip->mi_info.mi_nativemedia != DL_ETHER) + return (B_FALSE); + + /* + * These are not accessed directly in the data path, and hence + * don't need any protection + */ + mcip->mci_direct_rx_fn = rx_fn; + mcip->mci_direct_rx_arg = arg1; + mcip->mci_state_flags |= MCIS_CLIENT_POLL_CAPABLE; + return (B_TRUE); +} + +/* + * Set the receive callback for the specified MAC client. There can be + * at most one such callback per MAC client. + */ +void +mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + + /* + * Instead of adding an extra set of locks and refcnts in + * the datapath at the mac client boundary, we temporarily quiesce + * the SRS and related entities. We then change the receive function + * without interference from any receive data thread and then reenable + * the data flow subsequently. + */ + i_mac_perim_enter(mip); + mac_rx_client_quiesce(mch); + + mcip->mci_rx_fn = rx_fn; + mcip->mci_rx_arg = arg; + mac_rx_client_restart(mch); + i_mac_perim_exit(mip); +} + +/* + * Reset the receive callback for the specified MAC client. + */ +void +mac_rx_clear(mac_client_handle_t mch) +{ + mac_rx_set(mch, mac_pkt_drop, NULL); +} + +/* + * Walk the MAC client subflow table and updates their priority values. + */ +static int +mac_update_subflow_priority_cb(flow_entry_t *flent, void *arg) +{ + mac_flow_update_priority(arg, flent); + return (0); +} + +void +mac_update_subflow_priority(mac_client_impl_t *mcip) +{ + (void) mac_flow_walk(mcip->mci_subflow_tab, + mac_update_subflow_priority_cb, mcip); +} + +/* + * When the MAC client is being brought up (i.e. we do a unicast_add) we need + * to initialize the cpu and resource control structure in the + * mac_client_impl_t from the mac_impl_t (i.e if there are any cached + * properties before the flow entry for the unicast address was created). + */ +int +mac_resource_ctl_set(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = (mac_impl_t *)mcip->mci_mip; + int err = 0; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + err = mac_validate_props(mrp); + if (err != 0) + return (err); + + mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip), B_FALSE); + if (MCIP_DATAPATH_SETUP(mcip)) { + /* + * We have to set this prior to calling mac_flow_modify. 
+ */ + if (mrp->mrp_mask & MRP_PRIORITY) { + if (mrp->mrp_priority == MPL_RESET) { + MAC_CLIENT_SET_PRIORITY_RANGE(mcip, + MPL_LINK_DEFAULT); + } else { + MAC_CLIENT_SET_PRIORITY_RANGE(mcip, + mrp->mrp_priority); + } + } + + mac_flow_modify(mip->mi_flow_tab, mcip->mci_flent, mrp); + if (mrp->mrp_mask & MRP_PRIORITY) + mac_update_subflow_priority(mcip); + return (0); + } + return (0); +} + +void +mac_resource_ctl_get(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); + + bcopy(mcip_mrp, mrp, sizeof (mac_resource_props_t)); +} + +static int +mac_unicast_flow_create(mac_client_impl_t *mcip, uint8_t *mac_addr, + uint16_t vid, boolean_t is_primary, boolean_t first_flow, + flow_entry_t **flent, mac_resource_props_t *mrp) +{ + mac_impl_t *mip = (mac_impl_t *)mcip->mci_mip; + flow_desc_t flow_desc; + char flowname[MAXFLOWNAME]; + int err; + uint_t flent_flags; + + /* + * First unicast address being added, create a new flow + * for that MAC client. + */ + bzero(&flow_desc, sizeof (flow_desc)); + + flow_desc.fd_mac_len = mip->mi_type->mt_addr_length; + bcopy(mac_addr, flow_desc.fd_dst_mac, flow_desc.fd_mac_len); + flow_desc.fd_mask = FLOW_LINK_DST; + if (vid != 0) { + flow_desc.fd_vid = vid; + flow_desc.fd_mask |= FLOW_LINK_VID; + } + + /* + * XXX-nicolas. For now I'm keeping the FLOW_PRIMARY_MAC + * and FLOW_VNIC. Even though they're a hack inherited + * from the SRS code, we'll keep them for now. They're currently + * consumed by mac_datapath_setup() to create the SRS. + * That code should be eventually moved out of + * mac_datapath_setup() and moved to a mac_srs_create() + * function of some sort to keep things clean. + * + * Also, there's no reason why the SRS for the primary MAC + * client should be different than any other MAC client. Until + * this is cleaned-up, we support only one MAC unicast address + * per client. + * + * We set FLOW_PRIMARY_MAC for the primary MAC address, + * FLOW_VNIC for everything else. + */ + if (is_primary) + flent_flags = FLOW_PRIMARY_MAC; + else + flent_flags = FLOW_VNIC_MAC; + + /* + * For the first flow we use the mac client's name - mci_name, for + * subsequent ones we just create a name with the vid. This is + * so that we can add these flows to the same flow table. This is + * fine as the flow name (except for the one with the mac client's + * name) is not visible. When the first flow is removed, we just replace + * its fdesc with another from the list, so we will still retain the + * flent with the MAC client's flow name. + */ + if (first_flow) { + bcopy(mcip->mci_name, flowname, MAXFLOWNAME); + } else { + (void) sprintf(flowname, "%s%u", mcip->mci_name, vid); + flent_flags = FLOW_NO_STATS; + } + + if ((err = mac_flow_create(&flow_desc, mrp, flowname, NULL, + flent_flags, flent)) != 0) + return (err); + + FLOW_MARK(*flent, FE_INCIPIENT); + (*flent)->fe_mcip = mcip; + + /* + * Place initial creation reference on the flow. This reference + * is released in the corresponding delete action viz. + * mac_unicast_remove after waiting for all transient refs to + * to go away. The wait happens in mac_flow_wait. + * We have already held the reference in mac_client_open(). + */ + if (!first_flow) + FLOW_REFHOLD(*flent); + return (0); +} + +/* Refresh the multicast grouping for this VID. 
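 *
 * For illustration (editor's note): this function is passed as a
 * callback to mac_client_bcast_refresh(), as in the call sites that
 * appear later in this file:
 *
 *	mac_client_bcast_refresh(mcip, mac_client_update_mcast,
 *	    (void *)flent, B_TRUE);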
*/ +int +mac_client_update_mcast(void *arg, boolean_t add, const uint8_t *addrp) +{ + flow_entry_t *flent = arg; + mac_client_impl_t *mcip = flent->fe_mcip; + uint16_t vid; + flow_desc_t flow_desc; + + mac_flow_get_desc(flent, &flow_desc); + vid = (flow_desc.fd_mask & FLOW_LINK_VID) != 0 ? + flow_desc.fd_vid : VLAN_ID_NONE; + + /* + * We don't call mac_multicast_add()/mac_multicast_remove() as + * we want to add/remove for this specific vid. + */ + if (add) { + return (mac_bcast_add(mcip, addrp, vid, + MAC_ADDRTYPE_MULTICAST)); + } else { + mac_bcast_delete(mcip, addrp, vid); + return (0); + } +} + +/* + * Add a new unicast address to the MAC client. + * + * The MAC address can be specified either by value, or the MAC client + * can specify that it wants to use the primary MAC address of the + * underlying MAC. See the introductory comments at the beginning + * of this file for more information on primary MAC addresses. + * + * Note also that the tuple (MAC address, VID) must be unique + * for the MAC clients defined on top of the same underlying MAC + * instance, unless MAC_UNICAST_NODUPCHECK is specified. + */ + +int +i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, + mac_unicast_handle_t *mah, uint16_t vid, mac_diag_t *diag) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + mac_unicast_impl_t *muip; + flow_entry_t *flent; + int err; + uint_t mac_len = mip->mi_type->mt_addr_length; + boolean_t check_dups = !(flags & MAC_UNICAST_NODUPCHECK); + boolean_t is_primary = (flags & MAC_UNICAST_PRIMARY); + boolean_t is_vnic_primary = flags & MAC_UNICAST_VNIC_PRIMARY; + boolean_t bcast_added = B_FALSE; + boolean_t nactiveclients_added = B_FALSE; + boolean_t mac_started = B_FALSE; + mac_resource_props_t mrp; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + /* when VID is non-zero, the underlying MAC cannot be a VNIC */ + ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != 0))); + + /* + * Check whether it's the primary client and flag it. + */ + if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && vid == 0) + mcip->mci_flags |= MAC_CLIENT_FLAGS_PRIMARY; + + /* + * is_vnic_primary is true when we come here as a VLAN VNIC + * which uses the primary mac client's address but with a non-zero + * VID. In this case the MAC address is not specified by an upper + * MAC client. + */ + if ((mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && + !is_vnic_primary) { + /* + * The address is being set by the upper MAC client + * of a VNIC. The MAC address was already set by the + * VNIC driver during VNIC creation. + * + * Note: a VNIC has only one MAC address. We return + * the MAC unicast address handle of the lower MAC client + * corresponding to the VNIC. We allocate a new entry + * which is flagged appropriately, so that mac_unicast_remove() + * doesn't attempt to free the original entry that + * was allocated by the VNIC driver. + */ + ASSERT(mcip->mci_unicast != NULL); + + /* + * Ensure that the primary unicast address of the VNIC + * is added only once. + */ + if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY) + return (EBUSY); + + mcip->mci_flags |= MAC_CLIENT_FLAGS_VNIC_PRIMARY; + + /* + * Create a handle for vid 0.
+ */ + ASSERT(vid == 0); + muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP); + muip->mui_vid = vid; + *mah = (mac_unicast_handle_t)muip; + return (0); + } + + /* primary MAC clients cannot be opened on top of anchor VNICs */ + if ((is_vnic_primary || is_primary) && + i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_ANCHOR_VNIC, NULL)) { + return (ENXIO); + } + + /* + * Return EBUSY if: + * - this is an exclusive active mac client and there already exist + * active mac clients, or + * - there already exists an exclusively active mac client. + */ + if ((mcip->mci_state_flags & MCIS_EXCLUSIVE) && + (mip->mi_nactiveclients != 0) || (mip->mi_state_flags & + MIS_EXCLUSIVE)) { + return (EBUSY); + } + + if (mcip->mci_state_flags & MCIS_EXCLUSIVE) + mip->mi_state_flags |= MIS_EXCLUSIVE; + + bzero(&mrp, sizeof (mac_resource_props_t)); + if (is_primary && !(mcip->mci_state_flags & MCIS_IS_VNIC)) { + /* + * Apply the property cached in the mac_impl_t to the primary + * mac client. If the mac client is a VNIC, its properties were + * already set in the mcip when the VNIC was created. + */ + mac_get_resources((mac_handle_t)mip, &mrp); + (void) mac_client_set_resources(mch, &mrp); + } else if (mcip->mci_state_flags & MCIS_IS_VNIC) { + bcopy(MCIP_RESOURCE_PROPS(mcip), &mrp, + sizeof (mac_resource_props_t)); + } + + muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP); + muip->mui_vid = vid; + + if (is_primary || is_vnic_primary) { + mac_addr = mip->mi_addr; + check_dups = B_TRUE; + } else { + + /* + * Verify the validity of the specified MAC address value. + */ + if (!mac_unicst_verify((mac_handle_t)mip, mac_addr, mac_len)) { + *diag = MAC_DIAG_MACADDR_INVALID; + err = EINVAL; + goto bail; + } + + /* + * Make sure that the specified MAC address is different + * from the unicast MAC address of the underlying NIC. + */ + if (check_dups && bcmp(mip->mi_addr, mac_addr, mac_len) == 0) { + *diag = MAC_DIAG_MACADDR_NIC; + err = EINVAL; + goto bail; + } + } + + /* + * Make sure the MAC address is not already used by + * another MAC client defined on top of the same + * underlying NIC. + * xxx-venu mac_unicast_add doesn't seem to be called + * with MAC_UNICAST_NODUPCHECK currently, if it does + * get called we need to do mac_addr_in_use() just + * to check for addr_in_use till 6697876 is fixed. + */ + if (check_dups && mac_addr_in_use(mip, mac_addr, vid)) { + *diag = MAC_DIAG_MACADDR_INUSE; + err = EEXIST; + goto bail; + } + + if ((err = mac_start(mip)) != 0) + goto bail; + + mac_started = B_TRUE; + + /* add the MAC client to the broadcast address group by default */ + if (mip->mi_type->mt_brdcst_addr != NULL) { + err = mac_bcast_add(mcip, mip->mi_type->mt_brdcst_addr, vid, + MAC_ADDRTYPE_BROADCAST); + if (err != 0) + goto bail; + bcast_added = B_TRUE; + } + flent = mcip->mci_flent; + ASSERT(flent != NULL); + /* We are configuring the unicast flow now */ + if (!MCIP_DATAPATH_SETUP(mcip)) { + + MAC_CLIENT_SET_PRIORITY_RANGE(mcip, + (mrp.mrp_mask & MRP_PRIORITY) ? mrp.mrp_priority : + MPL_LINK_DEFAULT); + + if ((err = mac_unicast_flow_create(mcip, mac_addr, vid, + is_primary || is_vnic_primary, B_TRUE, &flent, &mrp)) != 0) + goto bail; + + mip->mi_nactiveclients++; + nactiveclients_added = B_TRUE; + /* + * This will allocate the RX ring group if possible for the + * flow and program the software classifier as needed. + */ + if ((err = mac_datapath_setup(mcip, flent, SRST_LINK)) != 0) + goto bail; + + /* + * The unicast MAC address must have been added successfully.
+ */ + ASSERT(mcip->mci_unicast != NULL); + } else { + mac_address_t *map = mcip->mci_unicast; + + /* + * A unicast flow already exists for that MAC client, + * this flow must have the same MAC address but a + * different VID. It has been checked by mac_addr_in_use(). + * + * We will use the SRS etc. from the mci_flent. Note that + * we don't need to create a kstat for this as, except for + * the fdesc, everything will be used from the 1st flent. + */ + + if (bcmp(mac_addr, map->ma_addr, map->ma_len) != 0) { + err = EINVAL; + goto bail; + } + + if ((err = mac_unicast_flow_create(mcip, mac_addr, vid, + is_primary || is_vnic_primary, B_FALSE, &flent, NULL)) != 0) + goto bail; + + if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0) { + FLOW_FINAL_REFRELE(flent); + goto bail; + } + + /* update the multicast group for this vid */ + mac_client_bcast_refresh(mcip, mac_client_update_mcast, + (void *)flent, B_TRUE); + + } + + /* populate the shared MAC address */ + muip->mui_map = mcip->mci_unicast; + + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + muip->mui_next = mcip->mci_unicast_list; + mcip->mci_unicast_list = muip; + rw_exit(&mcip->mci_rw_lock); + + *mah = (mac_unicast_handle_t)muip; + + /* add it to the flow list of this mcip */ + mac_client_add_to_flow_list(mcip, flent); + + /* + * Trigger a renegotiation of the capabilities when the number of + * active clients changes from 1 to 2, since some of the capabilities + * might have to be disabled. Also send a MAC_NOTE_LINK notification + * to all the MAC clients whenever the physical link is DOWN. + */ + if (mip->mi_nactiveclients == 2) { + mac_capab_update((mac_handle_t)mip); + mac_virtual_link_update(mip); + } + /* + * Now that the setup is complete, clear the INCIPIENT flag. + * The flag was set to avoid incoming packets seeing inconsistent + * structures while the setup was in progress. Clear the mci_tx_flag + * by calling mac_tx_client_unblock. It is possible that + * mac_unicast_remove was called prior to this mac_unicast_add which + * could have set the MCI_TX_QUIESCE flag. + */ + if (flent->fe_rx_ring_group != NULL) + mac_rx_group_unmark(flent->fe_rx_ring_group, MR_INCIPIENT); + FLOW_UNMARK(flent, FE_INCIPIENT); + FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + mac_tx_client_unblock(mcip); + return (0); +bail: + if (bcast_added) + mac_bcast_delete(mcip, mip->mi_type->mt_brdcst_addr, vid); + if (mac_started) + mac_stop(mip); + + if (nactiveclients_added) + mip->mi_nactiveclients--; + if (mcip->mci_state_flags & MCIS_EXCLUSIVE) + mip->mi_state_flags &= ~MIS_EXCLUSIVE; + kmem_free(muip, sizeof (mac_unicast_impl_t)); + return (err); +} + +int +mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, + mac_unicast_handle_t *mah, uint16_t vid, mac_diag_t *diag) +{ + mac_impl_t *mip = ((mac_client_impl_t *)mch)->mci_mip; + uint_t err; + + i_mac_perim_enter(mip); + err = i_mac_unicast_add(mch, mac_addr, flags, mah, vid, diag); + i_mac_perim_exit(mip); + + return (err); +} + +/* + * Add the primary MAC address to the MAC client. This is a convenience + * function which can be called by primary MAC clients which do not + * need to specify any other additional flags.
+ * + * It's called in one of the following situations: + * * dls as the primary MAC client + * * aggr as an exclusive client + * * by VNIC's client + */ +int +mac_unicast_primary_add(mac_client_handle_t mch, mac_unicast_handle_t *mah, + mac_diag_t *diag) +{ + return (mac_unicast_add(mch, NULL, MAC_UNICAST_PRIMARY, mah, 0, diag)); +} + +/* + * Remove a MAC address which was previously added by mac_unicast_add(). + */ +int +mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_unicast_impl_t *muip = (mac_unicast_impl_t *)mah; + mac_unicast_impl_t *pre; + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent; + + i_mac_perim_enter(mip); + if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY) { + /* + * Call made by the upper MAC client of a VNIC. + * There's not much to do: the unicast address will + * be removed by the VNIC driver when the VNIC is deleted, + * but let's ensure that all our transmit is done before + * the client does a mac_client_stop lest it trigger an + * assert in the driver. + */ + ASSERT(muip->mui_vid == 0); + + mac_tx_client_flush(mcip); + mcip->mci_flags &= ~MAC_CLIENT_FLAGS_VNIC_PRIMARY; + + kmem_free(muip, sizeof (mac_unicast_impl_t)); + i_mac_perim_exit(mip); + return (0); + } + + ASSERT(muip != NULL); + + /* + * Remove the VID from the list of client's VIDs. + */ + pre = mcip->mci_unicast_list; + if (muip == pre) + mcip->mci_unicast_list = muip->mui_next; + else { + while ((pre->mui_next != NULL) && (pre->mui_next != muip)) + pre = pre->mui_next; + ASSERT(pre->mui_next == muip); + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + pre->mui_next = muip->mui_next; + rw_exit(&mcip->mci_rw_lock); + } + + if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && muip->mui_vid == 0) + mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY; + + /* + * This MAC client is shared, so we will just remove the flent + * corresponding to the address being removed. We don't invoke + * mac_rx_classify_flow_rem() since the additional flow is + * not associated with its own separate set of SRS and rings, + * and these constructs are still needed for the remaining flows. + */ + if (!mac_client_single_rcvr(mcip)) { + flent = mac_client_get_flow(mcip, muip); + ASSERT(flent != NULL); + + /* + * The first one is disappearing, need to make sure + * we replace it with another from the list of + * shared clients. + */ + if (flent == mcip->mci_flent) + flent = mac_client_swap_mciflent(mcip); + mac_client_remove_flow_from_list(mcip, flent); + mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE); + mac_flow_wait(flent, FLOW_DRIVER_UPCALL); + + /* + * The multicast groups that were added by the client so + * far must be removed from the broadcast domain corresponding + * to the VID being removed. + */ + mac_client_bcast_refresh(mcip, mac_client_update_mcast, + (void *)flent, B_FALSE); + + if (mip->mi_type->mt_brdcst_addr != NULL) { + mac_bcast_delete(mcip, mip->mi_type->mt_brdcst_addr, + muip->mui_vid); + } + mac_stop(mip); + FLOW_FINAL_REFRELE(flent); + i_mac_perim_exit(mip); + return (0); + } + + mip->mi_nactiveclients--; + + /* Tear down the Data path */ + mac_datapath_teardown(mcip, mcip->mci_flent, SRST_LINK); + + /* + * Prevent any future access to the flow entry through the mci_flent + * pointer by setting the mci_flent to NULL. Access to mci_flent in + * mac_bcast_send is also under mi_rw_lock.
+ */ + rw_enter(&mip->mi_rw_lock, RW_WRITER); + flent = mcip->mci_flent; + mac_client_remove_flow_from_list(mcip, flent); + + if (mcip->mci_state_flags & MCIS_DESC_LOGGED) + mcip->mci_state_flags &= ~MCIS_DESC_LOGGED; + + /* + * This is the last unicast address being removed and there shouldn't + * be any outbound data threads at this point coming down from mac + * clients. We have waited for the data threads to finish before + * starting dld_str_detach. Non-data threads must access TX SRS + * under mi_rw_lock. + */ + rw_exit(&mip->mi_rw_lock); + + /* + * Update the multicast group for this vid. + */ + mac_client_bcast_refresh(mcip, mac_client_update_mcast, (void *)flent, + B_FALSE); + + /* + * Don't use FLOW_MARK with FE_MC_NO_DATAPATH, as the flow might + * contain other flags, such as FE_CONDEMNED, which we need to + * clear. We don't call mac_flow_cleanup() for this unicast + * flow as we have already cleaned up the SRSs etc. (via the teardown + * path). We just clear the stats and reset the initial callback + * function, the rest will be set when we call mac_flow_create, + * if at all. + */ + mutex_enter(&flent->fe_lock); + ASSERT(flent->fe_refcnt == 1 && flent->fe_mbg == NULL && + flent->fe_tx_srs == NULL && flent->fe_rx_srs_cnt == 0); + flent->fe_flags = FE_MC_NO_DATAPATH; + flow_stat_destroy(flent); + + /* Initialize the receiver function to a safe routine */ + flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_arg1 = NULL; + flent->fe_cb_arg2 = NULL; + + flent->fe_index = -1; + mutex_exit(&flent->fe_lock); + + if (mip->mi_type->mt_brdcst_addr != NULL) { + mac_bcast_delete(mcip, mip->mi_type->mt_brdcst_addr, + muip->mui_vid); + } + + if (mip->mi_nactiveclients == 1) { + mac_capab_update((mac_handle_t)mip); + mac_virtual_link_update(mip); + } + if (mcip->mci_state_flags & MCIS_EXCLUSIVE) + mip->mi_state_flags &= ~MIS_EXCLUSIVE; + + mac_stop(mip); + + i_mac_perim_exit(mip); + kmem_free(muip, sizeof (mac_unicast_impl_t)); + return (0); +} + +/* + * Multicast add function invoked by MAC clients. + */ +int +mac_multicast_add(mac_client_handle_t mch, const uint8_t *addr) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent = mcip->mci_flent_list; + flow_entry_t *prev_fe = NULL; + uint16_t vid; + int err = 0; + + /* Verify the address is a valid multicast address */ + if ((err = mip->mi_type->mt_ops.mtops_multicst_verify(addr, + mip->mi_pdata)) != 0) + return (err); + + i_mac_perim_enter(mip); + while (flent != NULL) { + vid = i_mac_flow_vid(flent); + + err = mac_bcast_add((mac_client_impl_t *)mch, addr, vid, + MAC_ADDRTYPE_MULTICAST); + if (err != 0) + break; + prev_fe = flent; + flent = flent->fe_client_next; + } + + /* + * If adding failed, undo everything rather than leave a + * partial success. + */ + if (flent != NULL && prev_fe != NULL) { + flent = mcip->mci_flent_list; + while (flent != prev_fe->fe_client_next) { + vid = i_mac_flow_vid(flent); + mac_bcast_delete((mac_client_impl_t *)mch, addr, vid); + flent = flent->fe_client_next; + } + } + i_mac_perim_exit(mip); + return (err); +} + +/* + * Multicast delete function invoked by MAC clients.
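 *
 * Usage sketch (editor's example; addr points to a multicast address
 * valid for the media type):
 *
 *	if (mac_multicast_add(mch, addr) == 0) {
 *		... receive traffic for that group ...
 *		mac_multicast_remove(mch, addr);
 *	}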
+ */ +void +mac_multicast_remove(mac_client_handle_t mch, const uint8_t *addr) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + flow_entry_t *flent; + uint16_t vid; + + i_mac_perim_enter(mip); + for (flent = mcip->mci_flent_list; flent != NULL; + flent = flent->fe_client_next) { + vid = i_mac_flow_vid(flent); + mac_bcast_delete((mac_client_impl_t *)mch, addr, vid); + } + i_mac_perim_exit(mip); +} + +/* + * When a MAC client desires to capture packets on an interface, + * it registers a promiscuous callback with mac_promisc_add(). + * There are three types of promiscuous callbacks: + * + * * MAC_CLIENT_PROMISC_ALL + * Captures all packets sent and received by the MAC client, + * the physical interface, as well as all other MAC clients + * defined on top of the same MAC. + * + * * MAC_CLIENT_PROMISC_FILTERED + * Captures all packets sent and received by the MAC client, + * plus all multicast traffic sent and received by the physical + * interface and the other MAC clients. + * + * * MAC_CLIENT_PROMISC_MULTI + * Captures all broadcast and multicast packets sent and + * received by the MAC clients as well as the physical interface. + * + * In all cases, the underlying MAC is put in promiscuous mode. + */ +int +mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, + mac_rx_t fn, void *arg, mac_promisc_handle_t *mphp, uint16_t flags) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + mac_promisc_impl_t *mpip; + mac_cb_info_t *mcbi; + int rc; + + i_mac_perim_enter(mip); + + if ((rc = mac_start(mip)) != 0) { + i_mac_perim_exit(mip); + return (rc); + } + + if ((mcip->mci_state_flags & MCIS_IS_VNIC) && + type == MAC_CLIENT_PROMISC_ALL) { + /* + * The function is being invoked by the upper MAC client + * of a VNIC. The VNIC should only see the traffic + * it is entitled to. + */ + type = MAC_CLIENT_PROMISC_FILTERED; + } + + + /* + * Turn on promiscuous mode for the underlying NIC. + * This is needed even for filtered callbacks which + * expect to receive all multicast traffic on the wire. + * + * Physical promiscuous mode should not be turned on if + * MAC_PROMISC_FLAGS_NO_PHYS is set. + */ + if ((flags & MAC_PROMISC_FLAGS_NO_PHYS) == 0) { + if ((rc = i_mac_promisc_set(mip, B_TRUE, MAC_DEVPROMISC)) + != 0) { + mac_stop(mip); + i_mac_perim_exit(mip); + return (rc); + } + } + + mpip = kmem_cache_alloc(mac_promisc_impl_cache, KM_SLEEP); + + mpip->mpi_type = type; + mpip->mpi_fn = fn; + mpip->mpi_arg = arg; + mpip->mpi_mcip = mcip; + mpip->mpi_no_tx_loop = ((flags & MAC_PROMISC_FLAGS_NO_TX_LOOP) != 0); + mpip->mpi_no_phys = ((flags & MAC_PROMISC_FLAGS_NO_PHYS) != 0); + + mcbi = &mip->mi_promisc_cb_info; + mutex_enter(mcbi->mcbi_lockp); + + mac_callback_add(&mip->mi_promisc_cb_info, &mcip->mci_promisc_list, + &mpip->mpi_mci_link); + mac_callback_add(&mip->mi_promisc_cb_info, &mip->mi_promisc_list, + &mpip->mpi_mi_link); + + mutex_exit(mcbi->mcbi_lockp); + + *mphp = (mac_promisc_handle_t)mpip; + i_mac_perim_exit(mip); + return (0); +} + +/* + * Remove a promiscuous callback previously added through mac_promisc_add(). + */ +int +mac_promisc_remove(mac_promisc_handle_t mph) +{ + mac_promisc_impl_t *mpip = (mac_promisc_impl_t *)mph; + mac_client_impl_t *mcip = mpip->mpi_mcip; + mac_impl_t *mip = mcip->mci_mip; + mac_cb_info_t *mcbi; + int rc = 0; + + i_mac_perim_enter(mip); + + /* + * Even if the device can't be reset into normal mode, we still + * need to clear the client promisc callbacks.
The client may want + * to close the mac end point and we can't have stale callbacks. + */ + if (!(mpip->mpi_no_phys)) { + rc = mac_promisc_set((mac_handle_t)mip, B_FALSE, + MAC_DEVPROMISC); + if (rc != 0) + goto done; + } + mcbi = &mip->mi_promisc_cb_info; + mutex_enter(mcbi->mcbi_lockp); + if (mac_callback_remove(mcbi, &mip->mi_promisc_list, + &mpip->mpi_mi_link)) { + VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info, + &mcip->mci_promisc_list, &mpip->mpi_mci_link)); + kmem_cache_free(mac_promisc_impl_cache, mpip); + } else { + mac_callback_remove_wait(&mip->mi_promisc_cb_info); + } + mutex_exit(mcbi->mcbi_lockp); + mac_stop(mip); + +done: + i_mac_perim_exit(mip); + return (rc); +} + +/* + * Reference count the number of active Tx threads. MCI_TX_QUIESCE indicates + * that a control operation wants to quiesce the Tx data flow in which case + * we return an error. Holding any of the per cpu locks ensures that the + * mci_tx_flag won't change. + * + * 'CPU' must be accessed just once and used to compute the index into the + * percpu array, and that index must be used for the entire duration of the + * packet send operation. Note that the thread may be preempted and run on + * another cpu any time and so we can't use 'CPU' more than once for the + * operation. + */ +#define MAC_TX_TRY_HOLD(mcip, mytx, error) \ +{ \ + (error) = 0; \ + (mytx) = &(mcip)->mci_tx_pcpu[CPU->cpu_seqid & mac_tx_percpu_cnt]; \ + mutex_enter(&(mytx)->pcpu_tx_lock); \ + if (!((mcip)->mci_tx_flag & MCI_TX_QUIESCE)) { \ + (mytx)->pcpu_tx_refcnt++; \ + } else { \ + (error) = -1; \ + } \ + mutex_exit(&(mytx)->pcpu_tx_lock); \ +} + +/* + * Release the reference. If needed, signal any control operation waiting + * for Tx quiescence. The wait and signal are always done using the + * mci_tx_pcpu[0]'s lock + */ +#define MAC_TX_RELE(mcip, mytx) { \ + mutex_enter(&(mytx)->pcpu_tx_lock); \ + if (--(mytx)->pcpu_tx_refcnt == 0 && \ + (mcip)->mci_tx_flag & MCI_TX_QUIESCE) { \ + mutex_exit(&(mytx)->pcpu_tx_lock); \ + mutex_enter(&(mcip)->mci_tx_pcpu[0].pcpu_tx_lock); \ + cv_signal(&(mcip)->mci_tx_cv); \ + mutex_exit(&(mcip)->mci_tx_pcpu[0].pcpu_tx_lock); \ + } else { \ + mutex_exit(&(mytx)->pcpu_tx_lock); \ + } \ +} + +/* + * Bump the count of the number of active Tx threads. This is maintained as + * a per CPU counter. On (CMT kind of) machines with large number of CPUs, + * a single mci_tx_lock may become contended. However a count of the total + * number of Tx threads per client is needed in order to quiesce the Tx side + * prior to reassigning a Tx ring dynamically to another client. The thread + * that needs to quiesce the Tx traffic grabs all the percpu locks and checks + * the sum of the individual percpu refcnts. Each Tx data thread only grabs + * its own percpu lock and increments its own refcnt. + */ +void * +mac_tx_hold(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_tx_percpu_t *mytx; + int error; + + MAC_TX_TRY_HOLD(mcip, mytx, error); + return (error == 0 ? (void *)mytx : NULL); +} + +void +mac_tx_rele(mac_client_handle_t mch, void *mytx_handle) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_tx_percpu_t *mytx = mytx_handle; + + MAC_TX_RELE(mcip, mytx) +} + +/* + * Send function invoked by MAC clients. 
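 *
 * Minimal transmit sketch (editor's example; mp is a hypothetical
 * mblk_t chain owned by the caller):
 *
 *	mac_tx_cookie_t cookie;
 *	mblk_t *ret_mp = NULL;
 *
 *	cookie = mac_tx(mch, mp, 0, 0, &ret_mp);
 *
 * A non-NULL cookie suggests the chain could not be fully sent and may
 * be used with mac_tx_is_flow_blocked() below.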
+ */ +mac_tx_cookie_t +mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, + uint16_t flag, mblk_t **ret_mp) +{ + mac_tx_cookie_t cookie; + int error; + mac_tx_percpu_t *mytx; + mac_soft_ring_set_t *srs; + flow_entry_t *flent; + boolean_t is_subflow = B_FALSE; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + mac_srs_tx_t *srs_tx; + + /* + * Check whether the active Tx threads count is bumped already. + */ + if (!(flag & MAC_TX_NO_HOLD)) { + MAC_TX_TRY_HOLD(mcip, mytx, error); + if (error != 0) { + freemsgchain(mp_chain); + return (NULL); + } + } + + if (mcip->mci_subflow_tab != NULL && + mcip->mci_subflow_tab->ft_flow_count > 0 && + mac_flow_lookup(mcip->mci_subflow_tab, mp_chain, + FLOW_OUTBOUND, &flent) == 0) { + /* + * The main assumption here is that in the event + * we get a chain, all the packets will be classified + * to the same Flow/SRS. If this changes for any + * reason, the following logic should change as well. + * I suppose the fanout_hint also assumes this. + */ + ASSERT(flent != NULL); + is_subflow = B_TRUE; + } else { + flent = mcip->mci_flent; + } + + srs = flent->fe_tx_srs; + srs_tx = &srs->srs_tx; + if (srs_tx->st_mode == SRS_TX_DEFAULT && + (srs->srs_state & SRS_ENQUEUED) == 0 && + mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL && + mp_chain->b_next == NULL) { + uint64_t obytes; + + /* + * Since dls always opens the underlying MAC, nclients equal + * to 1 means that the only active client is dls itself acting + * as a primary client of the MAC instance. Since dls will not + * send tagged packets in that case, and dls is trusted to send + * packets for its allowed VLAN(s), the VLAN tag insertion and + * check is required only if nclients is greater than 1. + */ + if (mip->mi_nclients > 1) { + if (MAC_VID_CHECK_NEEDED(mcip)) { + int err = 0; + + MAC_VID_CHECK(mcip, mp_chain, err); + if (err != 0) { + freemsg(mp_chain); + mcip->mci_stat_oerrors++; + goto done; + } + } + if (MAC_TAG_NEEDED(mcip)) { + mp_chain = mac_add_vlan_tag(mp_chain, 0, + mac_client_vid(mch)); + if (mp_chain == NULL) { + mcip->mci_stat_oerrors++; + goto done; + } + } + } + + obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) : + msgdsize(mp_chain)); + + MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip); + + if (mp_chain == NULL) { + cookie = NULL; + mcip->mci_stat_obytes += obytes; + mcip->mci_stat_opackets += 1; + if ((srs->srs_type & SRST_FLOW) != 0) { + FLOW_STAT_UPDATE(flent, obytes, obytes); + FLOW_STAT_UPDATE(flent, opackets, 1); + } + } else { + mutex_enter(&srs->srs_lock); + cookie = mac_tx_srs_no_desc(srs, mp_chain, + flag, ret_mp); + mutex_exit(&srs->srs_lock); + } + } else { + cookie = srs_tx->st_func(srs, mp_chain, hint, flag, ret_mp); + } + +done: + if (is_subflow) + FLOW_REFRELE(flent); + + if (!(flag & MAC_TX_NO_HOLD)) + MAC_TX_RELE(mcip, mytx); + + return (cookie); +} + +/* + * mac_tx_is_flow_blocked + * + * Given a cookie, it returns whether the ring identified by the cookie is + * flow-controlled or not (this is not implemented yet). If NULL is + * passed in place of a cookie, then it finds out if any of the + * underlying rings belonging to the SRS is flow controlled or not + * and returns that status.
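 *
 * Illustrative use (editor's sketch) by a sender honoring flow
 * control:
 *
 *	if (mac_tx_is_flow_blocked(mch, NULL))
 *		... defer further transmits until the SRS drains ...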
+ */ +/* ARGSUSED */ +boolean_t +mac_tx_is_flow_blocked(mac_client_handle_t mch, mac_tx_cookie_t cookie) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_soft_ring_set_t *mac_srs = MCIP_TX_SRS(mcip); + mac_soft_ring_t *sringp; + boolean_t blocked = B_FALSE; + int i; + + /* + * On etherstubs, there won't be a Tx SRS or an Rx + * SRS. In fact, there won't even be a flow_entry. + */ + if (mac_srs == NULL) + return (B_FALSE); + + mutex_enter(&mac_srs->srs_lock); + if (mac_srs->srs_tx.st_mode == SRS_TX_FANOUT) { + for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { + sringp = mac_srs->srs_oth_soft_rings[i]; + mutex_enter(&sringp->s_ring_lock); + if (sringp->s_ring_state & S_RING_TX_HIWAT) { + blocked = B_TRUE; + mutex_exit(&sringp->s_ring_lock); + break; + } + mutex_exit(&sringp->s_ring_lock); + } + } else { + blocked = (mac_srs->srs_state & SRS_TX_HIWAT); + } + mutex_exit(&mac_srs->srs_lock); + return (blocked); +} + +/* + * Check if the MAC client is the primary MAC client. + */ +boolean_t +mac_is_primary_client(mac_client_impl_t *mcip) +{ + return (mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY); +} + +void +mac_ioctl(mac_handle_t mh, queue_t *wq, mblk_t *bp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + int cmd = ((struct iocblk *)bp->b_rptr)->ioc_cmd; + + if ((cmd == ND_GET && (mip->mi_callbacks->mc_callbacks & MC_GETPROP)) || + (cmd == ND_SET && (mip->mi_callbacks->mc_callbacks & MC_SETPROP))) { + /* + * If ndd props were registered, call them. + * Note that ndd ioctls are obsolete. + */ + mac_ndd_ioctl(mip, wq, bp); + return; + } + + /* + * Call the driver to handle the ioctl. The driver may not support + * any ioctls, in which case we reply with a NAK on its behalf. + */ + if (mip->mi_callbacks->mc_callbacks & MC_IOCTL) + mip->mi_ioctl(mip->mi_driver, wq, bp); + else + miocnak(wq, bp, 0, EINVAL); +} + +/* + * Return the link state of the specified MAC instance. + */ +link_state_t +mac_link_get(mac_handle_t mh) +{ + return (((mac_impl_t *)mh)->mi_linkstate); +} + +/* + * Add a mac client specified notification callback. Please see the comments + * above mac_callback_add() for general information about mac callback + * addition/deletion in the presence of mac callback list walkers. + */ +mac_notify_handle_t +mac_notify_add(mac_handle_t mh, mac_notify_t notify_fn, void *arg) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_notify_cb_t *mncb; + mac_cb_info_t *mcbi; + + /* + * Allocate a notify callback structure, fill in the details and + * use the mac callback list manipulation functions to chain into + * the list of callbacks.
+ */ + mncb = kmem_zalloc(sizeof (mac_notify_cb_t), KM_SLEEP); + mncb->mncb_fn = notify_fn; + mncb->mncb_arg = arg; + mncb->mncb_mip = mip; + mncb->mncb_link.mcb_objp = mncb; + mncb->mncb_link.mcb_objsize = sizeof (mac_notify_cb_t); + mncb->mncb_link.mcb_flags = MCB_NOTIFY_CB_T; + + mcbi = &mip->mi_notify_cb_info; + + i_mac_perim_enter(mip); + mutex_enter(mcbi->mcbi_lockp); + + mac_callback_add(&mip->mi_notify_cb_info, &mip->mi_notify_cb_list, + &mncb->mncb_link); + + mutex_exit(mcbi->mcbi_lockp); + i_mac_perim_exit(mip); + return ((mac_notify_handle_t)mncb); +} + +void +mac_notify_remove_wait(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_cb_info_t *mcbi = &mip->mi_notify_cb_info; + + mutex_enter(mcbi->mcbi_lockp); + mac_callback_remove_wait(&mip->mi_notify_cb_info); + mutex_exit(mcbi->mcbi_lockp); +} + +/* + * Remove a mac client specified notification callback + */ +int +mac_notify_remove(mac_notify_handle_t mnh, boolean_t wait) +{ + mac_notify_cb_t *mncb = (mac_notify_cb_t *)mnh; + mac_impl_t *mip = mncb->mncb_mip; + mac_cb_info_t *mcbi; + int err = 0; + + mcbi = &mip->mi_notify_cb_info; + + i_mac_perim_enter(mip); + mutex_enter(mcbi->mcbi_lockp); + + ASSERT(mncb->mncb_link.mcb_objp == mncb); + /* + * If there aren't any list walkers, the remove would succeed + * inline, else we wait for the deferred remove to complete + */ + if (mac_callback_remove(&mip->mi_notify_cb_info, + &mip->mi_notify_cb_list, &mncb->mncb_link)) { + kmem_free(mncb, sizeof (mac_notify_cb_t)); + } else { + err = EBUSY; + } + + mutex_exit(mcbi->mcbi_lockp); + i_mac_perim_exit(mip); + + /* + * If we failed to remove the notification callback and "wait" is set + * to be B_TRUE, wait for the callback to finish after we exit the + * mac perimeter. + */ + if (err != 0 && wait) { + mac_notify_remove_wait((mac_handle_t)mip); + return (0); + } + + return (err); +} + +/* + * Associate resource management callbacks with the specified MAC + * clients. 
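+ *
+ * As an illustration (hypothetical client code; my_ring_add_cb and
+ * my_state are names invented for this sketch), a client interested
+ * only in ring additions could register just that callback:
+ *
+ *	mac_resource_set(mch, my_ring_add_cb, my_state);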
+ */ + +void +mac_resource_set_common(mac_client_handle_t mch, mac_resource_add_t add, + mac_resource_remove_t remove, mac_resource_quiesce_t quiesce, + mac_resource_restart_t restart, mac_resource_bind_t bind, + void *arg) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + mcip->mci_resource_add = add; + mcip->mci_resource_remove = remove; + mcip->mci_resource_quiesce = quiesce; + mcip->mci_resource_restart = restart; + mcip->mci_resource_bind = bind; + mcip->mci_resource_arg = arg; + + if (arg == NULL) + mcip->mci_state_flags &= ~MCIS_CLIENT_POLL_CAPABLE; +} + +void +mac_resource_set(mac_client_handle_t mch, mac_resource_add_t add, void *arg) +{ + /* update the 'resource_add' callback */ + mac_resource_set_common(mch, add, NULL, NULL, NULL, NULL, arg); +} + +/* + * Sets up the client resources and enable the polling interface over all the + * SRS's and the soft rings of the client + */ +void +mac_client_poll_enable(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_soft_ring_set_t *mac_srs; + flow_entry_t *flent; + int i; + + flent = mcip->mci_flent; + ASSERT(flent != NULL); + + for (i = 0; i < flent->fe_rx_srs_cnt; i++) { + mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i]; + ASSERT(mac_srs->srs_mcip == mcip); + mac_srs_client_poll_enable(mcip, mac_srs); + } +} + +/* + * Tears down the client resources and disable the polling interface over all + * the SRS's and the soft rings of the client + */ +void +mac_client_poll_disable(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_soft_ring_set_t *mac_srs; + flow_entry_t *flent; + int i; + + flent = mcip->mci_flent; + ASSERT(flent != NULL); + + for (i = 0; i < flent->fe_rx_srs_cnt; i++) { + mac_srs = (mac_soft_ring_set_t *)flent->fe_rx_srs[i]; + ASSERT(mac_srs->srs_mcip == mcip); + mac_srs_client_poll_disable(mcip, mac_srs); + } +} + +/* + * Associate the CPUs specified by the given property with a MAC client. + */ +int +mac_cpu_set(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + int err = 0; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + if ((err = mac_validate_props(mrp)) != 0) + return (err); + + if (MCIP_DATAPATH_SETUP(mcip)) + mac_flow_modify(mip->mi_flow_tab, mcip->mci_flent, mrp); + + mac_update_resources(mrp, MCIP_RESOURCE_PROPS(mcip), B_FALSE); + return (0); +} + +/* + * Apply the specified properties to the specified MAC client. + */ +int +mac_client_set_resources(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + int err = 0; + + i_mac_perim_enter(mip); + + if ((mrp->mrp_mask & MRP_MAXBW) || (mrp->mrp_mask & MRP_PRIORITY)) { + err = mac_resource_ctl_set(mch, mrp); + if (err != 0) { + i_mac_perim_exit(mip); + return (err); + } + } + + if (mrp->mrp_mask & MRP_CPUS) + err = mac_cpu_set(mch, mrp); + + i_mac_perim_exit(mip); + return (err); +} + +/* + * Return the properties currently associated with the specified MAC client. + */ +void +mac_client_get_resources(mac_client_handle_t mch, mac_resource_props_t *mrp) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); + + bcopy(mcip_mrp, mrp, sizeof (mac_resource_props_t)); +} + +/* + * Pass a copy of the specified packet to the promiscuous callbacks + * of the specified MAC. 
+ *
+ * If sender is NULL, the function is being invoked for a packet chain
+ * received from the wire. If sender is non-NULL, it points to
+ * the MAC client from which the packet is being sent.
+ *
+ * The packets are distributed to the promiscuous callbacks as follows:
+ *
+ * - all packets are sent to the MAC_CLIENT_PROMISC_ALL callbacks
+ * - all broadcast and multicast packets are sent to the
+ * MAC_CLIENT_PROMISC_FILTERED and MAC_CLIENT_PROMISC_MULTI callbacks.
+ *
+ * The unicast packets of MAC_CLIENT_PROMISC_FILTERED callbacks are
+ * dispatched after classification by mac_rx_deliver().
+ */
+
+static void
+mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp,
+ boolean_t loopback)
+{
+ mblk_t *mp_copy;
+
+ mp_copy = copymsg(mp);
+ if (mp_copy == NULL)
+ return;
+ mp_copy->b_next = NULL;
+
+ mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback);
+}
+
+/*
+ * Return the VID of a packet. Zero if the packet is not tagged.
+ */
+static uint16_t
+mac_ether_vid(mblk_t *mp)
+{
+ struct ether_header *eth = (struct ether_header *)mp->b_rptr;
+
+ if (ntohs(eth->ether_type) == ETHERTYPE_VLAN) {
+ struct ether_vlan_header *t_evhp =
+ (struct ether_vlan_header *)mp->b_rptr;
+ return (VLAN_ID(ntohs(t_evhp->ether_tci)));
+ }
+
+ return (0);
+}
+
+/*
+ * Return whether the specified packet contains a multicast or broadcast
+ * destination MAC address.
+ */
+static boolean_t
+mac_is_mcast(mac_impl_t *mip, mblk_t *mp)
+{
+ mac_header_info_t hdr_info;
+
+ if (mac_header_info((mac_handle_t)mip, mp, &hdr_info) != 0)
+ return (B_FALSE);
+ return ((hdr_info.mhi_dsttype == MAC_ADDRTYPE_BROADCAST) ||
+ (hdr_info.mhi_dsttype == MAC_ADDRTYPE_MULTICAST));
+}
+
+/*
+ * Send a copy of an mblk chain to the MAC clients of the specified MAC.
+ * "sender" points to the sender MAC client for outbound packets, and
+ * is set to NULL for inbound packets.
+ */
+void
+mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain,
+ mac_client_impl_t *sender)
+{
+ mac_promisc_impl_t *mpip;
+ mac_cb_t *mcb;
+ mblk_t *mp;
+ boolean_t is_mcast, is_sender;
+
+ MAC_PROMISC_WALKER_INC(mip);
+ for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
+ is_mcast = mac_is_mcast(mip, mp);
+ /* send packet to interested callbacks */
+ for (mcb = mip->mi_promisc_list; mcb != NULL;
+ mcb = mcb->mcb_nextp) {
+ mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
+ is_sender = (mpip->mpi_mcip == sender);
+
+ if (is_sender && mpip->mpi_no_tx_loop)
+ /*
+ * The sender doesn't want to receive
+ * copies of the packets it sends.
+ */
+ continue;
+
+ /*
+ * For an Ethernet MAC, don't dispatch a multicast
+ * packet to non-PROMISC_ALL callbacks unless the VID
+ * of the packet matches the VID of the client.
+ */
+ if (is_mcast &&
+ mpip->mpi_type != MAC_CLIENT_PROMISC_ALL &&
+ !mac_client_check_flow_vid(mpip->mpi_mcip,
+ mac_ether_vid(mp)))
+ continue;
+
+ if (is_sender ||
+ mpip->mpi_type == MAC_CLIENT_PROMISC_ALL ||
+ is_mcast)
+ mac_promisc_dispatch_one(mpip, mp, is_sender);
+ }
+ }
+ MAC_PROMISC_WALKER_DCR(mip);
+}
+
+void
+mac_promisc_client_dispatch(mac_client_impl_t *mcip, mblk_t *mp_chain)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_promisc_impl_t *mpip;
+ boolean_t is_mcast;
+ mblk_t *mp;
+ mac_cb_t *mcb;
+
+ /*
+ * The unicast packets for the MAC client still
+ * need to be delivered to the MAC_CLIENT_PROMISC_FILTERED
+ * promiscuous callbacks. The broadcast and multicast
+ * packets were delivered from mac_rx().
+ */ + MAC_PROMISC_WALKER_INC(mip); + for (mp = mp_chain; mp != NULL; mp = mp->b_next) { + is_mcast = mac_is_mcast(mip, mp); + for (mcb = mcip->mci_promisc_list; mcb != NULL; + mcb = mcb->mcb_nextp) { + mpip = (mac_promisc_impl_t *)mcb->mcb_objp; + if (mpip->mpi_type == MAC_CLIENT_PROMISC_FILTERED && + !is_mcast) { + mac_promisc_dispatch_one(mpip, mp, B_FALSE); + } + } + } + MAC_PROMISC_WALKER_DCR(mip); +} + +/* + * Return the margin value currently assigned to the specified MAC instance. + */ +void +mac_margin_get(mac_handle_t mh, uint32_t *marginp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + rw_enter(&(mip->mi_rw_lock), RW_READER); + *marginp = mip->mi_margin; + rw_exit(&(mip->mi_rw_lock)); +} + +/* + * mac_info_get() is used for retrieving the mac_info when a DL_INFO_REQ is + * issued before a DL_ATTACH_REQ. we walk the i_mac_impl_hash table and find + * the first mac_impl_t with a matching driver name; then we copy its mac_info_t + * to the caller. we do all this with i_mac_impl_lock held so the mac_impl_t + * cannot disappear while we are accessing it. + */ +typedef struct i_mac_info_state_s { + const char *mi_name; + mac_info_t *mi_infop; +} i_mac_info_state_t; + +/*ARGSUSED*/ +static uint_t +i_mac_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) +{ + i_mac_info_state_t *statep = arg; + mac_impl_t *mip = (mac_impl_t *)val; + + if (mip->mi_state_flags & MIS_DISABLED) + return (MH_WALK_CONTINUE); + + if (strcmp(statep->mi_name, + ddi_driver_name(mip->mi_dip)) != 0) + return (MH_WALK_CONTINUE); + + statep->mi_infop = &mip->mi_info; + return (MH_WALK_TERMINATE); +} + +boolean_t +mac_info_get(const char *name, mac_info_t *minfop) +{ + i_mac_info_state_t state; + + rw_enter(&i_mac_impl_lock, RW_READER); + state.mi_name = name; + state.mi_infop = NULL; + mod_hash_walk(i_mac_impl_hash, i_mac_info_walker, &state); + if (state.mi_infop == NULL) { + rw_exit(&i_mac_impl_lock); + return (B_FALSE); + } + *minfop = *state.mi_infop; + rw_exit(&i_mac_impl_lock); + return (B_TRUE); +} + +/* + * To get the capabilities that MAC layer cares about, such as rings, factory + * mac address, vnic or not, it should directly invoke this function + */ +boolean_t +i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) + return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); + else + return (B_FALSE); +} + +/* + * Capability query function. If number of active mac clients is greater than + * 1, only limited capabilities can be advertised to the caller no matter the + * driver has certain capability or not. Else, we query the driver to get the + * capability. + */ +boolean_t +mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + /* + * if mi_nactiveclients > 1, only MAC_CAPAB_HCKSUM, + * MAC_CAPAB_NO_NATIVEVLAN, MAC_CAPAB_NO_ZCOPY can be advertised. 
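+ * (MAC_CAPAB_HCKSUM is still forwarded to the driver below, while the
+ * two NO_* capabilities are simply answered in the affirmative.)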
+ */ + if (mip->mi_nactiveclients > 1) { + switch (cap) { + case MAC_CAPAB_HCKSUM: + return (i_mac_capab_get(mh, cap, cap_data)); + case MAC_CAPAB_NO_NATIVEVLAN: + case MAC_CAPAB_NO_ZCOPY: + return (B_TRUE); + default: + return (B_FALSE); + } + } + + /* else get capab from driver */ + return (i_mac_capab_get(mh, cap, cap_data)); +} + +boolean_t +mac_sap_verify(mac_handle_t mh, uint32_t sap, uint32_t *bind_sap) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_type->mt_ops.mtops_sap_verify(sap, bind_sap, + mip->mi_pdata)); +} + +mblk_t * +mac_header(mac_handle_t mh, const uint8_t *daddr, uint32_t sap, mblk_t *payload, + size_t extra_len) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_type->mt_ops.mtops_header(mip->mi_addr, daddr, sap, + mip->mi_pdata, payload, extra_len)); +} + +int +mac_header_info(mac_handle_t mh, mblk_t *mp, mac_header_info_t *mhip) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_type->mt_ops.mtops_header_info(mp, mip->mi_pdata, + mhip)); +} + +mblk_t * +mac_header_cook(mac_handle_t mh, mblk_t *mp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_COOK) { + if (DB_REF(mp) > 1) { + mblk_t *newmp = copymsg(mp); + if (newmp == NULL) + return (NULL); + freemsg(mp); + mp = newmp; + } + return (mip->mi_type->mt_ops.mtops_header_cook(mp, + mip->mi_pdata)); + } + return (mp); +} + +mblk_t * +mac_header_uncook(mac_handle_t mh, mblk_t *mp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (mip->mi_type->mt_ops.mtops_ops & MTOPS_HEADER_UNCOOK) { + if (DB_REF(mp) > 1) { + mblk_t *newmp = copymsg(mp); + if (newmp == NULL) + return (NULL); + freemsg(mp); + mp = newmp; + } + return (mip->mi_type->mt_ops.mtops_header_uncook(mp, + mip->mi_pdata)); + } + return (mp); +} + +uint_t +mac_addr_len(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_type->mt_addr_length); +} + +/* True if a MAC is a VNIC */ +boolean_t +mac_is_vnic(mac_handle_t mh) +{ + return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC); +} + +mac_handle_t +mac_get_lower_mac_handle(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(mac_is_vnic(mh)); + return (((vnic_t *)mip->mi_driver)->vn_lower_mh); +} + +void +mac_update_resources(mac_resource_props_t *nmrp, mac_resource_props_t *cmrp, + boolean_t is_user_flow) +{ + if (nmrp != NULL && cmrp != NULL) { + if (nmrp->mrp_mask & MRP_PRIORITY) { + if (nmrp->mrp_priority == MPL_RESET) { + cmrp->mrp_mask &= ~MRP_PRIORITY; + if (is_user_flow) { + cmrp->mrp_priority = + MPL_SUBFLOW_DEFAULT; + } else { + cmrp->mrp_priority = MPL_LINK_DEFAULT; + } + } else { + cmrp->mrp_mask |= MRP_PRIORITY; + cmrp->mrp_priority = nmrp->mrp_priority; + } + } + if (nmrp->mrp_mask & MRP_MAXBW) { + cmrp->mrp_maxbw = nmrp->mrp_maxbw; + if (nmrp->mrp_maxbw == MRP_MAXBW_RESETVAL) + cmrp->mrp_mask &= ~MRP_MAXBW; + else + cmrp->mrp_mask |= MRP_MAXBW; + } + if (nmrp->mrp_mask & MRP_CPUS) + MAC_COPY_CPUS(nmrp, cmrp); + } +} + +/* + * i_mac_set_resources: + * + * This routine associates properties with the primary MAC client of + * the specified MAC instance. 
+ * - Cache the properties in mac_impl_t + * - Apply the properties to the primary MAC client if exists + */ +int +i_mac_set_resources(mac_handle_t mh, mac_resource_props_t *mrp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip; + int err = 0; + mac_resource_props_t tmrp; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + err = mac_validate_props(mrp); + if (err != 0) + return (err); + + /* + * Since bind_cpu may be modified by mac_client_set_resources() + * we use a copy of bind_cpu and finally cache bind_cpu in mip. + * This allows us to cache only user edits in mip. + */ + bcopy(mrp, &tmrp, sizeof (mac_resource_props_t)); + mcip = mac_primary_client_handle(mip); + if (mcip != NULL) { + err = + mac_client_set_resources((mac_client_handle_t)mcip, &tmrp); + } + /* if mac_client_set_resources failed, do not update the values */ + if (err == 0) + mac_update_resources(mrp, &mip->mi_resource_props, B_FALSE); + return (err); +} + +int +mac_set_resources(mac_handle_t mh, mac_resource_props_t *mrp) +{ + int err; + + i_mac_perim_enter((mac_impl_t *)mh); + err = i_mac_set_resources(mh, mrp); + i_mac_perim_exit((mac_impl_t *)mh); + return (err); +} + +/* + * Get the properties cached for the specified MAC instance. + */ +void +mac_get_resources(mac_handle_t mh, mac_resource_props_t *mrp) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *mcip; + + if (mip->mi_state_flags & MIS_IS_VNIC) { + mcip = mac_primary_client_handle(mip); + if (mcip != NULL) { + mac_client_get_resources((mac_client_handle_t)mcip, + mrp); + return; + } + } + bcopy(&mip->mi_resource_props, mrp, sizeof (mac_resource_props_t)); +} + +/* + * Rename a mac client, its flow, and the kstat. + */ +int +mac_rename_primary(mac_handle_t mh, const char *new_name) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_client_impl_t *cur_clnt = NULL; + flow_entry_t *fep; + + i_mac_perim_enter(mip); + + /* + * VNICs: we need to change the sys flow name and + * the associated flow kstat. + */ + if (mip->mi_state_flags & MIS_IS_VNIC) { + ASSERT(new_name != NULL); + mac_rename_flow_names(mac_vnic_lower(mip), new_name); + goto done; + } + /* + * This mac may itself be an aggr link, or it may have some client + * which is an aggr port. For both cases, we need to change the + * aggr port's mac client name, its flow name and the associated flow + * kstat. + */ + if (mip->mi_state_flags & MIS_IS_AGGR) { + mac_capab_aggr_t aggr_cap; + mac_rename_fn_t rename_fn; + boolean_t ret; + + ASSERT(new_name != NULL); + ret = i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, + (void *)(&aggr_cap)); + ASSERT(ret == B_TRUE); + rename_fn = aggr_cap.mca_rename_fn; + rename_fn(new_name, mip->mi_driver); + /* + * The aggr's client name and kstat flow name will be + * updated below, i.e. via mac_rename_flow_names. 
+ */
+ }
+
+ for (cur_clnt = mip->mi_clients_list; cur_clnt != NULL;
+ cur_clnt = cur_clnt->mci_client_next) {
+ if (cur_clnt->mci_state_flags & MCIS_IS_AGGR_PORT) {
+ if (new_name != NULL) {
+ char *str_st = cur_clnt->mci_name;
+ char *str_del = strchr(str_st, '-');
+
+ ASSERT(str_del != NULL);
+ bzero(str_del + 1, MAXNAMELEN -
+ (str_del - str_st + 1));
+ bcopy(new_name, str_del + 1,
+ strlen(new_name));
+ }
+ fep = cur_clnt->mci_flent;
+ mac_rename_flow(fep, cur_clnt->mci_name);
+ break;
+ } else if (new_name != NULL &&
+ cur_clnt->mci_state_flags & MCIS_USE_DATALINK_NAME) {
+ mac_rename_flow_names(cur_clnt, new_name);
+ break;
+ }
+ }
+
+done:
+ i_mac_perim_exit(mip);
+ return (0);
+}
+
+/*
+ * Rename the MAC client's flow names
+ */
+static void
+mac_rename_flow_names(mac_client_impl_t *mcip, const char *new_name)
+{
+ flow_entry_t *flent;
+ uint16_t vid;
+ char flowname[MAXFLOWNAME];
+ mac_impl_t *mip = mcip->mci_mip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ /*
+ * Use mi_rw_lock to ensure that threads not in the mac perimeter
+ * see a self-consistent value for mci_name
+ */
+ rw_enter(&mip->mi_rw_lock, RW_WRITER);
+ (void) strlcpy(mcip->mci_name, new_name, sizeof (mcip->mci_name));
+ rw_exit(&mip->mi_rw_lock);
+
+ mac_rename_flow(mcip->mci_flent, new_name);
+
+ if (mcip->mci_nflents == 1)
+ return;
+
+ /*
+ * We have to rename all the others too; there are no stats to
+ * destroy for these.
+ */
+ for (flent = mcip->mci_flent_list; flent != NULL;
+ flent = flent->fe_client_next) {
+ if (flent != mcip->mci_flent) {
+ vid = i_mac_flow_vid(flent);
+ (void) sprintf(flowname, "%s%u", new_name, vid);
+ mac_flow_set_name(flent, flowname);
+ }
+ }
+}
+
+
+/*
+ * Add a flow to the MAC client's flow list - i.e. the list of MAC/VID
+ * tuples defined for the specified MAC client.
+ */
+static void
+mac_client_add_to_flow_list(mac_client_impl_t *mcip, flow_entry_t *flent)
+{
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ /*
+ * The promisc Rx data path walks the mci_flent_list. Protect by
+ * using mci_rw_lock
+ */
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+
+ /* Add it to the head */
+ flent->fe_client_next = mcip->mci_flent_list;
+ mcip->mci_flent_list = flent;
+ mcip->mci_nflents++;
+
+ /*
+ * Keep track of the number of non-zero VIDs per MAC
+ * client to avoid figuring it out in the data-path.
+ */
+ if (i_mac_flow_vid(flent) != VLAN_ID_NONE)
+ mcip->mci_nvids++;
+
+ rw_exit(&mcip->mci_rw_lock);
+}
+
+/*
+ * Remove a flow entry from the MAC client's list.
+ */
+static void
+mac_client_remove_flow_from_list(mac_client_impl_t *mcip, flow_entry_t *flent)
+{
+ flow_entry_t *fe = mcip->mci_flent_list;
+ flow_entry_t *prev_fe = NULL;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ /*
+ * The promisc Rx data path walks the mci_flent_list. Protect by
+ * using mci_rw_lock
+ */
+ rw_enter(&mcip->mci_rw_lock, RW_WRITER);
+ while ((fe != NULL) && (fe != flent)) {
+ prev_fe = fe;
+ fe = fe->fe_client_next;
+ }
+
+ /* XXX should be an ASSERT */
+ if (fe != NULL) {
+ if (prev_fe == NULL) {
+ /* Deleting the first node */
+ mcip->mci_flent_list = fe->fe_client_next;
+ } else {
+ prev_fe->fe_client_next = fe->fe_client_next;
+ }
+ mcip->mci_nflents--;
+
+ if (i_mac_flow_vid(flent) != VLAN_ID_NONE)
+ mcip->mci_nvids--;
+ }
+ rw_exit(&mcip->mci_rw_lock);
+}
+
+/*
+ * Check if the given VID belongs to this MAC client.
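+ * This is used on the promiscuous Rx path (see mac_promisc_dispatch())
+ * to avoid handing a multicast packet to a client whose VLAN(s) do not
+ * match the packet's VID.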
+ */ +boolean_t +mac_client_check_flow_vid(mac_client_impl_t *mcip, uint16_t vid) +{ + flow_entry_t *flent; + uint16_t mci_vid; + + /* The mci_flent_list is protected by mci_rw_lock */ + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + for (flent = mcip->mci_flent_list; flent != NULL; + flent = flent->fe_client_next) { + mci_vid = i_mac_flow_vid(flent); + if (vid == mci_vid) { + rw_exit(&mcip->mci_rw_lock); + return (B_TRUE); + } + } + rw_exit(&mcip->mci_rw_lock); + return (B_FALSE); +} + +/* + * Get the flow entry for the specified <MAC addr, VID> tuple. + */ +static flow_entry_t * +mac_client_get_flow(mac_client_impl_t *mcip, mac_unicast_impl_t *muip) +{ + mac_address_t *map = mcip->mci_unicast; + flow_entry_t *flent; + uint16_t vid; + flow_desc_t flow_desc; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + mac_flow_get_desc(mcip->mci_flent, &flow_desc); + if (bcmp(flow_desc.fd_dst_mac, map->ma_addr, map->ma_len) != 0) + return (NULL); + + for (flent = mcip->mci_flent_list; flent != NULL; + flent = flent->fe_client_next) { + vid = i_mac_flow_vid(flent); + if (vid == muip->mui_vid) { + return (flent); + } + } + + return (NULL); +} + +/* + * Since mci_flent has the SRSs, when we want to remove it, we replace + * the flow_desc_t in mci_flent with that of an existing flent and then + * remove that flent instead of mci_flent. + */ +static flow_entry_t * +mac_client_swap_mciflent(mac_client_impl_t *mcip) +{ + flow_entry_t *flent = mcip->mci_flent; + flow_tab_t *ft = flent->fe_flow_tab; + flow_entry_t *flent1; + flow_desc_t fl_desc; + char fl_name[MAXFLOWNAME]; + int err; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + ASSERT(mcip->mci_nflents > 1); + + /* get the next flent following the primary flent */ + flent1 = mcip->mci_flent_list->fe_client_next; + ASSERT(flent1 != NULL && flent1->fe_flow_tab == ft); + + /* + * Remove the flent from the flow table before updating the + * flow descriptor as the hash depends on the flow descriptor. + * This also helps incoming packet classification avoid having + * to grab fe_lock. Access to fe_flow_desc of a flent not in the + * flow table is done under the fe_lock so that log or stat functions + * see a self-consistent fe_flow_desc. The name and desc are specific + * to a flow, the rest are shared by all the clients, including + * resource control etc. + */ + mac_flow_remove(ft, flent, B_TRUE); + mac_flow_remove(ft, flent1, B_TRUE); + + bcopy(&flent->fe_flow_desc, &fl_desc, sizeof (flow_desc_t)); + bcopy(flent->fe_flow_name, fl_name, MAXFLOWNAME); + + /* update the primary flow entry */ + mutex_enter(&flent->fe_lock); + bcopy(&flent1->fe_flow_desc, &flent->fe_flow_desc, + sizeof (flow_desc_t)); + bcopy(&flent1->fe_flow_name, &flent->fe_flow_name, MAXFLOWNAME); + mutex_exit(&flent->fe_lock); + + /* update the flow entry that is to be freed */ + mutex_enter(&flent1->fe_lock); + bcopy(&fl_desc, &flent1->fe_flow_desc, sizeof (flow_desc_t)); + bcopy(fl_name, &flent1->fe_flow_name, MAXFLOWNAME); + mutex_exit(&flent1->fe_lock); + + /* now reinsert the flow entries in the table */ + err = mac_flow_add(ft, flent); + ASSERT(err == 0); + + err = mac_flow_add(ft, flent1); + ASSERT(err == 0); + + return (flent1); +} + +/* + * Return whether there is only one flow entry associated with this + * MAC client. 
+ */ +static boolean_t +mac_client_single_rcvr(mac_client_impl_t *mcip) +{ + return (mcip->mci_nflents == 1); +} + +int +mac_validate_props(mac_resource_props_t *mrp) +{ + if (mrp == NULL) + return (0); + + if (mrp->mrp_mask & MRP_PRIORITY) { + mac_priority_level_t pri = mrp->mrp_priority; + + if (pri < MPL_LOW || pri > MPL_RESET) + return (EINVAL); + } + + if (mrp->mrp_mask & MRP_MAXBW) { + uint64_t maxbw = mrp->mrp_maxbw; + + if (maxbw < MRP_MAXBW_MINVAL && maxbw != 0) + return (EINVAL); + } + if (mrp->mrp_mask & MRP_CPUS) { + int i; + mac_cpu_mode_t fanout; + + if (mrp->mrp_ncpus > ncpus || mrp->mrp_ncpus > MAX_SR_FANOUT) + return (EINVAL); + + for (i = 0; i < mrp->mrp_ncpus; i++) { + cpu_t *cp; + int rv; + + mutex_enter(&cpu_lock); + cp = cpu_get(mrp->mrp_cpu[i]); + if (cp != NULL) + rv = cpu_is_online(cp); + else + rv = 0; + mutex_exit(&cpu_lock); + if (rv == 0) + return (EINVAL); + } + + fanout = mrp->mrp_fanout_mode; + if (fanout < 0 || fanout > MCM_CPUS) + return (EINVAL); + } + return (0); +} + +/* + * Send a MAC_NOTE_LINK notification to all the MAC clients whenever the + * underlying physical link is down. This is to allow MAC clients to + * communicate with other clients. + */ +void +mac_virtual_link_update(mac_impl_t *mip) +{ + if (mip->mi_linkstate != LINK_STATE_UP) + i_mac_notify(mip, MAC_NOTE_LINK); +} + +/* + * For clients that have a pass-thru MAC, e.g. VNIC, we set the VNIC's + * mac handle in the client. + */ +void +mac_set_upper_mac(mac_client_handle_t mch, mac_handle_t mh) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + mcip->mci_upper_mip = (mac_impl_t *)mh; +} + +/* + * Mark the mac as being used exclusively by the single mac client that is + * doing some control operation on this mac. No further opens of this mac + * will be allowed until this client calls mac_unmark_exclusive. The mac + * client calling this function must already be in the mac perimeter + */ +int +mac_mark_exclusive(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + /* + * Look up its entry in the global hash table. + */ + rw_enter(&i_mac_impl_lock, RW_WRITER); + if (mip->mi_state_flags & MIS_DISABLED) { + rw_exit(&i_mac_impl_lock); + return (ENOENT); + } + + /* + * A reference to mac is held even if the link is not plumbed. + * In i_dls_link_create() we open the MAC interface and hold the + * reference. There is an additional reference for the mac_open + * done in acquiring the mac perimeter + */ + if (mip->mi_ref != 2) { + rw_exit(&i_mac_impl_lock); + return (EBUSY); + } + + ASSERT(!(mip->mi_state_flags & MIS_EXCLUSIVE_HELD)); + mip->mi_state_flags |= MIS_EXCLUSIVE_HELD; + rw_exit(&i_mac_impl_lock); + return (0); +} + +void +mac_unmark_exclusive(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + + rw_enter(&i_mac_impl_lock, RW_WRITER); + /* 1 for the creation and another for the perimeter */ + ASSERT(mip->mi_ref == 2 && (mip->mi_state_flags & MIS_EXCLUSIVE_HELD)); + mip->mi_state_flags &= ~MIS_EXCLUSIVE_HELD; + rw_exit(&i_mac_impl_lock); +} + +/* + * Set the MTU for the specified device. The function returns EBUSY if + * another MAC client prevents the caller to become the exclusive client. + * Returns EAGAIN if the client is started. 
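+ *
+ * A minimal calling sketch (the MTU value here is illustrative only):
+ *
+ *	uint_t old_mtu;
+ *	int err = mac_set_mtu(mh, 9000, &old_mtu);
+ *
+ * where EBUSY means another client holds the mac exclusively and
+ * EAGAIN means the NIC is started, so the MTU cannot be changed.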
+ */ +int +mac_set_mtu(mac_handle_t mh, uint_t new_mtu, uint_t *old_mtu_arg) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + uint_t old_mtu; + int rv; + boolean_t exclusive = B_FALSE; + + i_mac_perim_enter(mip); + + if ((mip->mi_callbacks->mc_callbacks & MC_SETPROP) == 0 || + (mip->mi_callbacks->mc_callbacks & MC_GETPROP) == 0) { + rv = ENOTSUP; + goto bail; + } + + if ((rv = mac_mark_exclusive(mh)) != 0) + goto bail; + exclusive = B_TRUE; + + if (mip->mi_active > 0) { + /* + * The MAC instance is started, for example due to the + * presence of a promiscuous clients. Fail the operation + * since the MAC's MTU cannot be changed while the NIC + * is started. + */ + rv = EAGAIN; + goto bail; + } + + mac_sdu_get(mh, NULL, &old_mtu); + + if (old_mtu != new_mtu) { + rv = mip->mi_callbacks->mc_setprop(mip->mi_driver, + "mtu", MAC_PROP_MTU, sizeof (uint_t), &new_mtu); + } + +bail: + if (exclusive) + mac_unmark_exclusive(mh); + i_mac_perim_exit(mip); + + if (rv == 0 && old_mtu_arg != NULL) + *old_mtu_arg = old_mtu; + return (rv); +} + +void +mac_get_hwgrp_info(mac_handle_t mh, int grp_index, uint_t *grp_num, + uint_t *n_rings, uint_t *type, uint_t *n_clnts, char *clnts_name) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_grp_client_t *mcip; + uint_t i = 0, index = 0; + + /* Revisit when we implement fully dynamic group allocation */ + ASSERT(grp_index >= 0 && grp_index < mip->mi_rx_group_count); + + rw_enter(&mip->mi_rw_lock, RW_READER); + *grp_num = mip->mi_rx_groups[grp_index].mrg_index; + *type = mip->mi_rx_groups[grp_index].mrg_type; + *n_rings = mip->mi_rx_groups[grp_index].mrg_cur_count; + for (mcip = mip->mi_rx_groups[grp_index].mrg_clients; mcip != NULL; + mcip = mcip->mgc_next) { + int name_len = strlen(mcip->mgc_client->mci_name); + + /* + * MAXCLIENTNAMELEN is the buffer size reserved for client + * names. + * XXXX Formating the client name string needs to be moved + * to user land when fixing the size of dhi_clnts in + * dld_hwgrpinfo_t. We should use n_clients * client_name for + * dhi_clntsin instead of MAXCLIENTNAMELEN + */ + if (index + name_len >= MAXCLIENTNAMELEN) { + index = MAXCLIENTNAMELEN; + break; + } + bcopy(mcip->mgc_client->mci_name, &(clnts_name[index]), + name_len); + index += name_len; + clnts_name[index++] = ','; + i++; + } + + /* Get rid of the last , */ + if (index > 0) + clnts_name[index - 1] = '\0'; + *n_clnts = i; + rw_exit(&mip->mi_rw_lock); +} + +uint_t +mac_hwgrp_num(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (mip->mi_rx_group_count); +} diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c new file mode 100644 index 0000000000..f265e53f13 --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -0,0 +1,3347 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ip_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
+
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/mac_flow_impl.h>
+
+static void mac_srs_soft_rings_signal(mac_soft_ring_set_t *, uint_t);
+static void mac_srs_update_fanout_list(mac_soft_ring_set_t *);
+static void mac_srs_poll_unbind(mac_soft_ring_set_t *);
+static void mac_srs_worker_unbind(mac_soft_ring_set_t *);
+static void mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *, uint_t);
+
+static int mac_srs_cpu_setup(cpu_setup_t, int, void *);
+static void mac_srs_worker_bind(mac_soft_ring_set_t *, processorid_t);
+static void mac_srs_poll_bind(mac_soft_ring_set_t *, processorid_t);
+static void mac_srs_threads_unbind(mac_soft_ring_set_t *);
+static void mac_srs_add_glist(mac_soft_ring_set_t *);
+static void mac_srs_remove_glist(mac_soft_ring_set_t *);
+static void mac_srs_fanout_list_free(mac_soft_ring_set_t *);
+static void mac_soft_ring_remove(mac_soft_ring_set_t *, mac_soft_ring_t *);
+
+static int mac_compute_soft_ring_count(flow_entry_t *, int);
+static void mac_walk_srs_and_bind(int);
+static void mac_walk_srs_and_unbind(int);
+
+extern mac_group_t *mac_reserve_rx_group(mac_client_impl_t *, uint8_t *,
+ mac_rx_group_reserve_type_t);
+extern void mac_release_rx_group(mac_client_impl_t *, mac_group_t *);
+
+extern boolean_t mac_latency_optimize;
+
+static kmem_cache_t *mac_srs_cache;
+kmem_cache_t *mac_soft_ring_cache;
+
+/*
+ * The duration in msec we wait before signalling the soft ring
+ * worker thread in case packets get queued.
+ */
+static uint32_t mac_soft_ring_worker_wait = 0;
+
+/*
+ * mac_soft_ring_max_q_cnt needs to be set based on bandwidth and perhaps
+ * latency. Large values could end up consuming a lot of system memory
+ * and cause a system hang.
+ */
+static int mac_soft_ring_max_q_cnt = 1024;
+static int mac_soft_ring_min_q_cnt = 256;
+static int mac_soft_ring_poll_thres = 16;
+
+/*
+ * Default value of the number of TX rings to be assigned to a MAC client.
+ * If fewer than 'mac_tx_ring_count' Tx rings are available, then as many
+ * as are available will be assigned to the newly created MAC client.
+ * If no TX rings are available, then MAC client(s) will be assigned the
+ * default Tx ring. The default Tx ring can be shared among multiple MAC
+ * clients.
+ */
+static uint32_t mac_tx_ring_count = 8;
+static boolean_t mac_tx_serialize = B_FALSE;
+
+/*
+ * mac_tx_srs_hiwat is the queue depth threshold at which callers of
+ * mac_tx() will be notified of the flow control condition.
+ *
+ * TCP does not honour the flow control condition sent up by mac_tx().
+ * Thus provision is made for TCP to allow more packets to be queued
+ * in the SRS, up to a maximum of mac_tx_srs_max_q_cnt.
+ *
+ * Note that mac_tx_srs_hiwat is always less than
+ * mac_tx_srs_max_q_cnt.
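+ * With the defaults below, callers are notified once 1000 packets are
+ * queued, while up to 100000 packets may be queued in all.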
+ */
+static uint32_t mac_tx_srs_max_q_cnt = 100000;
+static uint32_t mac_tx_srs_hiwat = 1000;
+
+/*
+ * mac_rx_soft_ring_count, mac_rx_soft_ring_10gig_count:
+ *
+ * Global tunables that determine the number of soft rings to be used for
+ * fanning out incoming traffic on a link. These counts will be used only
+ * when no explicit set of CPUs was assigned to the data-links.
+ *
+ * The mac_rx_soft_ring_count tunable will come into effect only if
+ * mac_soft_ring_enable is set. mac_soft_ring_enable is turned on by
+ * default only for sun4v platforms.
+ *
+ * mac_rx_soft_ring_10gig_count will come into effect if you are running on a
+ * 10Gbps link and is not dependent upon mac_soft_ring_enable.
+ *
+ * The number of soft rings for fanout for a link or a flow is determined
+ * by the mac_compute_soft_ring_count() routine. This routine will take into
+ * account mac_soft_ring_enable, mac_rx_soft_ring_count and
+ * mac_rx_soft_ring_10gig_count to determine the soft ring count for a link.
+ *
+ * If a bandwidth is specified, the determination of the number of soft
+ * rings is based on the specified bandwidth, the CPU speed and the number
+ * of CPUs in the system.
+ */
+static uint_t mac_rx_soft_ring_count = 8;
+static uint_t mac_rx_soft_ring_10gig_count = 8;
+
+/*
+ * Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added
+ * to mac_srs_g_list and mac_srs_g_lock protects mac_srs_g_list. The
+ * list is used to walk the list of all MAC threads when a CPU is
+ * coming online or going offline.
+ */
+static mac_soft_ring_set_t *mac_srs_g_list = NULL;
+static krwlock_t mac_srs_g_lock;
+
+/*
+ * Whether the SRS threads should be bound, or not.
+ */
+static boolean_t mac_srs_thread_bind = B_TRUE;
+
+/*
+ * CPU to fall back to, used by mac_next_bind_cpu().
+ */
+static processorid_t srs_bind_cpu = 0;
+
+/*
+ * The possible settings for soft_ring_process_flag are
+ * 0 or ST_RING_WORKER_ONLY.
+ */
+static int soft_ring_process_flag = ST_RING_WORKER_ONLY;
+
+/*
+ * If cpu bindings are specified by the user, then the Tx SRS and its
+ * soft rings should also be bound to the CPUs specified by the user. The
+ * CPUs for Tx bindings are at the end of the cpu list provided by
+ * the user. If enough CPUs are not available (for Tx and Rx
+ * SRSes), then the CPUs are shared by both Tx and Rx SRSes.
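+ *
+ * For example (illustrative), with mrp_cpu[] = { 0, 1, 2, 3 } the Tx
+ * SRS worker is bound to CPU 3 (the last entry) and the Tx soft rings
+ * are bound walking backwards from the tail of the list, wrapping
+ * around when there are more soft rings than CPUs.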
+ */ +#define BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp) { \ + processorid_t cpuid; \ + int i, j; \ + mac_soft_ring_t *softring; \ + \ + cpuid = mrp->mrp_cpu[mrp->mrp_ncpus - 1]; \ + mac_srs_worker_bind(mac_tx_srs, cpuid); \ + if (TX_MULTI_RING_MODE(mac_tx_srs)) { \ + j = mrp->mrp_ncpus - 1; \ + for (i = 0; \ + i < mac_tx_srs->srs_oth_ring_count; i++, j--) { \ + if (j < 0) \ + j = mrp->mrp_ncpus - 1; \ + cpuid = mrp->mrp_cpu[j]; \ + softring = mac_tx_srs->srs_oth_soft_rings[i]; \ + (void) mac_soft_ring_bind(softring, cpuid); \ + } \ + } \ +} + +/* INIT and FINI ROUTINES */ + +void +mac_soft_ring_init(void) +{ + mac_soft_ring_cache = kmem_cache_create("mac_soft_ring_cache", + sizeof (mac_soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0); + + mac_srs_cache = kmem_cache_create("mac_srs_cache", + sizeof (mac_soft_ring_set_t), + 64, NULL, NULL, NULL, NULL, NULL, 0); + + rw_init(&mac_srs_g_lock, NULL, RW_DEFAULT, NULL); + mutex_enter(&cpu_lock); + register_cpu_setup_func(mac_srs_cpu_setup, NULL); + mutex_exit(&cpu_lock); +} + +void +mac_soft_ring_finish(void) +{ + mutex_enter(&cpu_lock); + unregister_cpu_setup_func(mac_srs_cpu_setup, NULL); + mutex_exit(&cpu_lock); + rw_destroy(&mac_srs_g_lock); + kmem_cache_destroy(mac_soft_ring_cache); + kmem_cache_destroy(mac_srs_cache); +} + +static void +mac_srs_soft_rings_free(mac_soft_ring_set_t *mac_srs, boolean_t release_tx_ring) +{ + mac_soft_ring_t *softring, *next, *head; + + /* + * Synchronize with mac_walk_srs_bind/unbind which are callbacks from + * DR. The callbacks from DR are called with cpu_lock held, and hence + * can't wait to grab the mac perimeter. The soft ring list is hence + * protected for read access by srs_lock. Changing the soft ring list + * needs the mac perimeter and the srs_lock. + */ + mutex_enter(&mac_srs->srs_lock); + + head = mac_srs->srs_soft_ring_head; + mac_srs->srs_soft_ring_head = NULL; + mac_srs->srs_soft_ring_tail = NULL; + mac_srs->srs_soft_ring_count = 0; + + mutex_exit(&mac_srs->srs_lock); + + for (softring = head; softring != NULL; softring = next) { + next = softring->s_ring_next; + mac_soft_ring_free(softring, release_tx_ring); + } +} + +static void +mac_srs_add_glist(mac_soft_ring_set_t *mac_srs) +{ + ASSERT(mac_srs->srs_next == NULL && mac_srs->srs_prev == NULL); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); + + rw_enter(&mac_srs_g_lock, RW_WRITER); + mutex_enter(&mac_srs->srs_lock); + + ASSERT((mac_srs->srs_state & SRS_IN_GLIST) == 0); + + if (mac_srs_g_list == NULL) { + mac_srs_g_list = mac_srs; + } else { + mac_srs->srs_next = mac_srs_g_list; + mac_srs_g_list->srs_prev = mac_srs; + mac_srs->srs_prev = NULL; + mac_srs_g_list = mac_srs; + } + mac_srs->srs_state |= SRS_IN_GLIST; + + mutex_exit(&mac_srs->srs_lock); + rw_exit(&mac_srs_g_lock); +} + +static void +mac_srs_remove_glist(mac_soft_ring_set_t *mac_srs) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); + + rw_enter(&mac_srs_g_lock, RW_WRITER); + mutex_enter(&mac_srs->srs_lock); + + ASSERT((mac_srs->srs_state & SRS_IN_GLIST) != 0); + + if (mac_srs == mac_srs_g_list) { + mac_srs_g_list = mac_srs->srs_next; + if (mac_srs_g_list != NULL) + mac_srs_g_list->srs_prev = NULL; + } else { + mac_srs->srs_prev->srs_next = mac_srs->srs_next; + if (mac_srs->srs_next != NULL) + mac_srs->srs_next->srs_prev = mac_srs->srs_prev; + } + mac_srs->srs_state &= ~SRS_IN_GLIST; + + mutex_exit(&mac_srs->srs_lock); + rw_exit(&mac_srs_g_lock); +} + +/* POLLING SETUP AND TEAR DOWN ROUTINES */ + +/* + * mac_srs_client_poll_quiesce and 
mac_srs_client_poll_restart + * + * These routines are used to call back into the upper layer + * (primarily TCP squeue) to stop polling the soft rings or + * restart polling. + */ +void +mac_srs_client_poll_quiesce(mac_client_impl_t *mcip, + mac_soft_ring_set_t *mac_srs) +{ + mac_soft_ring_t *softring; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) { + ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS)); + return; + } + + for (softring = mac_srs->srs_soft_ring_head; + softring != NULL; softring = softring->s_ring_next) { + if ((softring->s_ring_type & ST_RING_TCP) && + (softring->s_ring_rx_arg2 != NULL)) { + mcip->mci_resource_quiesce(mcip->mci_resource_arg, + softring->s_ring_rx_arg2); + } + } +} + +void +mac_srs_client_poll_restart(mac_client_impl_t *mcip, + mac_soft_ring_set_t *mac_srs) +{ + mac_soft_ring_t *softring; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) { + ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS)); + return; + } + + for (softring = mac_srs->srs_soft_ring_head; + softring != NULL; softring = softring->s_ring_next) { + if ((softring->s_ring_type & ST_RING_TCP) && + (softring->s_ring_rx_arg2 != NULL)) { + mcip->mci_resource_restart(mcip->mci_resource_arg, + softring->s_ring_rx_arg2); + } + } +} + +/* + * Register the given SRS and associated soft rings with the consumer and + * enable the polling interface used by the consumer.(i.e IP) over this + * SRS and associated soft rings. + */ +void +mac_srs_client_poll_enable(mac_client_impl_t *mcip, + mac_soft_ring_set_t *mac_srs) +{ + mac_rx_fifo_t mrf; + mac_soft_ring_t *softring; + + ASSERT(mac_srs->srs_mcip == mcip); + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + if (!(mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE)) + return; + + bzero(&mrf, sizeof (mac_rx_fifo_t)); + mrf.mrf_type = MAC_RX_FIFO; + + /* + * A SRS is capable of acting as a soft ring for cases + * where no fanout is needed. This is the case for userland + * flows. + */ + if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) + return; + + mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll; + mrf.mrf_intr_enable = (mac_intr_enable_t)mac_soft_ring_intr_enable; + mrf.mrf_intr_disable = (mac_intr_disable_t)mac_soft_ring_intr_disable; + mac_srs->srs_type |= SRST_CLIENT_POLL_ENABLED; + + softring = mac_srs->srs_soft_ring_head; + while (softring != NULL) { + if (softring->s_ring_type & (ST_RING_TCP | ST_RING_UDP)) { + /* + * TCP and UDP support DLS bypass. Squeue polling + * support implies DLS bypass since the squeue poll + * path does not have DLS processing. + */ + mac_soft_ring_dls_bypass(softring, + mcip->mci_direct_rx_fn, mcip->mci_direct_rx_arg); + } + /* + * Non-TCP protocols don't support squeues. Hence we don't + * make any ring addition callbacks for non-TCP rings + */ + if (!(softring->s_ring_type & ST_RING_TCP)) { + softring->s_ring_rx_arg2 = NULL; + softring = softring->s_ring_next; + continue; + } + mrf.mrf_rx_arg = softring; + mrf.mrf_intr_handle = (mac_intr_handle_t)softring; + mrf.mrf_cpu_id = softring->s_ring_cpuid; + mrf.mrf_flow_priority = mac_srs->srs_pri; + + softring->s_ring_rx_arg2 = mcip->mci_resource_add( + mcip->mci_resource_arg, (mac_resource_t *)&mrf); + + softring = softring->s_ring_next; + } +} + +/* + * Unregister the given SRS and associated soft rings with the consumer and + * disable the polling interface used by the consumer.(i.e IP) over this + * SRS and associated soft rings. 
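+ * This undoes mac_srs_client_poll_enable(): DLS bypass is turned off
+ * for both the TCP and UDP soft rings, and the consumer's ring-removal
+ * callback is invoked for the TCP rings that were registered.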
+ */ +void +mac_srs_client_poll_disable(mac_client_impl_t *mcip, + mac_soft_ring_set_t *mac_srs) +{ + mac_soft_ring_t *softring; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + /* + * A SRS is capable of acting as a soft ring for cases + * where no protocol fanout is needed. This is the case + * for userland flows. Nothing to do here. + */ + if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) + return; + + mutex_enter(&mac_srs->srs_lock); + if (!(mac_srs->srs_type & SRST_CLIENT_POLL_ENABLED)) { + ASSERT(!(mac_srs->srs_type & SRST_DLS_BYPASS)); + mutex_exit(&mac_srs->srs_lock); + return; + } + mac_srs->srs_type &= ~(SRST_CLIENT_POLL_ENABLED | SRST_DLS_BYPASS); + mutex_exit(&mac_srs->srs_lock); + + /* + * DLS bypass is now disabled in the case of both TCP and UDP. + * Reset the soft ring callbacks to the standard 'mac_rx_deliver' + * callback. In addition, in the case of TCP, invoke IP's callback + * for ring removal. + */ + for (softring = mac_srs->srs_soft_ring_head; + softring != NULL; softring = softring->s_ring_next) { + if (!(softring->s_ring_type & (ST_RING_UDP | ST_RING_TCP))) + continue; + + if ((softring->s_ring_type & ST_RING_TCP) && + softring->s_ring_rx_arg2 != NULL) { + mcip->mci_resource_remove(mcip->mci_resource_arg, + softring->s_ring_rx_arg2); + } + + mutex_enter(&softring->s_ring_lock); + while (softring->s_ring_state & S_RING_PROC) { + softring->s_ring_state |= S_RING_CLIENT_WAIT; + cv_wait(&softring->s_ring_client_cv, + &softring->s_ring_lock); + } + softring->s_ring_state &= ~S_RING_CLIENT_WAIT; + softring->s_ring_rx_arg2 = NULL; + softring->s_ring_rx_func = mac_rx_deliver; + softring->s_ring_rx_arg1 = mcip; + mutex_exit(&softring->s_ring_lock); + } +} + +/* + * Enable or disable poll capability of the SRS on the underlying Rx ring. + * + * There is a need to enable or disable the poll capability of an SRS over an + * Rx ring depending on the number of mac clients sharing the ring and also + * whether user flows are configured on it. However the poll state is actively + * manipulated by the SRS worker and poll threads and uncoordinated changes by + * yet another thread to the underlying capability can surprise them leading + * to assert failures. Instead we quiesce the SRS, make the changes and then + * restart the SRS. + */ +static void +mac_srs_poll_state_change(mac_soft_ring_set_t *mac_srs, + boolean_t turn_off_poll_capab, mac_rx_func_t rx_func) +{ + boolean_t need_restart = B_FALSE; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + mac_ring_t *ring; + + if (!SRS_QUIESCED(mac_srs)) { + mac_rx_srs_quiesce(mac_srs, SRS_QUIESCE); + need_restart = B_TRUE; + } + + ring = mac_srs->srs_ring; + if ((ring != NULL) && + (ring->mr_classify_type == MAC_HW_CLASSIFIER)) { + if (turn_off_poll_capab) + mac_srs->srs_state &= ~SRS_POLLING_CAPAB; + else + mac_srs->srs_state |= SRS_POLLING_CAPAB; + } + srs_rx->sr_lower_proc = rx_func; + + if (need_restart) + mac_rx_srs_restart(mac_srs); +} + +/* CPU RECONFIGURATION AND FANOUT COMPUTATION ROUTINES */ + +/* + * Return the next CPU to be used to bind a MAC kernel thread. 
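+ * Expects cpu_lock to be held; candidates are tried in increasing
+ * CPU id order, falling back to srs_bind_cpu when the candidate is
+ * absent or offline.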
+ */ +static processorid_t +mac_next_bind_cpu(void) +{ + static processorid_t srs_curr_cpu = -1; + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + srs_curr_cpu++; + cp = cpu_get(srs_curr_cpu); + if (cp == NULL || !cpu_is_online(cp)) + srs_curr_cpu = srs_bind_cpu; + + return (srs_curr_cpu); +} + +/* ARGSUSED */ +static int +mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg) +{ + ASSERT(MUTEX_HELD(&cpu_lock)); + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_CPUPART_IN: + mac_walk_srs_and_bind(id); + break; + + case CPU_UNCONFIG: + case CPU_OFF: + case CPU_CPUPART_OUT: + mac_walk_srs_and_unbind(id); + break; + + default: + break; + } + return (0); +} + +/* + * mac_compute_soft_ring_count(): + * + * This routine computes the number of soft rings needed to handle incoming + * load given a flow_entry. + * + * The routine does the following: + * 1) soft rings will be created if mac_soft_ring_enable is set. + * 2) If the underlying link is a 10Gbps link, then soft rings will be + * created even if mac_soft_ring_enable is not set. The number of soft + * rings, so created, will equal mac_rx_soft_ring_10gig_count. + * 3) On a sun4v platform (i.e., mac_soft_ring_enable is set), 2 times the + * mac_rx_soft_ring_10gig_count number of soft rings will be created for a + * 10Gbps link. + * + * If a bandwidth limit is specified, the number that gets computed is + * dependent upon CPU speed, the number of Rx rings configured, and + * the bandwidth limit. + * If more Rx rings are available, less number of soft rings is needed. + * + * mac_use_bw_heuristic is another "hidden" variable that can be used to + * override the default use of soft ring count computation. Depending upon + * the usefulness of it, mac_use_bw_heuristic can later be made into a + * data-link property or removed altogether. + * + * TODO: Cleanup and tighten some of the assumptions. + */ +boolean_t mac_use_bw_heuristic = B_TRUE; +static int +mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt) +{ + uint64_t cpu_speed, bw = 0; + int srings = 0; + boolean_t bw_enabled = B_FALSE; + + ASSERT(!(flent->fe_type & FLOW_USER)); + if (flent->fe_resource_props.mrp_mask & MRP_MAXBW && + mac_use_bw_heuristic) { + /* bandwidth enabled */ + bw_enabled = B_TRUE; + bw = flent->fe_resource_props.mrp_maxbw; + } + if (!bw_enabled) { + /* No bandwidth enabled */ + if (mac_soft_ring_enable) + srings = mac_rx_soft_ring_count; + + /* Is this a 10Gig link? */ + flent->fe_nic_speed = mac_client_stat_get(flent->fe_mcip, + MAC_STAT_IFSPEED); + /* convert to Mbps */ + if (((flent->fe_nic_speed)/1000000) > 1000 && + mac_rx_soft_ring_10gig_count > 0) { + /* This is a 10Gig link */ + srings = mac_rx_soft_ring_10gig_count; + /* + * Use 2 times mac_rx_soft_ring_10gig_count for + * sun4v systems. + */ + if (mac_soft_ring_enable) + srings = srings * 2; + } + } else { + /* + * Soft ring computation using CPU speed and specified + * bandwidth limit. + */ + /* Assumption: all CPUs have the same frequency */ + cpu_speed = (uint64_t)CPU->cpu_type_info.pi_clock; + + /* cpu_speed is in MHz; make bw in units of Mbps. */ + bw = bw/1000000; + + if (bw >= 1000) { + /* + * bw is greater than or equal to 1Gbps. + * The number of soft rings required is a function + * of bandwidth and CPU speed. To keep this simple, + * let's use this rule: 1GHz CPU can handle 1Gbps. + * If bw is less than 1 Gbps, then there is no need + * for soft rings. Assumption is that CPU speeds + * (on modern systems) are at least 1GHz. 
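+ *
+ * As a worked example with illustrative numbers, a 10000 Mbps
+ * (10 Gbps) limit on 2500 MHz CPUs yields 10000/2500 = 4 soft rings
+ * from the integer division below.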
+ */
+ srings = bw/cpu_speed;
+ if (srings <= 1 && mac_soft_ring_enable) {
+ /*
+ * Give at least 2 soft rings
+ * for sun4v systems
+ */
+ srings = 2;
+ }
+ }
+ }
+ /*
+ * If the flent has multiple Rx SRSs, then each SRS need not
+ * have that many soft rings on top of it. The number of
+ * soft rings for each Rx SRS is found by dividing srings by
+ * rx_srs_cnt.
+ */
+ if (rx_srs_cnt > 1) {
+ int remainder;
+
+ remainder = srings%rx_srs_cnt;
+ srings = srings/rx_srs_cnt;
+ if (remainder != 0)
+ srings++;
+ /*
+ * Fanning out to 1 soft ring is not very useful.
+ * Set it to 0 as well, and mac_srs_fanout_init()
+ * will take care of creating a single soft ring
+ * for proto fanout.
+ */
+ if (srings == 1)
+ srings = 0;
+ }
+ /* Do some more massaging */
+ srings = min(srings, ncpus);
+ srings = min(srings, MAX_SR_FANOUT);
+ return (srings);
+}
+
+/*
+ * Assignment of user specified CPUs to a link.
+ *
+ * Minimum CPUs required to get an optimal assignment:
+ * For each Rx SRS, at least two CPUs are needed if the
+ * mac_latency_optimize flag is set -- one for polling, one for the
+ * fanout soft ring.
+ * If mac_latency_optimize is not set, then 3 CPUs are needed -- one
+ * for polling, one for the SRS worker thread and one for the fanout
+ * soft ring.
+ *
+ * The number of CPUs needed for the Tx side equals the number of Tx
+ * rings the link is using.
+ *
+ * mac_flow_user_cpu_init() categorizes the CPU assignment depending
+ * upon the number of CPUs in 3 different buckets.
+ *
+ * In the first bucket, the most optimal case is handled. The user has
+ * passed in enough CPUs and every thread gets its own CPU.
+ *
+ * The second and third are the sub-optimal cases. Not enough CPUs are
+ * available.
+ *
+ * The second bucket handles the case where at least one distinct CPU
+ * is available for each of the Rx rings (Rx SRSes) and Tx rings (Tx
+ * SRS or soft rings).
+ *
+ * In the third case (worst case scenario), the specified CPU count is
+ * less than the number of Rx rings configured for the link. In this
+ * case, we round robin the CPUs among the Rx SRSes and Tx SRS/soft
+ * rings.
+ */
+static void
+mac_flow_user_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp)
+{
+ mac_soft_ring_set_t *rx_srs, *tx_srs;
+ int i, srs_cnt;
+ mac_cpus_t *srs_cpu;
+ int no_of_cpus, cpu_cnt;
+ int rx_srs_cnt, reqd_rx_cpu_cnt;
+ int fanout_cpu_cnt, reqd_tx_cpu_cnt;
+ int reqd_poll_worker_cnt, fanout_cnt_per_srs;
+
+ ASSERT(mrp->mrp_fanout_mode == MCM_CPUS);
+ /*
+ * The check that mrp_ncpus is within limits for
+ * the user specified case was done earlier and if
+ * not within limits, an error would have been
+ * returned to the user.
+ */
+ ASSERT(mrp->mrp_ncpus > 0 && mrp->mrp_ncpus <= MAX_SR_FANOUT);
+
+ no_of_cpus = mrp->mrp_ncpus;
+
+ if (mrp->mrp_intr_cpu != -1) {
+ /*
+ * The interrupt has been re-targeted. The poll
+ * thread needs to be bound to the interrupt
+ * CPU. Presently only fixed interrupts
+ * are re-targeted; MSI-X interrupts aren't.
+ *
+ * Find where the intr CPU is in the list
+ * and swap it with the first one.
+ * We will be using the first CPU in the
+ * list for poll.
+ */
+ for (i = 0; i < no_of_cpus; i++) {
+ if (mrp->mrp_cpu[i] == mrp->mrp_intr_cpu)
+ break;
+ }
+ mrp->mrp_cpu[i] = mrp->mrp_cpu[0];
+ mrp->mrp_cpu[0] = mrp->mrp_intr_cpu;
+ }
+
+ /*
+ * Requirements:
+ * The number of CPUs that each Rx ring needs is dependent
+ * upon the mac_latency_optimize flag.
+ * 1) If set, at least 2 CPUs are needed -- one for
+ * polling, one for fanout soft ring.
+ * 2) If not set, then at least 3 CPUs are needed -- one
+ * for polling, one for the srs worker thread, and one for
+ * the fanout soft ring.
+ */
+ rx_srs_cnt = (flent->fe_rx_srs_cnt > 1) ?
+ (flent->fe_rx_srs_cnt - 1) : flent->fe_rx_srs_cnt;
+ reqd_rx_cpu_cnt = mac_latency_optimize ?
+ (rx_srs_cnt * 2) : (rx_srs_cnt * 3);
+
+ /* How many CPUs are needed for Tx side? */
+ tx_srs = flent->fe_tx_srs;
+ reqd_tx_cpu_cnt = TX_MULTI_RING_MODE(tx_srs) ?
+ tx_srs->srs_oth_ring_count : 1;
+
+ /* CPUs needed for Rx SRSes poll and worker threads */
+ reqd_poll_worker_cnt = mac_latency_optimize ?
+ rx_srs_cnt : rx_srs_cnt * 2;
+
+ /* Has the user provided enough CPUs? */
+ if (no_of_cpus >= (reqd_rx_cpu_cnt + reqd_tx_cpu_cnt)) {
+ /*
+ * Best case scenario. There are enough CPUs. All
+ * Rx rings will get their own set of CPUs plus
+ * Tx soft rings will get their own.
+ */
+ /*
+ * fanout_cpu_cnt is the number of CPUs available
+ * for Rx side fanout soft rings.
+ */
+ fanout_cpu_cnt = no_of_cpus -
+ reqd_poll_worker_cnt - reqd_tx_cpu_cnt;
+
+ /*
+ * Divide fanout_cpu_cnt by rx_srs_cnt to find
+ * out how many fanout soft rings each Rx SRS
+ * can have.
+ */
+ fanout_cnt_per_srs = fanout_cpu_cnt/rx_srs_cnt;
+
+ /* Do the assignment for the default Rx ring */
+ cpu_cnt = 0;
+ rx_srs = flent->fe_rx_srs[0];
+ ASSERT(rx_srs->srs_ring == NULL);
+ if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT)
+ rx_srs->srs_fanout_state = SRS_FANOUT_REINIT;
+ srs_cpu = &rx_srs->srs_cpu;
+ srs_cpu->mc_ncpus = no_of_cpus;
+ bcopy(mrp->mrp_cpu,
+ srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus));
+ srs_cpu->mc_fanout_cnt = fanout_cnt_per_srs;
+ srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
+ srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
+ srs_cpu->mc_workerid = srs_cpu->mc_pollid;
+ if (!mac_latency_optimize)
+ srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt++];
+ for (i = 0; i < fanout_cnt_per_srs; i++)
+ srs_cpu->mc_fanout_cpus[i] = mrp->mrp_cpu[cpu_cnt++];
+
+ /* Do the assignment for h/w Rx SRSes */
+ if (flent->fe_rx_srs_cnt > 1) {
+ cpu_cnt = 0;
+ for (srs_cnt = 1;
+ srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) {
+ rx_srs = flent->fe_rx_srs[srs_cnt];
+ ASSERT(rx_srs->srs_ring != NULL);
+ if (rx_srs->srs_fanout_state ==
+ SRS_FANOUT_INIT) {
+ rx_srs->srs_fanout_state =
+ SRS_FANOUT_REINIT;
+ }
+ srs_cpu = &rx_srs->srs_cpu;
+ srs_cpu->mc_ncpus = no_of_cpus;
+ bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus,
+ sizeof (srs_cpu->mc_cpus));
+ srs_cpu->mc_fanout_cnt = fanout_cnt_per_srs;
+ /* The first CPU in the list is the intr CPU */
+ srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++];
+ srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu;
+ srs_cpu->mc_workerid = srs_cpu->mc_pollid;
+ if (!mac_latency_optimize) {
+ srs_cpu->mc_workerid =
+ mrp->mrp_cpu[cpu_cnt++];
+ }
+ for (i = 0; i < fanout_cnt_per_srs; i++) {
+ srs_cpu->mc_fanout_cpus[i] =
+ mrp->mrp_cpu[cpu_cnt++];
+ }
+ ASSERT(cpu_cnt <= no_of_cpus);
+ }
+ }
+ return;
+ }
+
+ /*
+ * Sub-optimal case.
+ * We have the following information:
+ * no_of_cpus - no. of cpus that user passed.
+ * rx_srs_cnt - no. of rx rings.
+ * reqd_rx_cpu_cnt = mac_latency_optimize?rx_srs_cnt*2:rx_srs_cnt*3
+ * reqd_tx_cpu_cnt - no. of cpus reqd. for Tx side.
+ * reqd_poll_worker_cnt = mac_latency_optimize?rx_srs_cnt:rx_srs_cnt*2
+ */
+ /*
+ * If we bind the Rx fanout soft rings to the same CPUs
+ * as poll/worker, would that be enough?
+ */
+ if (no_of_cpus >= (rx_srs_cnt + reqd_tx_cpu_cnt)) {
+ boolean_t worker_assign = B_FALSE;
+
+ /*
+ * If mac_latency_optimize is not set, are there
+ * enough CPUs to assign a CPU for the worker also?
+ */ + if (no_of_cpus >= (reqd_poll_worker_cnt + reqd_tx_cpu_cnt)) + worker_assign = B_TRUE; + /* + * Zero'th Rx SRS is the default Rx ring. It is not + * associated with h/w Rx ring. + */ + rx_srs = flent->fe_rx_srs[0]; + ASSERT(rx_srs->srs_ring == NULL); + if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT) + rx_srs->srs_fanout_state = SRS_FANOUT_REINIT; + cpu_cnt = 0; + srs_cpu = &rx_srs->srs_cpu; + srs_cpu->mc_ncpus = no_of_cpus; + bcopy(mrp->mrp_cpu, + srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus)); + srs_cpu->mc_fanout_cnt = 1; + srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt++]; + srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu; + srs_cpu->mc_workerid = srs_cpu->mc_pollid; + if (!mac_latency_optimize && worker_assign) + srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt++]; + srs_cpu->mc_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt]; + + /* Do CPU bindings for SRSes having h/w Rx rings */ + if (flent->fe_rx_srs_cnt > 1) { + cpu_cnt = 0; + for (srs_cnt = 1; + srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) { + rx_srs = flent->fe_rx_srs[srs_cnt]; + ASSERT(rx_srs->srs_ring != NULL); + if (rx_srs->srs_fanout_state == + SRS_FANOUT_INIT) { + rx_srs->srs_fanout_state = + SRS_FANOUT_REINIT; + } + srs_cpu = &rx_srs->srs_cpu; + srs_cpu->mc_ncpus = no_of_cpus; + bcopy(mrp->mrp_cpu, srs_cpu->mc_cpus, + sizeof (srs_cpu->mc_cpus)); + srs_cpu->mc_pollid = + mrp->mrp_cpu[cpu_cnt]; + srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu; + srs_cpu->mc_workerid = srs_cpu->mc_pollid; + if (!mac_latency_optimize && worker_assign) { + srs_cpu->mc_workerid = + mrp->mrp_cpu[++cpu_cnt]; + } + srs_cpu->mc_fanout_cnt = 1; + srs_cpu->mc_fanout_cpus[0] = + mrp->mrp_cpu[cpu_cnt]; + cpu_cnt++; + ASSERT(cpu_cnt <= no_of_cpus); + } + } + return; + } + + /* + * Real sub-optimal case. Not enough CPUs for poll and + * Tx soft rings. Do a round robin assignment where + * each Rx SRS will get the same CPU for poll, worker + * and fanout soft ring. + */ + cpu_cnt = 0; + for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) { + rx_srs = flent->fe_rx_srs[srs_cnt]; + srs_cpu = &rx_srs->srs_cpu; + if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT) + rx_srs->srs_fanout_state = SRS_FANOUT_REINIT; + srs_cpu->mc_ncpus = no_of_cpus; + bcopy(mrp->mrp_cpu, + srs_cpu->mc_cpus, sizeof (srs_cpu->mc_cpus)); + srs_cpu->mc_fanout_cnt = 1; + srs_cpu->mc_pollid = mrp->mrp_cpu[cpu_cnt]; + srs_cpu->mc_intr_cpu = mrp->mrp_intr_cpu; + srs_cpu->mc_workerid = mrp->mrp_cpu[cpu_cnt]; + srs_cpu->mc_fanout_cpus[0] = mrp->mrp_cpu[cpu_cnt]; + if (++cpu_cnt >= no_of_cpus) + cpu_cnt = 0; + } +} + +/* + * mac_flow_cpu_init(): + * + * Each SRS has a mac_cpu_t structure, srs_cpu. This routine fills in + * the CPU binding information in srs_cpu for all Rx SRSes associated + * with a flent. + */ +static void +mac_flow_cpu_init(flow_entry_t *flent, mac_resource_props_t *mrp) +{ + mac_soft_ring_set_t *rx_srs; + processorid_t cpuid; + int j, srs_cnt, soft_ring_cnt = 0; + mac_cpus_t *srs_cpu; + + if (mrp->mrp_mask & MRP_CPUS_USERSPEC) { + mac_flow_user_cpu_init(flent, mrp); + } else { + /* + * Compute the number of soft rings needed on top for each Rx + * SRS. "rx_srs_cnt-1" indicates the number of Rx SRS + * associated with h/w Rx rings. Soft ring count needed for + * each h/w Rx SRS is computed and the same is applied to + * software classified Rx SRS. The first Rx SRS in fe_rx_srs[] + * is the software classified Rx SRS. 
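+ *
+ * For example, with fe_rx_srs_cnt == 3 the soft ring count is
+ * computed over the two h/w Rx SRSes, and the same fanout count
+ * is then applied to all three SRSes, including the software
+ * classified fe_rx_srs[0].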
+ */ + soft_ring_cnt = mac_compute_soft_ring_count(flent, + flent->fe_rx_srs_cnt - 1); + if (soft_ring_cnt == 0) { + /* + * Even when soft_ring_cnt is 0, we still need + * to create a soft ring for TCP, UDP and + * OTHER. So set it to 1. + */ + soft_ring_cnt = 1; + } + for (srs_cnt = 0; srs_cnt < flent->fe_rx_srs_cnt; srs_cnt++) { + rx_srs = flent->fe_rx_srs[srs_cnt]; + srs_cpu = &rx_srs->srs_cpu; + if (rx_srs->srs_fanout_state == SRS_FANOUT_INIT) { + if (soft_ring_cnt == srs_cpu->mc_fanout_cnt) + continue; + rx_srs->srs_fanout_state = SRS_FANOUT_REINIT; + } + srs_cpu->mc_ncpus = soft_ring_cnt; + srs_cpu->mc_fanout_cnt = soft_ring_cnt; + mutex_enter(&cpu_lock); + for (j = 0; j < soft_ring_cnt; j++) { + cpuid = mac_next_bind_cpu(); + srs_cpu->mc_cpus[j] = cpuid; + srs_cpu->mc_fanout_cpus[j] = cpuid; + } + cpuid = mac_next_bind_cpu(); + srs_cpu->mc_pollid = cpuid; + /* increment ncpus to account for polling cpu */ + srs_cpu->mc_ncpus++; + srs_cpu->mc_cpus[j++] = cpuid; + if (!mac_latency_optimize) { + cpuid = mac_next_bind_cpu(); + srs_cpu->mc_ncpus++; + srs_cpu->mc_cpus[j++] = cpuid; + } + srs_cpu->mc_workerid = cpuid; + mutex_exit(&cpu_lock); + } + } +} + +/* + * DATAPATH SETUP ROUTINES + * (setup SRS and set/update FANOUT, B/W and PRIORITY) + */ + +static void +mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs) +{ + mac_srs->srs_tcp_soft_rings = (mac_soft_ring_t **) + kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP); + mac_srs->srs_udp_soft_rings = (mac_soft_ring_t **) + kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP); + mac_srs->srs_oth_soft_rings = (mac_soft_ring_t **) + kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT, KM_SLEEP); +} + +static void +mac_srs_worker_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid) +{ + cpu_t *cp; + boolean_t clear = B_FALSE; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!mac_srs_thread_bind) + return; + + cp = cpu_get(cpuid); + if (cp == NULL || !cpu_is_online(cp)) + return; + + mutex_enter(&mac_srs->srs_lock); + mac_srs->srs_state |= SRS_WORKER_BOUND; + if (mac_srs->srs_worker_cpuid != -1) + clear = B_TRUE; + mac_srs->srs_worker_cpuid = cpuid; + mutex_exit(&mac_srs->srs_lock); + + if (clear) + thread_affinity_clear(mac_srs->srs_worker); + + thread_affinity_set(mac_srs->srs_worker, cpuid); + DTRACE_PROBE1(worker__CPU, processorid_t, cpuid); +} + +static void +mac_srs_poll_bind(mac_soft_ring_set_t *mac_srs, processorid_t cpuid) +{ + cpu_t *cp; + boolean_t clear = B_FALSE; + + ASSERT(MUTEX_HELD(&cpu_lock)); + + if (!mac_srs_thread_bind || mac_srs->srs_poll_thr == NULL) + return; + + cp = cpu_get(cpuid); + if (cp == NULL || !cpu_is_online(cp)) + return; + + mutex_enter(&mac_srs->srs_lock); + mac_srs->srs_state |= SRS_POLL_BOUND; + if (mac_srs->srs_poll_cpuid != -1) + clear = B_TRUE; + mac_srs->srs_poll_cpuid = cpuid; + mutex_exit(&mac_srs->srs_lock); + + if (clear) + thread_affinity_clear(mac_srs->srs_poll_thr); + + thread_affinity_set(mac_srs->srs_poll_thr, cpuid); + DTRACE_PROBE1(poll__CPU, processorid_t, cpuid); +} + +/* + * When a CPU comes back online, bind the MAC kernel threads which + * were previously bound to that CPU, and had to be unbound because + * the CPU was going away. + * + * These functions are called with cpu_lock held and hence we can't + * cv_wait to grab the mac perimeter. Since these functions walk the soft + * ring list of an SRS without being in the perimeter, the list itself + * is protected by the SRS lock. 
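+ *
+ * mac_srs_worker_bind() and mac_srs_poll_bind() above follow the
+ * same rules: they assert that cpu_lock is held, check the target
+ * CPU with cpu_get()/cpu_is_online(), and take srs_lock before
+ * recording the new binding.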
+ */ +static void +mac_walk_srs_and_bind(int cpuid) +{ + mac_soft_ring_set_t *mac_srs; + mac_soft_ring_t *soft_ring; + + rw_enter(&mac_srs_g_lock, RW_READER); + + if ((mac_srs = mac_srs_g_list) == NULL) + goto done; + + for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) { + if (mac_srs->srs_worker_cpuid == -1 && + mac_srs->srs_worker_cpuid_save == cpuid) { + mac_srs->srs_worker_cpuid_save = -1; + mac_srs_worker_bind(mac_srs, cpuid); + } + + if (!(mac_srs->srs_type & SRST_TX)) { + if (mac_srs->srs_poll_cpuid == -1 && + mac_srs->srs_poll_cpuid_save == cpuid) { + mac_srs->srs_poll_cpuid_save = -1; + mac_srs_poll_bind(mac_srs, cpuid); + } + } + + /* Next tackle the soft rings associated with the srs */ + mutex_enter(&mac_srs->srs_lock); + for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL; + soft_ring = soft_ring->s_ring_next) { + if (soft_ring->s_ring_cpuid == -1 && + soft_ring->s_ring_cpuid_save == cpuid) { + soft_ring->s_ring_cpuid_save = -1; + (void) mac_soft_ring_bind(soft_ring, cpuid); + } + } + mutex_exit(&mac_srs->srs_lock); + } +done: + rw_exit(&mac_srs_g_lock); +} + +/* + * Change the priority of the SRS's poll and worker thread. Additionally, + * update the priority of the worker threads for the SRS's soft rings. + * Need to modify any associated squeue threads. + */ +void +mac_update_srs_priority(mac_soft_ring_set_t *mac_srs, pri_t prival) +{ + mac_soft_ring_t *ringp; + + mac_srs->srs_pri = prival; + thread_lock(mac_srs->srs_worker); + (void) thread_change_pri(mac_srs->srs_worker, mac_srs->srs_pri, 0); + thread_unlock(mac_srs->srs_worker); + if (mac_srs->srs_poll_thr != NULL) { + thread_lock(mac_srs->srs_poll_thr); + (void) thread_change_pri(mac_srs->srs_poll_thr, + mac_srs->srs_pri, 0); + thread_unlock(mac_srs->srs_poll_thr); + } + if ((ringp = mac_srs->srs_soft_ring_head) == NULL) + return; + while (ringp != mac_srs->srs_soft_ring_tail) { + thread_lock(ringp->s_ring_worker); + (void) thread_change_pri(ringp->s_ring_worker, + mac_srs->srs_pri, 0); + thread_unlock(ringp->s_ring_worker); + ringp = ringp->s_ring_next; + } + ASSERT(ringp == mac_srs->srs_soft_ring_tail); + thread_lock(ringp->s_ring_worker); + (void) thread_change_pri(ringp->s_ring_worker, mac_srs->srs_pri, 0); + thread_unlock(ringp->s_ring_worker); +} + +/* + * Change the receive bandwidth limit. + */ +static void +mac_rx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp) +{ + mac_soft_ring_t *softring; + + mutex_enter(&srs->srs_lock); + mutex_enter(&srs->srs_bw->mac_bw_lock); + + if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { + /* Reset bandwidth limit */ + if (srs->srs_type & SRST_BW_CONTROL) { + softring = srs->srs_soft_ring_head; + while (softring != NULL) { + softring->s_ring_type &= ~ST_RING_BW_CTL; + softring = softring->s_ring_next; + } + srs->srs_type &= ~SRST_BW_CONTROL; + srs->srs_drain_func = mac_rx_srs_drain; + } + } else { + /* Set/Modify bandwidth limit */ + srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw); + /* + * Give twice the queuing capability before + * dropping packets. The unit is bytes/tick. 
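+ *
+ * As a worked example (assuming mrp_maxbw is carried in bits per
+ * second and the clock runs at hz ticks per second): a 100 Mb/s
+ * limit works out to 100000000 / 8 / hz bytes per tick -- 125000
+ * bytes at hz == 100 -- with the drop threshold at twice that,
+ * 250000 bytes.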
+ */
+ srs->srs_bw->mac_bw_drop_threshold =
+ srs->srs_bw->mac_bw_limit << 1;
+ if (!(srs->srs_type & SRST_BW_CONTROL)) {
+ softring = srs->srs_soft_ring_head;
+ while (softring != NULL) {
+ softring->s_ring_type |= ST_RING_BW_CTL;
+ softring = softring->s_ring_next;
+ }
+ srs->srs_type |= SRST_BW_CONTROL;
+ srs->srs_drain_func = mac_rx_srs_drain_bw;
+ }
+ }
+done:
+ mutex_exit(&srs->srs_bw->mac_bw_lock);
+ mutex_exit(&srs->srs_lock);
+}
+
+/* Change the transmit bandwidth limit */
+static void
+mac_tx_srs_update_bwlimit(mac_soft_ring_set_t *srs, mac_resource_props_t *mrp)
+{
+ mac_srs_tx_t *srs_tx = &srs->srs_tx;
+ uint32_t tx_mode;
+ mac_impl_t *mip = srs->srs_mcip->mci_mip;
+
+ mutex_enter(&srs->srs_lock);
+ mutex_enter(&srs->srs_bw->mac_bw_lock);
+
+ tx_mode = srs_tx->st_mode;
+
+ if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
+ /* Reset bandwidth limit */
+ if (tx_mode == SRS_TX_BW) {
+ if (mac_tx_serialize ||
+ (mip->mi_v12n_level & MAC_VIRT_SERIALIZE)) {
+ srs_tx->st_mode = SRS_TX_SERIALIZE;
+ } else {
+ srs_tx->st_mode = SRS_TX_DEFAULT;
+ }
+ } else if (tx_mode == SRS_TX_BW_FANOUT) {
+ srs_tx->st_mode = SRS_TX_FANOUT;
+ }
+ srs->srs_type &= ~SRST_BW_CONTROL;
+ } else {
+ /* Set/Modify bandwidth limit */
+ srs->srs_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
+ /*
+ * Give twice the queuing capability before
+ * dropping packets. The unit is bytes/tick.
+ */
+ srs->srs_bw->mac_bw_drop_threshold =
+ srs->srs_bw->mac_bw_limit << 1;
+ srs->srs_type |= SRST_BW_CONTROL;
+ if (tx_mode != SRS_TX_BW &&
+ tx_mode != SRS_TX_BW_FANOUT) {
+ if (tx_mode == SRS_TX_SERIALIZE ||
+ tx_mode == SRS_TX_DEFAULT) {
+ srs_tx->st_mode = SRS_TX_BW;
+ } else if (tx_mode == SRS_TX_FANOUT) {
+ srs_tx->st_mode = SRS_TX_BW_FANOUT;
+ } else {
+ ASSERT(0);
+ }
+ }
+ }
+done:
+ srs_tx->st_func = mac_tx_get_func(srs_tx->st_mode);
+ mutex_exit(&srs->srs_bw->mac_bw_lock);
+ mutex_exit(&srs->srs_lock);
+}
+
+/*
+ * The uber function that deals with any update to bandwidth limits.
+ */
+void
+mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp)
+{
+ int count;
+
+ for (count = 0; count < flent->fe_rx_srs_cnt; count++)
+ mac_rx_srs_update_bwlimit(flent->fe_rx_srs[count], mrp);
+ mac_tx_srs_update_bwlimit(flent->fe_tx_srs, mrp);
+}
+
+void
+mac_srs_change_upcall(void *arg, mac_direct_rx_t rx_func, void *rx_arg1)
+{
+ mac_soft_ring_set_t *mac_srs = arg;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+ mac_soft_ring_t *softring;
+
+ mutex_enter(&mac_srs->srs_lock);
+ ASSERT((mac_srs->srs_type & SRST_TX) == 0);
+ srs_rx->sr_func = rx_func;
+ srs_rx->sr_arg1 = rx_arg1;
+
+ softring = mac_srs->srs_soft_ring_head;
+ while (softring != NULL) {
+ mutex_enter(&softring->s_ring_lock);
+ softring->s_ring_rx_func = rx_func;
+ softring->s_ring_rx_arg1 = rx_arg1;
+ mutex_exit(&softring->s_ring_lock);
+ softring = softring->s_ring_next;
+ }
+
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * When the first sub-flow is added to a link, we disable polling on the
+ * link and also modify the entry point to mac_rx_srs_subflow_process.
+ * (Polling is disabled because, with the subflow added, accounting
+ * for polling needs additional logic; it is assumed that when a subflow
+ * is added, we can take some hit as a result of disabling polling rather
+ * than adding more complexity - if this becomes a perf. issue we need to
+ * re-evaluate this logic.) When the last subflow is removed, we turn
+ * polling back on and also reset the entry point to mac_rx_srs_process.
+ *
+ * In the future, if there are multiple SRSes, we can simply
+ * take one and give it to the flow rather than disabling polling and
+ * resetting the entry point.
+ */
+void
+mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable)
+{
+ flow_entry_t *flent = mcip->mci_flent;
+ int i;
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_rx_func_t rx_func;
+ uint_t rx_srs_cnt;
+ boolean_t enable_classifier;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ enable_classifier = !FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && enable;
+
+ rx_func = enable_classifier ? mac_rx_srs_subflow_process :
+ mac_rx_srs_process;
+
+ /*
+ * If the receive function has already been configured correctly for
+ * the current subflow configuration, do nothing.
+ */
+ if (flent->fe_cb_fn == (flow_fn_t)rx_func)
+ return;
+
+ rx_srs_cnt = flent->fe_rx_srs_cnt;
+ for (i = 0; i < rx_srs_cnt; i++) {
+ ASSERT(flent->fe_rx_srs[i] != NULL);
+ mac_srs_poll_state_change(flent->fe_rx_srs[i],
+ enable_classifier, rx_func);
+ }
+
+ /*
+ * Change the S/W classifier so that we can land in the
+ * correct processing function with the correct argument.
+ * If all subflows have been removed we can revert to
+ * mac_rx_srs_process, else we need mac_rx_srs_subflow_process.
+ */
+ mutex_enter(&flent->fe_lock);
+ flent->fe_cb_fn = (flow_fn_t)rx_func;
+ flent->fe_cb_arg1 = (void *)mip;
+ flent->fe_cb_arg2 = flent->fe_rx_srs[0];
+ mutex_exit(&flent->fe_lock);
+}
+
+static void
+mac_srs_update_fanout_list(mac_soft_ring_set_t *mac_srs)
+{
+ int tcp_count = 0;
+ int udp_count = 0;
+ int oth_count = 0;
+ mac_soft_ring_t *softring;
+
+ softring = mac_srs->srs_soft_ring_head;
+ if (softring == NULL) {
+ ASSERT(mac_srs->srs_soft_ring_count == 0);
+ mac_srs->srs_tcp_ring_count = 0;
+ mac_srs->srs_udp_ring_count = 0;
+ mac_srs->srs_oth_ring_count = 0;
+ return;
+ }
+
+ softring = mac_srs->srs_soft_ring_head;
+ tcp_count = udp_count = oth_count = 0;
+
+ while (softring != NULL) {
+ if (softring->s_ring_type & ST_RING_TCP)
+ mac_srs->srs_tcp_soft_rings[tcp_count++] = softring;
+ else if (softring->s_ring_type & ST_RING_UDP)
+ mac_srs->srs_udp_soft_rings[udp_count++] = softring;
+ else
+ mac_srs->srs_oth_soft_rings[oth_count++] = softring;
+ softring = softring->s_ring_next;
+ }
+
+ ASSERT(mac_srs->srs_soft_ring_count ==
+ (tcp_count + udp_count + oth_count));
+
+ mac_srs->srs_tcp_ring_count = tcp_count;
+ mac_srs->srs_udp_ring_count = udp_count;
+ mac_srs->srs_oth_ring_count = oth_count;
+}
+
+void
+mac_srs_create_proto_softrings(int id, void *flent, uint16_t type,
+ pri_t pri, mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs,
+ processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2, boolean_t set_bypass)
+{
+ mac_soft_ring_t *softring;
+ mac_rx_fifo_t mrf;
+
+ bzero(&mrf, sizeof (mac_rx_fifo_t));
+ mrf.mrf_type = MAC_RX_FIFO;
+ mrf.mrf_receive = (mac_receive_t)mac_soft_ring_poll;
+ mrf.mrf_intr_enable =
+ (mac_intr_enable_t)mac_soft_ring_intr_enable;
+ mrf.mrf_intr_disable =
+ (mac_intr_disable_t)mac_soft_ring_intr_disable;
+ mrf.mrf_flow_priority = pri;
+
+ softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
+ (void *)flent, (type|ST_RING_TCP), pri, mcip, mac_srs,
+ cpuid, rx_func, x_arg1, x_arg2);
+ softring->s_ring_rx_arg2 = NULL;
+
+ /*
+ * TCP and UDP support DLS bypass. In addition, TCP
+ * squeues can also poll their corresponding soft rings.
+ */
+ if (set_bypass && (mcip->mci_resource_arg != NULL)) {
+ mac_soft_ring_dls_bypass(softring,
+ mcip->mci_direct_rx_fn,
+ mcip->mci_direct_rx_arg);
+
+ mrf.mrf_rx_arg = softring;
+ mrf.mrf_intr_handle = (mac_intr_handle_t)softring;
+
+ /*
+ * Make a call in IP to get a TCP squeue assigned to
+ * this softring to maintain full CPU locality through
+ * the stack and allow the squeue to be able to poll
+ * the softring so the flow control can be pushed
+ * all the way to H/W.
+ */
+ softring->s_ring_rx_arg2 =
+ mcip->mci_resource_add((void *)mcip->mci_resource_arg,
+ (mac_resource_t *)&mrf);
+ }
+
+ /*
+ * Non-TCP protocols don't support squeues. Hence we
+ * don't make any ring addition callbacks for non-TCP
+ * rings. Now create the UDP softring and allow it to
+ * bypass the DLS layer.
+ */
+ softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
+ (void *)flent, (type|ST_RING_UDP), pri, mcip, mac_srs,
+ cpuid, rx_func, x_arg1, x_arg2);
+ softring->s_ring_rx_arg2 = NULL;
+
+ if (set_bypass && (mcip->mci_resource_arg != NULL)) {
+ mac_soft_ring_dls_bypass(softring,
+ mcip->mci_direct_rx_fn,
+ mcip->mci_direct_rx_arg);
+ }
+
+ /* Create the OTH softring, which has to go through DLS */
+ softring = mac_soft_ring_create(id, mac_soft_ring_worker_wait,
+ (void *)flent, (type|ST_RING_OTH), pri, mcip, mac_srs,
+ cpuid, rx_func, x_arg1, x_arg2);
+ softring->s_ring_rx_arg2 = NULL;
+}
+
+/*
+ * This routine associates a CPU or a set of CPUs with the processing
+ * of incoming traffic from a mac client. If multiple CPUs are
+ * specified, then that many soft rings are created, with each soft
+ * ring worker thread bound to a CPU in the set. Each soft ring in
+ * turn will be associated with an squeue, and the squeue will be
+ * moved to the same CPU as the soft ring's worker thread.
+ */
+static void
+mac_srs_fanout_modify(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2, mac_soft_ring_set_t *mac_rx_srs,
+ mac_soft_ring_set_t *mac_tx_srs)
+{
+ mac_soft_ring_t *softring;
+ uint32_t soft_ring_flag = soft_ring_process_flag;
+ processorid_t cpuid = -1;
+ boolean_t user_specified;
+ int i, srings_present, new_fanout_cnt;
+ mac_cpus_t *srs_cpu;
+
+ user_specified = mrp->mrp_mask & MRP_CPUS_USERSPEC;
+ /* fanout state is REINIT. Set it back to INIT */
+ ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_REINIT);
+ mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;
+
+ /* how many are present right now */
+ srings_present = mac_rx_srs->srs_tcp_ring_count;
+ /* new request */
+ srs_cpu = &mac_rx_srs->srs_cpu;
+ new_fanout_cnt = srs_cpu->mc_fanout_cnt;
+
+ mutex_enter(&mac_rx_srs->srs_lock);
+ if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
+ soft_ring_flag |= ST_RING_BW_CTL;
+ mutex_exit(&mac_rx_srs->srs_lock);
+
+ if (new_fanout_cnt > srings_present) {
+ /* soft rings increased */
+ mutex_enter(&mac_rx_srs->srs_lock);
+ mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
+ mutex_exit(&mac_rx_srs->srs_lock);
+
+ for (i = mac_rx_srs->srs_tcp_ring_count;
+ i < new_fanout_cnt; i++) {
+ /*
+ * Create the protocol softrings and set the
+ * DLS bypass where possible.
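+ *
+ * Per mac_srs_create_proto_softrings() above, each fanout
+ * index yields three softrings: a TCP softring (DLS bypass,
+ * optionally polled by a TCP squeue), a UDP softring (DLS
+ * bypass only), and an OTH softring that goes through DLS.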
+ */
+ mac_srs_create_proto_softrings(i,
+ (void *)flent, soft_ring_flag,
+ mac_rx_srs->srs_pri, mcip, mac_rx_srs, cpuid,
+ rx_func, x_arg1, x_arg2, B_TRUE);
+ }
+ mac_srs_update_fanout_list(mac_rx_srs);
+ } else if (new_fanout_cnt < srings_present) {
+ /* soft rings decreased */
+ if (new_fanout_cnt == 1) {
+ mutex_enter(&mac_rx_srs->srs_lock);
+ mac_rx_srs->srs_type &= ~SRST_FANOUT_SRC_IP;
+ ASSERT(mac_rx_srs->srs_type & SRST_FANOUT_PROTO);
+ mutex_exit(&mac_rx_srs->srs_lock);
+ }
+ /* Get rid of extra soft rings */
+ for (i = new_fanout_cnt;
+ i < mac_rx_srs->srs_tcp_ring_count; i++) {
+ softring = mac_rx_srs->srs_tcp_soft_rings[i];
+ if (softring->s_ring_rx_arg2 != NULL) {
+ mcip->mci_resource_remove(
+ (void *)mcip->mci_resource_arg,
+ softring->s_ring_rx_arg2);
+ }
+ mac_soft_ring_remove(mac_rx_srs,
+ mac_rx_srs->srs_tcp_soft_rings[i]);
+ mac_soft_ring_remove(mac_rx_srs,
+ mac_rx_srs->srs_udp_soft_rings[i]);
+ mac_soft_ring_remove(mac_rx_srs,
+ mac_rx_srs->srs_oth_soft_rings[i]);
+ }
+ mac_srs_update_fanout_list(mac_rx_srs);
+ }
+
+ ASSERT(new_fanout_cnt == mac_rx_srs->srs_tcp_ring_count);
+ mutex_enter(&cpu_lock);
+ for (i = 0; i < mac_rx_srs->srs_tcp_ring_count; i++) {
+ cpuid = srs_cpu->mc_fanout_cpus[i];
+ (void) mac_soft_ring_bind(mac_rx_srs->srs_udp_soft_rings[i],
+ cpuid);
+ (void) mac_soft_ring_bind(mac_rx_srs->srs_oth_soft_rings[i],
+ cpuid);
+ (void) mac_soft_ring_bind(mac_rx_srs->srs_tcp_soft_rings[i],
+ cpuid);
+ softring = mac_rx_srs->srs_tcp_soft_rings[i];
+ if (softring->s_ring_rx_arg2 != NULL) {
+ mcip->mci_resource_bind((void *)mcip->mci_resource_arg,
+ softring->s_ring_rx_arg2, cpuid);
+ }
+ }
+
+ mac_srs_worker_bind(mac_rx_srs, srs_cpu->mc_workerid);
+ mac_srs_poll_bind(mac_rx_srs, srs_cpu->mc_pollid);
+
+ /*
+ * Bind the Tx SRS and soft ring threads too. Bind the Tx
+ * SRS to the last CPU in the mrp list.
+ */
+ if (mac_tx_srs != NULL && user_specified) {
+ BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
+ }
+ mutex_exit(&cpu_lock);
+}
+
+/*
+ * Bind SRS threads and soft rings to CPUs/create fanout list.
+ */
+void
+mac_srs_fanout_init(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2, mac_soft_ring_set_t *mac_rx_srs,
+ mac_soft_ring_set_t *mac_tx_srs)
+{
+ int i;
+ processorid_t cpuid, worker_cpuid, poll_cpuid;
+ uint32_t soft_ring_flag = soft_ring_process_flag;
+ int soft_ring_cnt;
+ boolean_t user_specified = B_FALSE;
+ mac_cpus_t *srs_cpu = &mac_rx_srs->srs_cpu;
+
+ /*
+ * Remove the no soft ring flag and we will adjust it
+ * appropriately further down.
+ */
+ mutex_enter(&mac_rx_srs->srs_lock);
+ mac_rx_srs->srs_type &= ~SRST_NO_SOFT_RINGS;
+ mutex_exit(&mac_rx_srs->srs_lock);
+
+ ASSERT(mac_rx_srs->srs_soft_ring_head == NULL);
+
+ if (mac_rx_srs->srs_type & SRST_BW_CONTROL)
+ soft_ring_flag |= ST_RING_BW_CTL;
+
+ ASSERT(mac_rx_srs->srs_fanout_state == SRS_FANOUT_UNINIT);
+ mac_rx_srs->srs_fanout_state = SRS_FANOUT_INIT;
+ user_specified = mrp->mrp_mask & MRP_CPUS_USERSPEC;
+ /*
+ * Ring count can be 0 if no fanout is required and no CPUs
+ * were specified.
+ * Leave the SRS worker and poll thread unbound.
+ */
+ ASSERT(mrp != NULL);
+ soft_ring_cnt = srs_cpu->mc_fanout_cnt;
+
+ /* srs_cpu contains the list of CPUs the threads need to bind to */
+ if (soft_ring_cnt > 0) {
+ mutex_enter(&cpu_lock);
+ for (i = 0; i < soft_ring_cnt; i++) {
+ cpuid = srs_cpu->mc_fanout_cpus[i];
+ /* Create the protocol softrings */
+ mac_srs_create_proto_softrings(i, (void *)flent,
+ soft_ring_flag, mac_rx_srs->srs_pri,
+ mcip, mac_rx_srs, cpuid, rx_func,
+ x_arg1, x_arg2, B_FALSE);
+ }
+ worker_cpuid = srs_cpu->mc_workerid;
+ poll_cpuid = srs_cpu->mc_pollid;
+ mac_srs_worker_bind(mac_rx_srs, worker_cpuid);
+ mac_srs_poll_bind(mac_rx_srs, poll_cpuid);
+
+ /*
+ * Bind the Tx SRS and soft ring threads too.
+ * Bind the Tx SRS to the last CPU in the
+ * mrp list.
+ */
+ if (mac_tx_srs == NULL) {
+ mutex_exit(&cpu_lock);
+ goto alldone;
+ }
+
+ if (user_specified) {
+ BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp);
+ }
+ mutex_exit(&cpu_lock);
+ } else {
+ mutex_enter(&cpu_lock);
+ /*
+ * For a subflow, mrp_workerid and mrp_pollid
+ * are not set.
+ */
+ mac_srs_worker_bind(mac_rx_srs, mrp->mrp_workerid);
+ mac_srs_poll_bind(mac_rx_srs, mrp->mrp_pollid);
+ mutex_exit(&cpu_lock);
+ goto no_softrings;
+ }
+
+alldone:
+ if (soft_ring_cnt > 1)
+ mac_rx_srs->srs_type |= SRST_FANOUT_SRC_IP;
+ mac_srs_update_fanout_list(mac_rx_srs);
+ mac_srs_client_poll_enable(mcip, mac_rx_srs);
+ return;
+
+no_softrings:
+ if (mac_rx_srs->srs_type & SRST_FANOUT_PROTO) {
+ mutex_enter(&cpu_lock);
+ cpuid = mac_next_bind_cpu();
+ /* Create the protocol softrings */
+ mac_srs_create_proto_softrings(0, (void *)flent,
+ soft_ring_flag, mac_rx_srs->srs_pri,
+ mcip, mac_rx_srs, cpuid, rx_func,
+ x_arg1, x_arg2, B_FALSE);
+ mutex_exit(&cpu_lock);
+ } else {
+ /*
+ * This is the no-fanout case, which holds for
+ * subflows.
+ */
+ mac_rx_srs->srs_type |= SRST_NO_SOFT_RINGS;
+ }
+ mac_srs_update_fanout_list(mac_rx_srs);
+ mac_srs_client_poll_enable(mcip, mac_rx_srs);
+}
+
+/*
+ * mac_fanout_setup:
+ *
+ * Calls mac_srs_fanout_init() or mac_srs_fanout_modify() depending upon
+ * whether the SRS is getting initialized or re-initialized.
+ */
+void
+mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_resource_props_t *mrp, mac_direct_rx_t rx_func, void *x_arg1,
+ mac_resource_handle_t x_arg2)
+{
+ mac_soft_ring_set_t *mac_rx_srs, *mac_tx_srs;
+ int i, rx_srs_cnt;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
+ /*
+ * This is an aggregation port. Fanout will be set up
+ * over the aggregation itself.
+ */
+ if (mcip->mci_state_flags & MCIS_IS_AGGR_PORT)
+ return;
+
+ mac_rx_srs = flent->fe_rx_srs[0];
+ /*
+ * Set up the fanout on the tx side only once, with the
+ * first rx SRS. The CPU binding, fanout, and bandwidth
+ * criteria are common to both RX and TX, so
+ * initializing them alongside avoids redundant code.
+ */
+ mac_tx_srs = flent->fe_tx_srs;
+ rx_srs_cnt = flent->fe_rx_srs_cnt;
+
+ /* No fanout for subflows */
+ if (flent->fe_type & FLOW_USER) {
+ mac_srs_fanout_init(mcip, flent, mrp, rx_func,
+ x_arg1, x_arg2, mac_rx_srs, mac_tx_srs);
+ return;
+ }
+
+ mac_flow_cpu_init(flent, mrp);
+
+ /*
+ * Set up fanout for both SW (0th SRS) and HW classified
+ * SRS (the rest of the Rx SRSes in flent).
+ */ + for (i = 0; i < rx_srs_cnt; i++) { + mac_rx_srs = flent->fe_rx_srs[i]; + if (i != 0) + mac_tx_srs = NULL; + switch (mac_rx_srs->srs_fanout_state) { + case SRS_FANOUT_UNINIT: + mac_srs_fanout_init(mcip, flent, mrp, rx_func, + x_arg1, x_arg2, mac_rx_srs, mac_tx_srs); + break; + case SRS_FANOUT_INIT: + break; + case SRS_FANOUT_REINIT: + mac_rx_srs_quiesce(mac_rx_srs, SRS_QUIESCE); + mac_srs_fanout_modify(mcip, flent, mrp, rx_func, + x_arg1, x_arg2, mac_rx_srs, mac_tx_srs); + mac_rx_srs_restart(mac_rx_srs); + break; + default: + VERIFY(mac_rx_srs->srs_fanout_state <= + SRS_FANOUT_REINIT); + break; + } + } +} + +/* + * mac_create_soft_ring_set: + * + * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is + * SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side + * processing is created. + * + * Details on Rx SRS: + * Create a SRS and also add the necessary soft rings for TCP and + * non-TCP based on fanout type and count specified. + * + * mac_soft_ring_fanout, mac_srs_fanout_modify (?), + * mac_soft_ring_stop_workers, mac_soft_ring_set_destroy, etc need + * to be heavily modified. + * + * mi_soft_ring_list_size, mi_soft_ring_size, etc need to disappear. + */ +mac_soft_ring_set_t * +mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, + mac_direct_rx_t rx_func, void *x_arg1, mac_resource_handle_t x_arg2, + mac_ring_t *ring) +{ + mac_soft_ring_set_t *mac_srs; + mac_srs_rx_t *srs_rx; + mac_srs_tx_t *srs_tx; + mac_bw_ctl_t *mac_bw; + mac_resource_props_t *mrp; + boolean_t is_tx_srs = ((srs_type & SRST_TX) != 0); + + mac_srs = kmem_cache_alloc(mac_srs_cache, KM_SLEEP); + bzero(mac_srs, sizeof (mac_soft_ring_set_t)); + srs_rx = &mac_srs->srs_rx; + srs_tx = &mac_srs->srs_tx; + + mutex_enter(&flent->fe_lock); + + /* + * Get the bandwidth control structure from the flent. Get + * rid of any residual values in the control structure for + * the tx bw struct and also for the rx, if the rx srs is + * the 1st one being brought up (the rx bw ctl struct may + * be shared by multiple SRSs) + */ + if (is_tx_srs) { + mac_srs->srs_bw = &flent->fe_tx_bw; + bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t)); + flent->fe_tx_srs = mac_srs; + } else { + /* + * The bw counter (stored in the flent) is shared + * by SRS's within an rx group. + */ + mac_srs->srs_bw = &flent->fe_rx_bw; + /* First rx SRS, clear the bw structure */ + if (flent->fe_rx_srs_cnt == 0) + bzero(mac_srs->srs_bw, sizeof (mac_bw_ctl_t)); + ASSERT(flent->fe_rx_srs_cnt < MAX_RINGS_PER_GROUP); + flent->fe_rx_srs[flent->fe_rx_srs_cnt] = mac_srs; + flent->fe_rx_srs_cnt++; + } + mac_srs->srs_flent = flent; + mutex_exit(&flent->fe_lock); + + mac_srs->srs_state = 0; + mac_srs->srs_type = (srs_type | SRST_NO_SOFT_RINGS); + mac_srs->srs_worker_cpuid = mac_srs->srs_worker_cpuid_save = -1; + mac_srs->srs_poll_cpuid = mac_srs->srs_poll_cpuid_save = -1; + mac_srs_fanout_list_alloc(mac_srs); + + /* + * For a flow we use the underlying MAC client's priority range with + * the priority value to find an absolute priority value. For a MAC + * client we use the MAC client's maximum priority as the value. + */ + mrp = &flent->fe_effective_props; + if ((mac_srs->srs_type & SRST_FLOW) != 0) { + mac_srs->srs_pri = FLOW_PRIORITY(mcip->mci_min_pri, + mcip->mci_max_pri, mrp->mrp_priority); + } else { + mac_srs->srs_pri = mcip->mci_max_pri; + } + mac_srs->srs_mcip = mcip; + /* + * We need to insert the SRS in the global list before + * binding the SRS and SR threads. 
+ * Otherwise there is a small window where the cpu reconfig
+ * callbacks may miss the SRS in the list walk and DR could
+ * fail as there are bound threads.
+ */
+ mac_srs_add_glist(mac_srs);
+
+ /* Initialize bw limit */
+ if ((mrp->mrp_mask & MRP_MAXBW) != 0) {
+ mac_srs->srs_drain_func = mac_rx_srs_drain_bw;
+
+ mac_bw = mac_srs->srs_bw;
+ mutex_enter(&mac_bw->mac_bw_lock);
+ mac_bw->mac_bw_limit = FLOW_BYTES_PER_TICK(mrp->mrp_maxbw);
+
+ /*
+ * Give twice the queuing capability before
+ * dropping packets. The unit is bytes/tick.
+ */
+ mac_bw->mac_bw_drop_threshold = mac_bw->mac_bw_limit << 1;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ mac_srs->srs_type |= SRST_BW_CONTROL;
+ } else {
+ mac_srs->srs_drain_func = mac_rx_srs_drain;
+ }
+
+ /*
+ * We use the following policy to control Receive
+ * Side Dynamic Polling:
+ * 1) We switch to poll mode anytime the processing thread causes
+ * a backlog to build up in the SRS and its associated Soft Rings
+ * (sr_poll_pkt_cnt > 0).
+ * 2) As long as the backlog stays under the low water mark
+ * (sr_lowat), we poll the H/W for more packets.
+ * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low water mark, we
+ * stay in poll mode but don't poll the H/W for more packets.
+ * 4) Anytime in polling mode, if we poll the H/W for packets and
+ * find nothing plus we have an existing backlog
+ * (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll
+ * the H/W for packets anymore (let the polling thread go to sleep).
+ * 5) Once the backlog is relieved (packets are processed) we reenable
+ * polling (by signalling the poll thread) only when the backlog
+ * dips below sr_poll_thres.
+ * 6) sr_hiwat is used exclusively when we are not polling capable
+ * and is used to decide when to drop packets so the SRS queue
+ * length doesn't grow infinitely.
+ */
+ if (!is_tx_srs) {
+ srs_rx->sr_hiwat = mac_soft_ring_max_q_cnt;
+ /* Low water mark needs to be less than high water mark */
+ srs_rx->sr_lowat = mac_soft_ring_min_q_cnt <=
+ mac_soft_ring_max_q_cnt ? mac_soft_ring_min_q_cnt :
+ (mac_soft_ring_max_q_cnt >> 2);
+ /* Poll threshold needs to be half of low water mark or less */
+ srs_rx->sr_poll_thres = mac_soft_ring_poll_thres <=
+ (srs_rx->sr_lowat >> 1) ? mac_soft_ring_poll_thres :
+ (srs_rx->sr_lowat >> 1);
+ if (mac_latency_optimize)
+ mac_srs->srs_state |= SRS_LATENCY_OPT;
+ }
+
+ mac_srs->srs_worker = thread_create(NULL, 0,
+ mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri);
+
+ if (is_tx_srs) {
+ /* Handle everything about Tx SRS and return */
+ mac_srs->srs_drain_func = mac_tx_srs_drain;
+ srs_tx->st_max_q_cnt = mac_tx_srs_max_q_cnt;
+ srs_tx->st_hiwat =
+ (mac_tx_srs_hiwat > mac_tx_srs_max_q_cnt) ?
+ mac_tx_srs_max_q_cnt : mac_tx_srs_hiwat;
+ srs_tx->st_arg1 = x_arg1;
+ srs_tx->st_arg2 = x_arg2;
+ return (mac_srs);
+ }
+
+ if ((srs_type & SRST_FLOW) != 0 ||
+ FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+ srs_rx->sr_lower_proc = mac_rx_srs_process;
+ else
+ srs_rx->sr_lower_proc = mac_rx_srs_subflow_process;
+
+ srs_rx->sr_func = rx_func;
+ srs_rx->sr_arg1 = x_arg1;
+ srs_rx->sr_arg2 = x_arg2;
+
+ if (ring != NULL) {
+ /* Is the mac_srs created over the RX default group?
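+ * (mi_rx_groups[0] is the default group; an SRS over one of its
+ * rings is marked SRST_DEFAULT_GRP and handles the traffic that
+ * is not hardware classified to a reserved group.)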
+ */
+ if (ring->mr_gh == (mac_group_handle_t)
+ (&mcip->mci_mip->mi_rx_groups[0]))
+ mac_srs->srs_type |= SRST_DEFAULT_GRP;
+
+ mac_srs->srs_ring = ring;
+ ring->mr_srs = mac_srs;
+ ring->mr_classify_type = MAC_HW_CLASSIFIER;
+ ring->mr_flag |= MR_INCIPIENT;
+
+ if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+ mac_srs->srs_state |= SRS_POLLING_CAPAB;
+
+ mac_srs->srs_poll_thr = thread_create(NULL, 0,
+ mac_rx_srs_poll_ring, mac_srs, 0, &p0, TS_RUN,
+ mac_srs->srs_pri);
+ }
+ return (mac_srs);
+}
+
+/*
+ * Figure out the number of soft rings required. It depends on
+ * whether protocol fanout is required (for LINKs), whether global
+ * settings require us to do fanout for performance (based on
+ * mac_soft_ring_enable), or whether the user has specifically
+ * requested fanout.
+ */
+static uint32_t
+mac_find_fanout(flow_entry_t *flent, uint32_t link_type)
+{
+ uint32_t fanout_type;
+ mac_resource_props_t *mrp = &flent->fe_effective_props;
+
+ /* no fanout for subflows */
+ switch (link_type) {
+ case SRST_FLOW:
+ fanout_type = SRST_NO_SOFT_RINGS;
+ break;
+ case SRST_LINK:
+ fanout_type = SRST_FANOUT_PROTO;
+ break;
+ }
+
+ /* A primary NIC/link is being plumbed */
+ if (flent->fe_type & FLOW_PRIMARY_MAC) {
+ if (mac_soft_ring_enable && mac_rx_soft_ring_count > 1) {
+ fanout_type |= SRST_FANOUT_SRC_IP;
+ }
+ } else if (flent->fe_type & FLOW_VNIC) {
+ /* A VNIC is being created */
+ if (mrp != NULL && mrp->mrp_ncpus > 0) {
+ fanout_type |= SRST_FANOUT_SRC_IP;
+ }
+ }
+
+ return (fanout_type);
+}
+
+/*
+ * Change a group from h/w to s/w classification.
+ */
+static void
+mac_rx_switch_grp_to_sw(mac_group_t *group)
+{
+ mac_ring_t *ring;
+ mac_soft_ring_set_t *mac_srs;
+
+ for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
+ if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
+ /*
+ * Remove the SRS associated with the HW ring.
+ * As a result, polling will be disabled.
+ */
+ mac_srs = ring->mr_srs;
+ ASSERT(mac_srs != NULL);
+ mac_rx_srs_remove(mac_srs);
+ ring->mr_srs = NULL;
+ }
+
+ if (ring->mr_state != MR_INUSE)
+ (void) mac_start_ring(ring);
+ /*
+ * We need to perform SW classification
+ * for packets landing in these rings
+ */
+ ring->mr_state = MR_INUSE;
+ ring->mr_flag = 0;
+ ring->mr_classify_type = MAC_SW_CLASSIFIER;
+ }
+}
+
+/*
+ * Create the Rx SRS for the S/W classifier and for each ring in the
+ * group (if it is an exclusive group). Also create the Tx SRS.
+ */
+void
+mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ mac_group_t *group, uint32_t link_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_soft_ring_set_t *mac_srs;
+ mac_soft_ring_set_t *tx_srs = NULL;
+ mac_ring_t *ring;
+ uint32_t fanout_type;
+ boolean_t created_srs = B_FALSE;
+
+ fanout_type = mac_find_fanout(flent, link_type);
+
+ /* Create the SRS for S/W classification if none exists */
+ if (flent->fe_rx_srs[0] == NULL) {
+ ASSERT(flent->fe_rx_srs_cnt == 0);
+ /* Setup the Rx SRS */
+ mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type,
+ mac_rx_deliver, mcip, NULL, NULL);
+
+ mutex_enter(&flent->fe_lock);
+ flent->fe_cb_fn = (flow_fn_t)mac_srs->srs_rx.sr_lower_proc;
+ flent->fe_cb_arg1 = (void *)mip;
+ flent->fe_cb_arg2 = (void *)mac_srs;
+ mutex_exit(&flent->fe_lock);
+
+ /* Setup the Tx SRS as well */
+ ASSERT(flent->fe_tx_srs == NULL);
+ tx_srs = mac_srs_create(mcip, flent, SRST_TX | link_type,
+ NULL, mcip, NULL, NULL);
+
+ if (mcip->mci_share != NULL) {
+ mac_srs_tx_t *tx = &tx_srs->srs_tx;
+ ASSERT(!mcip->mci_no_hwrings);
+ /*
+ * A share requires a dedicated TX group.
+ * mac_reserve_tx_group() does the work needed to
+ * allocate a new group and populate that group
+ * with rings according to the driver requirements
+ * and limitations.
+ */
+ tx->st_group =
+ mac_reserve_tx_group(mip, mcip->mci_share);
+ ASSERT(tx->st_group != NULL);
+ tx->st_group->mrg_tx_client = mcip;
+ }
+ mac_tx_srs_setup(mcip, flent, link_type);
+ created_srs = B_TRUE;
+ }
+
+ if (group == NULL) {
+ if (created_srs) {
+ mac_fanout_setup(mcip, flent,
+ MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver,
+ mcip, NULL);
+ }
+ return;
+ }
+
+ /*
+ * Fanout for the default SRS is done when the default SRSes are
+ * created above. As each ring is added to the group, we set up
+ * the SRS and fanout for it.
+ */
+ switch (group->mrg_state) {
+ case MAC_GROUP_STATE_RESERVED:
+ /*
+ * The group is exclusively ours. Create an SRS
+ * for each ring in the group and allow the
+ * individual SRSes to dynamically poll their
+ * Rx rings. Do this only if the client is not
+ * a VLAN MAC client, since for VLANs we do
+ * s/w classification for the VID check.
+ */
+ if (i_mac_flow_vid(mcip->mci_flent) != VLAN_ID_NONE)
+ break;
+ for (ring = group->mrg_rings; ring != NULL;
+ ring = ring->mr_next) {
+ switch (ring->mr_state) {
+ case MR_INUSE:
+ case MR_FREE:
+ if (ring->mr_srs != NULL)
+ break;
+ if (ring->mr_state != MR_INUSE)
+ (void) mac_start_ring(ring);
+
+ ring->mr_state = MR_INUSE;
+
+ mac_srs = mac_srs_create(mcip, flent,
+ fanout_type | link_type,
+ mac_rx_deliver, mcip, NULL, ring);
+ if (mip->mi_v12n_level & MAC_VIRT_SERIALIZE) {
+ mac_srs->srs_rx.sr_enqueue_always =
+ B_TRUE;
+ }
+ break;
+ default:
+ cmn_err(CE_PANIC, "srs_setup: mcip = %p "
+ "trying to add UNKNOWN ring = %p\n",
+ (void *)mcip, (void *)ring);
+ break;
+ }
+ }
+ break;
+ case MAC_GROUP_STATE_SHARED:
+ /*
+ * Set all rings of this group to software classified.
+ *
+ * If the group is currently RESERVED, the existing mac
+ * client (the only client on this group) is using this
+ * group exclusively. In that case we need to disable
+ * polling on the rings of the group (if it was enabled),
+ * and free the SRS associated with the rings.
+ */
+ mac_rx_switch_grp_to_sw(group);
+ break;
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+ mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip),
+ mac_rx_deliver, mcip, NULL);
+}
+
+void
+mac_srs_group_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t link_type)
+{
+ mac_soft_ring_set_t *mac_srs;
+ mac_soft_ring_set_t *tx_srs;
+ mac_srs_tx_t *tx;
+ int i;
+
+ for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
+ mac_srs = flent->fe_rx_srs[i];
+ mac_rx_srs_quiesce(mac_srs, SRS_CONDEMNED);
+ /*
+ * Deal with all fanout tear down etc.
+ */
+ mac_srs_free(mac_srs);
+ flent->fe_rx_srs[i] = NULL;
+ }
+ flent->fe_rx_srs_cnt = 0;
+
+ tx_srs = flent->fe_tx_srs;
+ tx = &tx_srs->srs_tx;
+ switch (link_type) {
+ case SRST_FLOW:
+ /*
+ * For flows, we need to work with the passed
+ * flent to find the Rx/Tx SRS.
+ */
+ mac_tx_srs_quiesce(tx_srs, SRS_CONDEMNED);
+ break;
+ case SRST_LINK:
+ mac_tx_client_quiesce(mcip, SRS_CONDEMNED);
+ /*
+ * Release the TX resources. First the TX group, if any
+ * was assigned to the MAC client, which will cause the
+ * TX rings to be moved back to the pool. Then free the
+ * rings themselves.
+ */
+ if (tx->st_group != NULL) {
+ mac_release_tx_group(tx_srs->srs_mcip->mci_mip,
+ tx->st_group);
+ tx->st_group = NULL;
+ }
+ if (tx->st_arg2 != NULL) {
+ ASSERT(tx_srs->srs_type & SRST_TX);
+ mac_release_tx_ring(tx->st_arg2);
+ }
+ break;
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+ mac_srs_free(tx_srs);
+ flent->fe_tx_srs = NULL;
+}
+
+/*
+ * This is the group state machine. The state of an Rx group is given by
+ * the following table. The default group and its rings are started in
+ * mac_start itself and the default group stays in SHARED state until
+ * mac_stop, at which time the group and rings are stopped and it
+ * reverts to the REGISTERED state.
+ *
+ * Typically this function is called on a group after adding or removing a
+ * client from it, to find out what should be the new state of the group.
+ * If the new state is RESERVED, then the client that owns this group
+ * exclusively is also returned. Note that adding or removing a client from
+ * a group could also impact the default group and the caller needs to
+ * evaluate the effect on the default group.
+ *
+ * Group type # of clients mi_nactiveclients Group State
+ * in the group
+ *
+ * Non-default 0 N.A. REGISTERED
+ * Non-default 1 N.A. RESERVED
+ * Non-default > 1 N.A. SHARED
+ *
+ * Default 0 N.A. SHARED
+ * Default 1 1 RESERVED
+ * Default 1 > 1 SHARED
+ * Default > 1 N.A. SHARED
+ */
+mac_group_state_t
+mac_rx_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip)
+{
+ mac_impl_t *mip = (mac_impl_t *)grp->mrg_mh;
+
+ *group_only_mcip = NULL;
+
+ /* Non-default group */
+
+ if (grp != mip->mi_rx_groups) {
+ if (MAC_RX_GROUP_NO_CLIENT(grp))
+ return (MAC_GROUP_STATE_REGISTERED);
+
+ *group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(grp);
+ if (*group_only_mcip != NULL)
+ return (MAC_GROUP_STATE_RESERVED);
+
+ return (MAC_GROUP_STATE_SHARED);
+ }
+
+ /* Default group */
+
+ if (MAC_RX_GROUP_NO_CLIENT(grp) || mip->mi_nactiveclients != 1)
+ return (MAC_GROUP_STATE_SHARED);
+
+ *group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(grp);
+ ASSERT(*group_only_mcip != NULL);
+ return (MAC_GROUP_STATE_RESERVED);
+}
+
+/*
+ * OVERVIEW NOTES FOR DATAPATH
+ * ===========================
+ *
+ * Create an SRS and set up the corresponding flow function and args.
+ * Add a classification rule for the flow specified by 'flent' and program
+ * the hardware classifier when applicable.
+ *
+ * Rx ring assignment, SRS, polling and B/W enforcement
+ * ----------------------------------------------------
+ *
+ * We try to use H/W classification on the NIC to assign traffic for a
+ * MAC address to a particular Rx ring. There is a 1-1 mapping
+ * between a SRS and a Rx ring. The SRS (short for soft ring set)
+ * dynamically switches the underlying Rx ring between interrupt
+ * and polling mode and enforces any specified B/W control.
+ *
+ * There is always a SRS created and tied to each H/W and S/W rule.
+ * Whenever we create a H/W rule, we always add the same rule to the
+ * S/W classifier and tie a SRS to it.
+ *
+ * In case a B/W control is specified, it's broken into bytes
+ * per tick and as soon as the quota for a tick is exhausted,
+ * the underlying Rx ring is forced into poll mode for the remaining
+ * tick. The SRS poll thread only polls for bytes that are
+ * allowed to come in the SRS. We typically let 4x the configured
+ * B/W worth of packets come into the SRS (to prevent unnecessary
+ * drops due to bursts) but only process the specified amount.
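+ *
+ * For example (assuming the B/W limit is given in bits per second
+ * and hz == 100): a 10 Mb/s limit allows roughly 12500 bytes per
+ * tick; once a tick's quota is used up, the ring stays in poll
+ * mode for the rest of that tick, while up to 4x that amount may
+ * be queued to absorb bursts.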
+ *
+ * A Link (primary NIC, VNIC, VLAN or aggr) can have 1 or more
+ * Rx rings (and corresponding SRSes) assigned to it. The SRS
+ * in turn can have softrings to do protocol level fanout or
+ * softrings to do S/W based fanout or both. In case the NIC
+ * has no Rx rings, we do S/W classification to the respective SRS.
+ * The S/W classification rule is always set up and ready. This
+ * allows the MAC layer to reassign Rx rings whenever needed
+ * while packets still continue to flow via the default path and
+ * get S/W classified to the correct SRS.
+ *
+ * In other cases where a NIC or VNIC is plumbed, our goal is to use
+ * the H/W classifier and get two Rx rings assigned for the Link. One
+ * for TCP and one for UDP|SCTP. The respective SRSes still do the
+ * polling on the Rx ring. For a Link that is plumbed for IP, there
+ * is a TCP squeue which also does polling and can control the
+ * Rx ring directly (where the SRS is just a pass through). For
+ * the following cases, the SRS does the polling underneath.
+ * 1) non IP based Links (Links which are not plumbed via ifconfig)
+ * and paths which have no IP squeues (UDP & SCTP)
+ * 2) If B/W control is specified on the Link
+ * 3) If S/W fanout is specified
+ *
+ * Note1: In the current implementation, we try to assign only 1 Rx
+ * ring per Link and more than 1 Rx ring for the primary Link for
+ * H/W based fanout. We always create the following softrings per SRS:
+ * 1) TCP softring which is polled by the TCP squeue where possible
+ * (and also bypasses DLS)
+ * 2) UDP/SCTP based which bypasses DLS
+ * 3) OTH softring which goes via DLS (currently deals with IPv6
+ * and non TCP/UDP/SCTP for IPv4 packets).
+ *
+ * It is necessary to create 3 softrings since the SRS has to poll
+ * the single Rx ring underneath and enforce any link level B/W
+ * control (we can't switch the Rx ring into poll mode just based
+ * on the TCP squeue if the same Rx ring is sharing UDP and other
+ * traffic as well). Once polling is done and any Link level B/W
+ * control is specified, the packets are assigned to the respective
+ * softring based on protocol. Since TCP has an IP based squeue
+ * which benefits from polling, we separate TCP packets into
+ * their own softring which can be polled by the IP squeue. We need
+ * to separate out UDP/SCTP into the UDP softring since it can bypass
+ * the DLS layer, which has heavy performance advantages, and we
+ * need a softring (OTH) for the rest.
+ *
+ * ToDo: The 3 softrings for protocol are needed only till we can
+ * get rid of DLS from the datapath, make IPv4 and IPv6 paths
+ * symmetric (deal with mac_header_info for v6 and polling for
+ * IPv4 TCP - ip_accept_tcp is IPv4 specific although squeues
+ * are generic), and bring SAP based classification to the MAC layer.
+ *
+ * H/W and S/W based fanout and multiple Rx rings per Link
+ * -------------------------------------------------------
+ *
+ * In case fanout is requested (or determined automatically based
+ * on Link speed and processor speed), we try to assign multiple
+ * Rx rings per Link with their respective SRSes. In this case
+ * the NIC should be capable of fanning out incoming packets between
+ * the assigned Rx rings (H/W based fanout). All the SRSes
+ * individually switch their Rx ring between interrupt and polling
+ * mode but share a common B/W control counter in case Link
+ * level B/W is specified.
+ *
+ * If S/W based fanout is specified in lieu of H/W based fanout,
+ * the Link SRS creates the specified number of softrings for
+ * each protocol (TCP, UDP, OTH).
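+ * For example, a S/W fanout count of 4 yields 4 TCP, 4 UDP and
+ * 4 OTH softrings under the one Link SRS.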
+ * Incoming packets are fanned out to the correct softring based
+ * on their protocol and a protocol specific hash function.
+ *
+ * Primary and non primary MAC clients
+ * -----------------------------------
+ *
+ * The NICs, VNICs, Vlans, and Aggrs are typically termed Links
+ * and are a Layer 2 construct.
+ *
+ * Primary NIC:
+ * The Link that owns the primary MAC address and typically
+ * is used as the data NIC in non virtualized cases. As such,
+ * H/W resources are preferentially given to the primary NIC. As
+ * far as code is concerned, there is no difference between the
+ * primary NIC and VNICs. They are all treated as Links.
+ * At the very first call to mac_unicast_add() we program the S/W
+ * classifier for the primary MAC address, get a soft ring set
+ * (and soft rings based on 'ip_soft_ring_cnt')
+ * and an Rx ring assigned for polling to get enabled.
+ * When IP gets plumbed and negotiates polling, we can
+ * let the squeue do the polling on the TCP softring.
+ *
+ * VNICs:
+ * Same as any other Link. As long as the H/W resource assignments
+ * are equal, the data path and setup for all Links is the same.
+ *
+ * Flows:
+ * Can be configured on Links. They have their own SRS and the
+ * S/W classifier is programmed appropriately based on the flow.
+ * The flows typically deal with layer 3 and above and
+ * create a soft ring set specific to the flow. The receive
+ * side function is switched from mac_rx_srs_process to
+ * mac_rx_srs_subflow_process, which first tries to assign the
+ * packet to the appropriate flow SRS and, failing that, assigns it
+ * to the link SRS. This allows us to avoid the layered approach,
+ * which gets complex.
+ *
+ * By the time mac_datapath_setup() completes, we already have the
+ * soft ring sets, Rx rings, soft rings, etc. figured out and both H/W
+ * and S/W classifiers programmed. IP is not plumbed yet (and might
+ * never be, for the Virtual Machine guest OS path). When IP is plumbed
+ * (for both NIC and VNIC), we do a capability negotiation for polling
+ * and upcall functions etc.
+ *
+ * Rx ring Assignment NOTES
+ * ------------------------
+ *
+ * For NICs which have only 1 Rx ring (we treat NICs with no Rx rings
+ * as a NIC with a single default ring), we assign the only ring to
+ * the primary Link as MAC_RX_HW_DEFAULT_RING. The primary Link SRS can
+ * do polling on it as long as it is the only link in use and we compare
+ * the MAC address for unicast packets before accepting an incoming
+ * packet (there is no need for S/W classification in this case). We
+ * disable polling on the only ring the moment a 2nd link gets created
+ * (the polling remains enabled even though there are broadcast and
+ * multicast flows created).
+ *
+ * If the NIC has more than 1 Rx ring, we assign the default ring (the
+ * 1st ring) to deal with broadcast, multicast and traffic for other
+ * NICs which needs S/W classification. We assign the primary MAC
+ * address to another ring by specifying a classification rule for
+ * the primary unicast MAC address to the selected ring. The primary
+ * Link (and its SRS) can continue to poll the assigned Rx ring at all
+ * times independently.
+ *
+ * Right now we just assign MAC_RX_HW_DEFAULT_RING to note that it is
+ * the primary NIC and later we will check to see how many Rx rings we
+ * have and whether we can get a non default Rx ring for the primary MAC.
+ *
+ * Note: In the future, if no fanout is specified, we try to assign 2 Rx
+ * rings for the primary Link with the primary MAC address + TCP going
+ * to one ring and primary MAC address + UDP|SCTP going to the other ring.
+ * Any remaining traffic for the primary MAC address can go to the default
+ * Rx ring and get S/W classified. This way the respective SRSes don't
+ * need to do proto fanout and don't need to have softrings at all and
+ * can poll their respective Rx rings.
+ *
+ * As an optimization, when a new NIC or VNIC is created, we can get
+ * only one Rx ring and make it a TCP specific Rx ring and use the
+ * H/W default Rx ring for the rest (this Rx ring is never polled).
+ */
+int
+mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t link_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_group_t *group = NULL;
+ mac_group_t *default_group;
+ int err;
+ uint8_t *mac_addr;
+ mac_rx_group_reserve_type_t rtype = MAC_RX_RESERVE_NONDEFAULT;
+ mac_group_state_t next_state;
+ mac_client_impl_t *group_only_mcip;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ switch (link_type) {
+ case SRST_FLOW:
+ mac_srs_group_setup(mcip, flent, NULL, link_type);
+ return (0);
+
+ case SRST_LINK:
+ mac_addr = flent->fe_flow_desc.fd_dst_mac;
+
+ /* Check if we need to reserve the default group */
+ if (flent->fe_type & FLOW_PRIMARY_MAC)
+ rtype = MAC_RX_RESERVE_DEFAULT;
+
+ if (!mcip->mci_no_hwrings) {
+ /*
+ * Check to see if we can get an exclusive group for
+ * this mac address or if there already exists a
+ * group that has this mac address (case of VLANs).
+ * If no groups are available, use the default group.
+ */
+ group = mac_reserve_rx_group(mcip, mac_addr, rtype);
+ }
+
+ if (group == NULL) {
+ if (mcip->mci_req_hwrings)
+ return (ENOSPC);
+ group = &mip->mi_rx_groups[0];
+ }
+
+ /*
+ * Some NICs don't support any Rx rings, so there may not
+ * even be a default group.
+ */
+ if (group != NULL) {
+ flent->fe_rx_ring_group = group;
+ /*
+ * Add the client to the group. This could cause
+ * either this group to move to the shared state or
+ * cause the default group to move to the shared state.
+ * The actions on this group are done here, while the
+ * actions on the default group are postponed to
+ * the end of this function.
+ */
+ mac_rx_group_add_client(group, mcip);
+ next_state = mac_rx_group_next_state(group,
+ &group_only_mcip);
+
+ ASSERT((next_state == MAC_GROUP_STATE_RESERVED &&
+ mcip == group_only_mcip) ||
+ (next_state == MAC_GROUP_STATE_SHARED &&
+ group_only_mcip == NULL));
+
+ mac_set_rx_group_state(group, next_state);
+ }
+
+ /*
+ * Setup the Rx and Tx SRSes. If we got a pristine group
+ * exclusively above, mac_srs_group_setup would simply create
+ * the required SRSes. If we ended up sharing a previously
+ * reserved group, mac_srs_group_setup would also dismantle the
+ * SRSes of the previously exclusive group.
+ */
+ mac_srs_group_setup(mcip, flent, group, link_type);
+
+ /* Program the S/W Classifier */
+ if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0)
+ goto setup_failed;
+
+ /* Program the H/W Classifier */
+ if ((err = mac_add_macaddr(mip, group, mac_addr)) != 0)
+ goto setup_failed;
+ mcip->mci_unicast = mac_find_macaddr(mip, mac_addr);
+ ASSERT(mcip->mci_unicast != NULL);
+ break;
+
+ default:
+ ASSERT(B_FALSE);
+ break;
+ }
+
+ /*
+ * All broadcast and multicast traffic is received only on the default
+ * group.
+ * If we have set up the datapath for a non-default group above,
+ * then move the default group to shared state to allow distribution of
+ * incoming broadcast traffic to the other groups and dismantle the
+ * SRSes over the default group.
+ */
+ if (group != NULL) {
+ if (group != mip->mi_rx_groups) {
+ default_group = mip->mi_rx_groups;
+ if (default_group->mrg_state ==
+ MAC_GROUP_STATE_RESERVED) {
+ group_only_mcip = MAC_RX_GROUP_ONLY_CLIENT(
+ default_group);
+ ASSERT(group_only_mcip != NULL &&
+ mip->mi_nactiveclients > 1);
+
+ mac_set_rx_group_state(default_group,
+ MAC_GROUP_STATE_SHARED);
+ mac_srs_group_setup(group_only_mcip,
+ group_only_mcip->mci_flent,
+ default_group, SRST_LINK);
+ }
+ ASSERT(default_group->mrg_state ==
+ MAC_GROUP_STATE_SHARED);
+ }
+ /*
+ * If we get an exclusive group for a VLAN MAC client we
+ * need to take the s/w path to make the additional check for
+ * the vid. Disable polling and set it to s/w classification.
+ */
+ if (group->mrg_state == MAC_GROUP_STATE_RESERVED &&
+ i_mac_flow_vid(mcip->mci_flent) != VLAN_ID_NONE) {
+ mac_rx_switch_grp_to_sw(group);
+ }
+ }
+ return (0);
+
+setup_failed:
+ mac_datapath_teardown(mcip, flent, link_type);
+ return (err);
+}
+
+void
+mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent,
+ uint32_t link_type)
+{
+ mac_impl_t *mip = mcip->mci_mip;
+ mac_group_t *group = NULL;
+ mac_client_impl_t *grp_only_mcip;
+ flow_entry_t *group_only_flent;
+ mac_group_t *default_group;
+ boolean_t check_default_group = B_FALSE;
+ mac_group_state_t next_state;
+
+ ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+ switch (link_type) {
+ case SRST_FLOW:
+ mac_srs_group_teardown(mcip, flent, SRST_FLOW);
+ return;
+
+ case SRST_LINK:
+ /* Stop sending packets */
+ mac_tx_client_block(mcip);
+
+ /* Stop the packets coming from the H/W */
+ if (mcip->mci_unicast != NULL) {
+ int err;
+ err = mac_remove_macaddr(mcip->mci_unicast);
+ if (err != 0) {
+ cmn_err(CE_WARN, "%s: failed to remove a MAC"
+ " address because of error 0x%x",
+ mip->mi_name, err);
+ }
+ mcip->mci_unicast = NULL;
+ }
+
+ /* Stop the packets coming from the S/W classifier */
+ mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE);
+ mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+
+ /* Now quiesce and destroy all SRS and soft rings */
+ mac_srs_group_teardown(mcip, flent, SRST_LINK);
+ ASSERT((mcip->mci_flent == flent) &&
+ (flent->fe_next == NULL));
+
+ /*
+ * Release our hold on the group as well. We need
+ * to check if the shared group has only one client
+ * left who can use it exclusively. Also, if we
+ * were the last client, release the group.
+ */
+ group = flent->fe_rx_ring_group;
+ if (group != NULL) {
+ mac_rx_group_remove_client(group, mcip);
+ next_state = mac_rx_group_next_state(group,
+ &grp_only_mcip);
+ if (next_state == MAC_GROUP_STATE_RESERVED) {
+ /*
+ * Only one client left on this RX group.
+ */
+ ASSERT(grp_only_mcip != NULL);
+ mac_set_rx_group_state(group,
+ MAC_GROUP_STATE_RESERVED);
+ group_only_flent = grp_only_mcip->mci_flent;
+
+ /*
+ * The only remaining client has exclusive
+ * access on the group. Allow it to
+ * dynamically poll the H/W rings etc.
+ */
+ mac_srs_group_setup(grp_only_mcip,
+ group_only_flent, group, SRST_LINK);
+ mac_rx_group_unmark(group, MR_INCIPIENT);
+ } else if (next_state == MAC_GROUP_STATE_REGISTERED) {
+ /*
+ * This is a non-default group being freed up.
+ * We need to reevaluate the default group
+ * to see if the primary client can get
+ * exclusive access to the default group.
+				 */
+				ASSERT(group != mip->mi_rx_groups);
+				mac_release_rx_group(mcip, group);
+				mac_set_rx_group_state(group,
+				    MAC_GROUP_STATE_REGISTERED);
+				check_default_group = B_TRUE;
+			} else {
+				ASSERT(next_state == MAC_GROUP_STATE_SHARED);
+				mac_set_rx_group_state(group,
+				    MAC_GROUP_STATE_SHARED);
+				mac_rx_group_unmark(group, MR_CONDEMNED);
+			}
+			flent->fe_rx_ring_group = NULL;
+		}
+		break;
+	default:
+		ASSERT(B_FALSE);
+		break;
+	}
+
+	/*
+	 * The mac client using the default group gets exclusive access to the
+	 * default group if and only if it is the sole client on the entire
+	 * mip. If so, set the group state to reserved, and set up the SRSes
+	 * over the default group.
+	 */
+	if (check_default_group) {
+		default_group = mip->mi_rx_groups;
+		ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED);
+		next_state = mac_rx_group_next_state(default_group,
+		    &grp_only_mcip);
+		if (next_state == MAC_GROUP_STATE_RESERVED) {
+			ASSERT(grp_only_mcip != NULL &&
+			    mip->mi_nactiveclients == 1);
+			mac_set_rx_group_state(default_group,
+			    MAC_GROUP_STATE_RESERVED);
+			mac_srs_group_setup(grp_only_mcip,
+			    grp_only_mcip->mci_flent,
+			    default_group, SRST_LINK);
+		}
+	}
+}
+
+/* DATAPATH TEAR DOWN ROUTINES (SRS and FANOUT teardown) */
+
+static void
+mac_srs_fanout_list_free(mac_soft_ring_set_t *mac_srs)
+{
+	ASSERT(mac_srs->srs_tcp_soft_rings != NULL);
+	kmem_free(mac_srs->srs_tcp_soft_rings,
+	    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
+	mac_srs->srs_tcp_soft_rings = NULL;
+	ASSERT(mac_srs->srs_udp_soft_rings != NULL);
+	kmem_free(mac_srs->srs_udp_soft_rings,
+	    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
+	mac_srs->srs_udp_soft_rings = NULL;
+	ASSERT(mac_srs->srs_oth_soft_rings != NULL);
+	kmem_free(mac_srs->srs_oth_soft_rings,
+	    sizeof (mac_soft_ring_t *) * MAX_SR_FANOUT);
+	mac_srs->srs_oth_soft_rings = NULL;
+}
+
+/*
+ * An RX SRS is attached to at most one mac_ring.
+ * A TX SRS has no rings.
+ */
+static void
+mac_srs_ring_free(mac_soft_ring_set_t *mac_srs)
+{
+	mac_client_impl_t *mcip;
+	mac_ring_t *ring;
+	flow_entry_t *flent;
+
+	ring = mac_srs->srs_ring;
+	if (mac_srs->srs_type & SRST_TX) {
+		ASSERT(ring == NULL);
+		return;
+	}
+
+	if (ring == NULL)
+		return;
+
+	/*
+	 * Broadcast flows don't have a client impl association, but they
+	 * use only soft rings.
+	 */
+	flent = mac_srs->srs_flent;
+	mcip = flent->fe_mcip;
+	ASSERT(mcip != NULL);
+
+	ring->mr_classify_type = MAC_NO_CLASSIFIER;
+	ring->mr_srs = NULL;
+}
+
+/*
+ * Physical unlink and free of the data structures happen below. This is
+ * driven from mac_flow_destroy(), on the last refrele of a flow.
+ *
+ * Assumes an Rx SRS is 1-1 mapped with a ring.
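+ *
+ * By this point the SRS must already have gone through the quiesce path;
+ * the ASSERT below checks exactly that. A minimal sketch of the expected
+ * sequence (illustrative only, the waiting details are elided):
+ *
+ *	mac_srs_signal(srs, SRS_CONDEMNED);
+ *	... wait until SRS_CONDEMNED_DONE shows up in srs_state ...
+ *	mac_srs_free(srs);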
+ */ +void +mac_srs_free(mac_soft_ring_set_t *mac_srs) +{ + ASSERT(mac_srs->srs_mcip == NULL || + MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); + ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE | + SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE)); + + mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE); + mac_srs_ring_free(mac_srs); + mac_srs_soft_rings_free(mac_srs, B_TRUE); + mac_srs_fanout_list_free(mac_srs); + + mac_srs->srs_bw = NULL; + kmem_cache_free(mac_srs_cache, mac_srs); +} + +static void +mac_srs_soft_rings_quiesce(mac_soft_ring_set_t *mac_srs, uint_t s_ring_flag) +{ + mac_soft_ring_t *softring; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + + mac_srs_soft_rings_signal(mac_srs, s_ring_flag); + if (s_ring_flag == S_RING_CONDEMNED) { + while (mac_srs->srs_soft_ring_condemned_count != + mac_srs->srs_soft_ring_count) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + } else { + while (mac_srs->srs_soft_ring_quiesced_count != + mac_srs->srs_soft_ring_count) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + } + mutex_exit(&mac_srs->srs_lock); + + for (softring = mac_srs->srs_soft_ring_head; softring != NULL; + softring = softring->s_ring_next) + (void) untimeout(softring->s_ring_tid); + + (void) untimeout(mac_srs->srs_tid); + + mutex_enter(&mac_srs->srs_lock); +} + +/* + * The block comment above mac_rx_classify_flow_state_change explains the + * background. At this point upcalls from the driver (both hardware classified + * and software classified) have been cut off. We now need to quiesce the + * SRS worker, poll, and softring threads. The SRS worker thread serves as + * the master controller. The steps involved are described below in the function + */ +void +mac_srs_worker_quiesce(mac_soft_ring_set_t *mac_srs) +{ + uint_t s_ring_flag; + uint_t srs_poll_wait_flag; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + ASSERT(mac_srs->srs_state & (SRS_CONDEMNED | SRS_QUIESCE)); + + if (mac_srs->srs_state & SRS_CONDEMNED) { + s_ring_flag = S_RING_CONDEMNED; + srs_poll_wait_flag = SRS_POLL_THR_EXITED; + } else { + s_ring_flag = S_RING_QUIESCE; + srs_poll_wait_flag = SRS_POLL_THR_QUIESCED; + } + + /* + * In the case of Rx SRS wait till the poll thread is done. + */ + if ((mac_srs->srs_type & SRST_TX) == 0 && + mac_srs->srs_poll_thr != NULL) { + while (!(mac_srs->srs_state & srs_poll_wait_flag)) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + + /* + * Turn off polling as part of the quiesce operation. + */ + MAC_SRS_POLLING_OFF(mac_srs); + mac_srs->srs_state &= ~(SRS_POLLING | SRS_GET_PKTS); + } + + /* + * Then signal the soft ring worker threads to quiesce or quit + * as needed and then wait till that happens. + */ + mac_srs_soft_rings_quiesce(mac_srs, s_ring_flag); + + if (mac_srs->srs_state & SRS_CONDEMNED) + mac_srs->srs_state |= (SRS_QUIESCE_DONE | SRS_CONDEMNED_DONE); + else + mac_srs->srs_state |= SRS_QUIESCE_DONE; + cv_signal(&mac_srs->srs_quiesce_done_cv); +} + +/* + * Signal an SRS to start a temporary quiesce, or permanent removal, or restart + * a quiesced SRS by setting the appropriate flags and signaling the SRS worker + * or poll thread. This function is internal to the quiescing logic and is + * called internally from the SRS quiesce or flow quiesce or client quiesce + * higher level functions. 
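+ *
+ * As an illustrative sketch (not a verbatim caller), a temporary
+ * quiesce/restart cycle is driven as:
+ *
+ *	mac_srs_signal(srs, SRS_QUIESCE);
+ *	mutex_enter(&srs->srs_lock);
+ *	while (!(srs->srs_state & SRS_QUIESCE_DONE))
+ *		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
+ *	mutex_exit(&srs->srs_lock);
+ *	... the SRS is now quiet ...
+ *	mac_srs_signal(srs, SRS_RESTART);
+ *
+ * The worker thread acts on the flag via mac_srs_worker_quiesce() above
+ * and mac_srs_worker_restart() below.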
+ */
+void
+mac_srs_signal(mac_soft_ring_set_t *mac_srs, uint_t srs_flag)
+{
+	mac_ring_t *ring;
+
+	ring = mac_srs->srs_ring;
+	ASSERT(ring == NULL || ring->mr_refcnt == 0);
+
+	if (srs_flag == SRS_CONDEMNED) {
+		/*
+		 * The SRS is going away. We need to unbind the SRS and SR
+		 * threads before removing from the global SRS list. Otherwise
+		 * there is a small window where the cpu reconfig callbacks
+		 * may miss the SRS in the list walk and DR could fail since
+		 * there are still bound threads.
+		 */
+		mac_srs_threads_unbind(mac_srs);
+		mac_srs_remove_glist(mac_srs);
+	}
+	/*
+	 * Wakeup the SRS worker and poll threads.
+	 */
+	mutex_enter(&mac_srs->srs_lock);
+	mac_srs->srs_state |= srs_flag;
+	cv_signal(&mac_srs->srs_async);
+	cv_signal(&mac_srs->srs_cv);
+	mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * On the Rx side, the quiescing is done bottom up. After the Rx upcalls
+ * from the driver are done, the Rx SRS is quiesced and only then can
+ * we signal the soft rings. Thus this function can't be called arbitrarily
+ * without satisfying the prerequisites. On the Tx side, the threads from
+ * the top need to be quiesced, then the Tx SRS, and only then can we
+ * signal the Tx soft rings.
+ */
+static void
+mac_srs_soft_rings_signal(mac_soft_ring_set_t *mac_srs, uint_t sr_flag)
+{
+	mac_soft_ring_t *softring;
+
+	for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
+	    softring = softring->s_ring_next)
+		mac_soft_ring_signal(softring, sr_flag);
+}
+
+/*
+ * The block comment above mac_rx_classify_flow_state_change explains the
+ * background. At this point the SRS is quiesced and we need to restart the
+ * SRS worker, poll, and softring threads. The SRS worker thread serves as
+ * the master controller. The steps involved are described below in the
+ * function.
+ */
+void
+mac_srs_worker_restart(mac_soft_ring_set_t *mac_srs)
+{
+	boolean_t iam_rx_srs;
+	mac_soft_ring_t *softring;
+
+	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
+	if ((mac_srs->srs_type & SRST_TX) != 0) {
+		iam_rx_srs = B_FALSE;
+		ASSERT((mac_srs->srs_state &
+		    (SRS_POLL_THR_QUIESCED | SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
+		    (SRS_QUIESCE_DONE | SRS_QUIESCE));
+	} else {
+		iam_rx_srs = B_TRUE;
+		ASSERT((mac_srs->srs_state &
+		    (SRS_QUIESCE_DONE | SRS_QUIESCE)) ==
+		    (SRS_QUIESCE_DONE | SRS_QUIESCE));
+		if (mac_srs->srs_poll_thr != NULL) {
+			ASSERT((mac_srs->srs_state & SRS_POLL_THR_QUIESCED) ==
+			    SRS_POLL_THR_QUIESCED);
+		}
+	}
+
+	/*
+	 * Signal any quiesced soft ring workers to restart and wait for the
+	 * soft ring down count to come down to zero.
+	 */
+	if (mac_srs->srs_soft_ring_quiesced_count != 0) {
+		for (softring = mac_srs->srs_soft_ring_head; softring != NULL;
+		    softring = softring->s_ring_next) {
+			if (!(softring->s_ring_state & S_RING_QUIESCE))
+				continue;
+			mac_soft_ring_signal(softring, S_RING_RESTART);
+		}
+		while (mac_srs->srs_soft_ring_quiesced_count != 0)
+			cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
+	}
+
+	mac_srs->srs_state &= ~(SRS_QUIESCE_DONE | SRS_QUIESCE | SRS_RESTART);
+	if (iam_rx_srs && mac_srs->srs_poll_thr != NULL) {
+		/*
+		 * Signal the poll thread and ask it to restart. Wait till it
+		 * actually restarts and the SRS_POLL_THR_QUIESCED flag gets
+		 * cleared.
+ */ + mac_srs->srs_state |= SRS_POLL_THR_RESTART; + cv_signal(&mac_srs->srs_cv); + while (mac_srs->srs_state & SRS_POLL_THR_QUIESCED) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + ASSERT(!(mac_srs->srs_state & SRS_POLL_THR_RESTART)); + } + /* Wake up any waiter waiting for the restart to complete */ + mac_srs->srs_state |= SRS_RESTART_DONE; + cv_signal(&mac_srs->srs_quiesce_done_cv); +} + +static void +mac_srs_worker_unbind(mac_soft_ring_set_t *mac_srs) +{ + mutex_enter(&mac_srs->srs_lock); + if (!(mac_srs->srs_state & SRS_WORKER_BOUND)) { + ASSERT(mac_srs->srs_worker_cpuid == -1); + mutex_exit(&mac_srs->srs_lock); + return; + } + + mac_srs->srs_worker_cpuid = -1; + mac_srs->srs_state &= ~SRS_WORKER_BOUND; + thread_affinity_clear(mac_srs->srs_worker); + mutex_exit(&mac_srs->srs_lock); +} + +static void +mac_srs_poll_unbind(mac_soft_ring_set_t *mac_srs) +{ + mutex_enter(&mac_srs->srs_lock); + if (mac_srs->srs_poll_thr == NULL || + (mac_srs->srs_state & SRS_POLL_BOUND) == 0) { + ASSERT(mac_srs->srs_poll_cpuid == -1); + mutex_exit(&mac_srs->srs_lock); + return; + } + + mac_srs->srs_poll_cpuid = -1; + mac_srs->srs_state &= ~SRS_POLL_BOUND; + thread_affinity_clear(mac_srs->srs_poll_thr); + mutex_exit(&mac_srs->srs_lock); +} + +static void +mac_srs_threads_unbind(mac_soft_ring_set_t *mac_srs) +{ + mac_soft_ring_t *soft_ring; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mac_srs->srs_mcip->mci_mip)); + + mutex_enter(&cpu_lock); + mac_srs_worker_unbind(mac_srs); + if (!(mac_srs->srs_type & SRST_TX)) + mac_srs_poll_unbind(mac_srs); + + for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL; + soft_ring = soft_ring->s_ring_next) { + mac_soft_ring_unbind(soft_ring); + } + mutex_exit(&cpu_lock); +} + +/* + * When a CPU is going away, unbind all MAC threads which are bound + * to that CPU. The affinity of the thread to the CPU is saved to allow + * the thread to be rebound to the CPU if it comes back online. + */ +static void +mac_walk_srs_and_unbind(int cpuid) +{ + mac_soft_ring_set_t *mac_srs; + mac_soft_ring_t *soft_ring; + + rw_enter(&mac_srs_g_lock, RW_READER); + + if ((mac_srs = mac_srs_g_list) == NULL) + goto done; + + for (; mac_srs != NULL; mac_srs = mac_srs->srs_next) { + if (mac_srs->srs_worker_cpuid == cpuid) { + mac_srs->srs_worker_cpuid_save = cpuid; + mac_srs_worker_unbind(mac_srs); + } + + if (!(mac_srs->srs_type & SRST_TX)) { + if (mac_srs->srs_poll_cpuid == cpuid) { + mac_srs->srs_poll_cpuid_save = cpuid; + mac_srs_poll_unbind(mac_srs); + } + } + + /* Next tackle the soft rings associated with the srs */ + mutex_enter(&mac_srs->srs_lock); + for (soft_ring = mac_srs->srs_soft_ring_head; soft_ring != NULL; + soft_ring = soft_ring->s_ring_next) { + if (soft_ring->s_ring_cpuid == cpuid) { + soft_ring->s_ring_cpuid_save = cpuid; + mac_soft_ring_unbind(soft_ring); + } + } + mutex_exit(&mac_srs->srs_lock); + } +done: + rw_exit(&mac_srs_g_lock); +} + +/* TX SETUP and TEARDOWN ROUTINES */ + +/* + * XXXHIO need to make sure the two mac_tx_srs_{add,del}_ring() + * handle the case where the number of rings is one. I.e. there is + * a ring pointed to by mac_srs->srs_tx_arg2. 
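+ *
+ * The caller is expected to quiesce the Tx SRS around these calls; note
+ * the ASSERT on SRS_QUIESCE in mac_tx_srs_add_ring(). A hedged sketch of
+ * the intended sequence when, say, an aggregation gains a ring:
+ *
+ *	mac_srs_signal(tx_srs, SRS_QUIESCE);
+ *	... wait for the quiesce to complete ...
+ *	mac_tx_srs_add_ring(tx_srs, new_ring);
+ *	mac_srs_signal(tx_srs, SRS_RESTART);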
+ */ +void +mac_tx_srs_add_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring) +{ + mac_client_impl_t *mcip = mac_srs->srs_mcip; + mac_soft_ring_t *soft_ring; + int count = mac_srs->srs_oth_ring_count; + + ASSERT(mac_srs->srs_state & SRS_QUIESCE); + soft_ring = mac_soft_ring_create(count, 0, NULL, + (ST_RING_OTH | ST_RING_TX), maxclsyspri, mcip, mac_srs, -1, + NULL, mcip, (mac_resource_handle_t)tx_ring); + mac_srs->srs_oth_ring_count++; + /* + * put this soft ring in quiesce mode too so when we restart + * all soft rings in the srs are in the same state. + */ + mac_soft_ring_signal(soft_ring, S_RING_QUIESCE); +} + +static void +mac_soft_ring_remove(mac_soft_ring_set_t *mac_srs, mac_soft_ring_t *softring) +{ + int sringcnt; + + mutex_enter(&mac_srs->srs_lock); + sringcnt = mac_srs->srs_soft_ring_count; + ASSERT(sringcnt > 0); + mac_soft_ring_signal(softring, S_RING_CONDEMNED); + + ASSERT(mac_srs->srs_soft_ring_condemned_count == 0); + while (mac_srs->srs_soft_ring_condemned_count != 1) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + + if (softring == mac_srs->srs_soft_ring_head) { + mac_srs->srs_soft_ring_head = softring->s_ring_next; + if (mac_srs->srs_soft_ring_head != NULL) { + mac_srs->srs_soft_ring_head->s_ring_prev = NULL; + } else { + mac_srs->srs_soft_ring_tail = NULL; + } + } else { + softring->s_ring_prev->s_ring_next = + softring->s_ring_next; + if (softring->s_ring_next != NULL) { + softring->s_ring_next->s_ring_prev = + softring->s_ring_prev; + } else { + mac_srs->srs_soft_ring_tail = + softring->s_ring_prev; + } + } + mac_srs->srs_soft_ring_count--; + + mac_srs->srs_soft_ring_condemned_count--; + mutex_exit(&mac_srs->srs_lock); + + mac_soft_ring_free(softring, B_FALSE); +} + +void +mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring) +{ + int i; + mac_soft_ring_t *soft_ring, *remove_sring; + + mutex_enter(&mac_srs->srs_lock); + for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { + soft_ring = mac_srs->srs_oth_soft_rings[i]; + if (soft_ring->s_ring_tx_arg2 == tx_ring) + break; + } + mutex_exit(&mac_srs->srs_lock); + ASSERT(i < mac_srs->srs_oth_ring_count); + remove_sring = soft_ring; + mac_soft_ring_remove(mac_srs, remove_sring); + mac_srs_update_fanout_list(mac_srs); +} + +/* + * mac_tx_srs_setup(): + * + * Used to setup Tx rings. If no free Tx ring is available, then default + * Tx ring is used. + */ +void +mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent, + uint32_t srs_type) +{ + mac_impl_t *mip = mcip->mci_mip; + mac_soft_ring_set_t *tx_srs; + int i, tx_ring_count = 0, tx_rings_reserved; + mac_ring_handle_t *tx_ring = NULL; + uint32_t soft_ring_type; + mac_group_t *grp = NULL; + mac_ring_t *ring; + mac_srs_tx_t *tx; + boolean_t serialize = B_FALSE; + + tx_srs = flent->fe_tx_srs; + tx = &tx_srs->srs_tx; + + if (tx->st_group != NULL) { + grp = tx->st_group; + tx_ring_count = grp->mrg_cur_count; + } else { + tx_ring_count = mac_tx_ring_count; + } + + if (tx_ring_count != 0) { + tx_ring = kmem_zalloc(sizeof (mac_ring_handle_t) * + tx_ring_count, KM_SLEEP); + } + + /* + * Just use the default ring for now. We need to use + * the underlying link's ring set instead of the underlying + * NIC's. + */ + if (srs_type == SRST_FLOW || mcip->mci_no_hwrings) + goto use_default_ring; + + if (mcip->mci_share != NULL) + ring = grp->mrg_rings; + /* + * An attempt is made to reserve 'tx_ring_count' number + * of Tx rings. If tx_ring_count is 0, default Tx ring + * is used. If it is 1, an attempt is made to reserve one + * Tx ring. 
In both cases, the ring information is
+	 * stored in the Tx SRS. If multiple Tx rings are specified,
+	 * then each Tx ring will have a Tx-side soft ring. All
+	 * these soft rings will hang off the Tx SRS.
+	 */
+	for (i = 0, tx_rings_reserved = 0;
+	    i < tx_ring_count; i++, tx_rings_reserved++) {
+		if (mcip->mci_share != NULL) {
+			/*
+			 * The ring was already chosen and associated
+			 * with the TX group. Save it in the new
+			 * array to keep as much of the code below common
+			 * between the share and non-share cases.
+			 */
+			ASSERT(ring != NULL);
+			tx_ring[i] = (mac_ring_handle_t)ring;
+			ring = ring->mr_next;
+		} else {
+			tx_ring[i] =
+			    (mac_ring_handle_t)mac_reserve_tx_ring(mip, NULL);
+			if (tx_ring[i] == NULL)
+				break;
+		}
+	}
+	if (mac_tx_serialize || (mip->mi_v12n_level & MAC_VIRT_SERIALIZE))
+		serialize = B_TRUE;
+	/*
+	 * Did we get the requested number of tx rings?
+	 * There are 3 actions we can take depending upon the number
+	 * of tx_rings we got.
+	 * 1) If we got none, then hook up the tx_srs with the
+	 * default ring.
+	 * 2) If we got one, then get the tx_ring from the soft ring,
+	 * save it in SRS and free up the soft ring.
+	 * 3) If we got more than 1, then do the tx fanout among the
+	 * rings we obtained.
+	 */
+	switch (tx_rings_reserved) {
+	case 1:
+		/*
+		 * No need to allocate Tx soft rings. Tx-side soft
+		 * rings are for the Tx fanout case. Just use the Tx SRS.
+		 */
+		/* FALLTHRU */
+
+	case 0:
+use_default_ring:
+		if (tx_rings_reserved == 0)
+			tx->st_arg2 = (void *)mip->mi_default_tx_ring;
+		else
+			tx->st_arg2 = (void *)tx_ring[0];
+		/* For ring_count of 0 or 1, set the tx_mode and return */
+		if (tx_srs->srs_type & SRST_BW_CONTROL)
+			tx->st_mode = SRS_TX_BW;
+		else if (serialize)
+			tx->st_mode = SRS_TX_SERIALIZE;
+		else
+			tx->st_mode = SRS_TX_DEFAULT;
+		break;
+
+	default:
+		/*
+		 * We got multiple Tx rings for Tx fanout.
+		 *
+		 * cpuid of -1 is passed. This creates an unbound
+		 * worker thread. Instead the code should get CPU
+		 * binding information and pass that to
+		 * mac_soft_ring_create(). This needs to be done
+		 * in conjunction with Rx-side soft ring
+		 * bindings.
+		 */
+		soft_ring_type = ST_RING_OTH | ST_RING_TX;
+		if (tx_srs->srs_type & SRST_BW_CONTROL) {
+			tx->st_mode = SRS_TX_BW_FANOUT;
+		} else {
+			tx->st_mode = SRS_TX_FANOUT;
+			if (serialize)
+				soft_ring_type |= ST_RING_WORKER_ONLY;
+		}
+		for (i = 0; i < tx_rings_reserved; i++) {
+			(void) mac_soft_ring_create(i, 0, NULL, soft_ring_type,
+			    maxclsyspri, mcip, tx_srs, -1, NULL, mcip,
+			    (mac_resource_handle_t)tx_ring[i]);
+		}
+		mac_srs_update_fanout_list(tx_srs);
+	}
+	tx->st_func = mac_tx_get_func(tx->st_mode);
+
+	DTRACE_PROBE3(tx__srs___setup__return, mac_soft_ring_set_t *, tx_srs,
+	    int, tx->st_mode, int, tx_srs->srs_oth_ring_count);
+
+	if (tx_ring_count != 0) {
+		kmem_free(tx_ring,
+		    sizeof (mac_ring_handle_t) * tx_ring_count);
+	}
+}
+
+/*
+ * Walk through the list of mac clients for the MAC.
+ * For each active mac client, recompute the number of soft rings
+ * associated with every client, only if the current speed is different
+ * from the speed that was previously used for soft ring computation.
+ * If the cable is disconnected while the NIC is started, we would get
+ * notification with speed set to 0. We do not recompute in that case.
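+ *
+ * A hedged sketch of the expected trigger (hypothetical caller): a link
+ * state/speed notification handler would do
+ *
+ *	if (!(mip->mi_state_flags & MIS_IS_VNIC))
+ *		mac_fanout_recompute(mip);
+ *
+ * and the link-down / zero-speed cases are then filtered out below.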
+ */ +void +mac_fanout_recompute(mac_impl_t *mip) +{ + mac_client_impl_t *mcip; + uint64_t ifspeed; + mac_resource_props_t *mcip_mrp; + + i_mac_perim_enter(mip); + ASSERT(!(mip->mi_state_flags & MIS_IS_VNIC)); + + if (mip->mi_linkstate != LINK_STATE_UP) { + i_mac_perim_exit(mip); + return; + } + + for (mcip = mip->mi_clients_list; mcip != NULL; + mcip = mcip->mci_client_next) { + if (!MCIP_DATAPATH_SETUP(mcip)) + continue; + + ifspeed = mac_client_stat_get(mcip->mci_flent->fe_mcip, + MAC_STAT_IFSPEED); + if ((ifspeed != 0) && + (ifspeed != mcip->mci_flent->fe_nic_speed)) { + mcip_mrp = MCIP_RESOURCE_PROPS(mcip); + mac_fanout_setup(mcip, mcip->mci_flent, + mcip_mrp, mac_rx_deliver, mcip, NULL); + } + } + i_mac_perim_exit(mip); +} diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c new file mode 100644 index 0000000000..f4c2113f61 --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_flow.c @@ -0,0 +1,2373 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/strsun.h> +#include <sys/sdt.h> +#include <sys/mac.h> +#include <sys/mac_impl.h> +#include <sys/mac_client_impl.h> +#include <sys/dls.h> +#include <sys/dls_impl.h> +#include <sys/mac_soft_ring.h> +#include <sys/ethernet.h> +#include <sys/vlan.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <netinet/sctp.h> + +/* global flow table, will be a per exclusive-zone table later */ +static mod_hash_t *flow_hash; +static krwlock_t flow_tab_lock; + +static kmem_cache_t *flow_cache; +static kmem_cache_t *flow_tab_cache; +static flow_ops_t flow_l2_ops; + +typedef struct { + const char *fs_name; + uint_t fs_offset; +} flow_stats_info_t; + +#define FS_OFF(f) (offsetof(flow_stats_t, f)) +static flow_stats_info_t flow_stats_list[] = { + {"rbytes", FS_OFF(fs_rbytes)}, + {"ipackets", FS_OFF(fs_ipackets)}, + {"ierrors", FS_OFF(fs_ierrors)}, + {"obytes", FS_OFF(fs_obytes)}, + {"opackets", FS_OFF(fs_opackets)}, + {"oerrors", FS_OFF(fs_oerrors)} +}; +#define FS_SIZE (sizeof (flow_stats_list) / sizeof (flow_stats_info_t)) + +/* + * Checks whether a flow mask is legal. 
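+ * Returns the flow_tab_info_t for a supported mask combination, or NULL
+ * for an unsupported one; mac_flow_add_subflow() below, for example,
+ * fails with EOPNOTSUPP when this lookup returns NULL.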
+ */ +static flow_tab_info_t *mac_flow_tab_info_get(flow_mask_t); + +static void +flow_stat_init(kstat_named_t *knp) +{ + int i; + + for (i = 0; i < FS_SIZE; i++, knp++) { + kstat_named_init(knp, flow_stats_list[i].fs_name, + KSTAT_DATA_UINT64); + } +} + +static int +flow_stat_update(kstat_t *ksp, int rw) +{ + flow_entry_t *fep = ksp->ks_private; + flow_stats_t *fsp = &fep->fe_flowstats; + kstat_named_t *knp = ksp->ks_data; + uint64_t *statp; + zoneid_t zid; + int i; + + if (rw != KSTAT_READ) + return (EACCES); + + zid = getzoneid(); + if (zid != GLOBAL_ZONEID && zid != fep->fe_zoneid) { + for (i = 0; i < FS_SIZE; i++, knp++) + knp->value.ui64 = 0; + + return (0); + } + + for (i = 0; i < FS_SIZE; i++, knp++) { + statp = (uint64_t *) + ((uchar_t *)fsp + flow_stats_list[i].fs_offset); + + knp->value.ui64 = *statp; + } + return (0); +} + +static void +flow_stat_create(flow_entry_t *fep) +{ + kstat_t *ksp; + kstat_named_t *knp; + uint_t nstats = FS_SIZE; + + ksp = kstat_create("unix", 0, (char *)fep->fe_flow_name, "flow", + KSTAT_TYPE_NAMED, nstats, 0); + if (ksp == NULL) + return; + + ksp->ks_update = flow_stat_update; + ksp->ks_private = fep; + fep->fe_ksp = ksp; + + knp = (kstat_named_t *)ksp->ks_data; + flow_stat_init(knp); + kstat_install(ksp); +} + +void +flow_stat_destroy(flow_entry_t *fep) +{ + if (fep->fe_ksp != NULL) { + kstat_delete(fep->fe_ksp); + fep->fe_ksp = NULL; + } +} + +/* + * Initialize the flow table + */ +void +mac_flow_init() +{ + flow_cache = kmem_cache_create("flow_entry_cache", + sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + flow_tab_cache = kmem_cache_create("flow_tab_cache", + sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + flow_hash = mod_hash_create_extended("flow_hash", + 100, mod_hash_null_keydtor, mod_hash_null_valdtor, + mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); + rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL); +} + +/* + * Cleanup and release the flow table + */ +void +mac_flow_fini() +{ + kmem_cache_destroy(flow_cache); + kmem_cache_destroy(flow_tab_cache); + mod_hash_destroy_hash(flow_hash); + rw_destroy(&flow_tab_lock); +} + +/* + * mac_create_flow(): create a flow_entry_t. + */ +int +mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, + void *client_cookie, uint_t type, flow_entry_t **flentp) +{ + flow_entry_t *flent = *flentp; + int err = 0; + + if (mrp != NULL) { + err = mac_validate_props(mrp); + if (err != 0) + return (err); + } + + if (flent == NULL) { + flent = kmem_cache_alloc(flow_cache, KM_SLEEP); + bzero(flent, sizeof (*flent)); + mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); + + /* Initialize the receiver function to a safe routine */ + flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_index = -1; + } + (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME); + + /* This is an initial flow, will be configured later */ + if (fd == NULL) { + *flentp = flent; + return (0); + } + + flent->fe_client_cookie = client_cookie; + flent->fe_type = type; + + /* + * As flow creation is only allowed in global zone, this will + * always set fe_zoneid to GLOBAL_ZONEID, and dls_add_flow() will + * later set the right value. + */ + flent->fe_zoneid = getzoneid(); + + /* Save flow desc */ + bcopy(fd, &flent->fe_flow_desc, sizeof (*fd)); + + if (mrp != NULL) { + /* + * We have already set fe_resource_props for a Link. 
+ */ + if (type & FLOW_USER) { + bcopy(mrp, &flent->fe_resource_props, + sizeof (mac_resource_props_t)); + } + /* + * The effective resource list should reflect the priority + * that we set implicitly. + */ + if (!(mrp->mrp_mask & MRP_PRIORITY)) + mrp->mrp_mask |= MRP_PRIORITY; + if (type & FLOW_USER) + mrp->mrp_priority = MPL_SUBFLOW_DEFAULT; + else + mrp->mrp_priority = MPL_LINK_DEFAULT; + bcopy(mrp, &flent->fe_effective_props, + sizeof (mac_resource_props_t)); + } + flow_stat_create(flent); + + *flentp = flent; + return (0); +} + +/* + * Validate flow entry and add it to a flow table. + */ +int +mac_flow_add(flow_tab_t *ft, flow_entry_t *flent) +{ + flow_entry_t **headp, **p; + flow_ops_t *ops = &ft->ft_ops; + flow_mask_t mask; + uint32_t index; + int err; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + /* + * Check for invalid bits in mask. + */ + mask = flent->fe_flow_desc.fd_mask; + if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0) + return (EOPNOTSUPP); + + /* + * Validate flent. + */ + if ((err = ops->fo_accept_fe(ft, flent)) != 0) { + DTRACE_PROBE3(accept_failed, flow_tab_t *, ft, + flow_entry_t *, flent, int, err); + return (err); + } + + /* + * Flent is valid. now calculate hash and insert it + * into hash table. + */ + index = ops->fo_hash_fe(ft, flent); + + /* + * We do not need a lock up until now because we were + * not accessing the flow table. + */ + rw_enter(&ft->ft_lock, RW_WRITER); + headp = &ft->ft_table[index]; + + /* + * Check for duplicate flow. + */ + for (p = headp; *p != NULL; p = &(*p)->fe_next) { + if ((*p)->fe_flow_desc.fd_mask != + flent->fe_flow_desc.fd_mask) + continue; + + if (ft->ft_ops.fo_match_fe(ft, *p, flent)) { + rw_exit(&ft->ft_lock); + DTRACE_PROBE3(dup_flow, flow_tab_t *, ft, + flow_entry_t *, flent, int, err); + return (EALREADY); + } + } + + /* + * Insert flow to hash list. + */ + err = ops->fo_insert_fe(ft, headp, flent); + if (err != 0) { + rw_exit(&ft->ft_lock); + DTRACE_PROBE3(insert_failed, flow_tab_t *, ft, + flow_entry_t *, flent, int, err); + return (err); + } + + /* + * Save the hash index so it can be used by mac_flow_remove(). + */ + flent->fe_index = (int)index; + + /* + * Save the flow tab back reference. + */ + flent->fe_flow_tab = ft; + FLOW_MARK(flent, FE_FLOW_TAB); + ft->ft_flow_count++; + rw_exit(&ft->ft_lock); + return (0); +} + +/* + * Remove a flow from a mac client's subflow table + */ +void +mac_flow_rem_subflow(flow_entry_t *flent) +{ + flow_tab_t *ft = flent->fe_flow_tab; + mac_client_impl_t *mcip = ft->ft_mcip; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + mac_flow_remove(ft, flent, B_FALSE); + if (flent->fe_mcip == NULL) { + /* + * The interface is not yet plumbed and mac_client_flow_add + * was not done. + */ + if (FLOW_TAB_EMPTY(ft)) { + mac_flow_tab_destroy(ft); + mcip->mci_subflow_tab = NULL; + } + return; + } + mac_flow_wait(flent, FLOW_DRIVER_UPCALL); + mac_link_flow_clean((mac_client_handle_t)mcip, flent); +} + +/* + * Add a flow to a mac client's subflow table and instantiate the flow + * in the mac by creating the associated SRSs etc. 
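+ *
+ * A minimal sketch of the creation path (hypothetical caller; compare
+ * mac_link_flow_add() below for the complete sequence):
+ *
+ *	flow_entry_t *flent = NULL;
+ *
+ *	err = mac_flow_create(&fd, mrp, "myflow", NULL,
+ *	    FLOW_USER | FLOW_OTHER, &flent);
+ *	if (err == 0)
+ *		err = mac_flow_add_subflow(mch, flent, B_TRUE);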
+ */ +int +mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent, + boolean_t instantiate_flow) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_tab_info_t *ftinfo; + flow_mask_t mask; + flow_tab_t *ft; + int err; + boolean_t ft_created = B_FALSE; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + + /* + * If the subflow table exists already just add the new subflow + * to the existing table, else we create a new subflow table below. + */ + ft = mcip->mci_subflow_tab; + if (ft == NULL) { + mask = flent->fe_flow_desc.fd_mask; + /* + * Try to create a new table and then add the subflow to the + * newly created subflow table + */ + if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) + return (EOPNOTSUPP); + + mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size, + mcip->mci_mip, &ft); + ft_created = B_TRUE; + } + + err = mac_flow_add(ft, flent); + if (err != 0) { + if (ft_created) + mac_flow_tab_destroy(ft); + return (err); + } + + if (instantiate_flow) { + /* Now activate the flow by creating its SRSs */ + ASSERT(MCIP_DATAPATH_SETUP(mcip)); + err = mac_link_flow_init((mac_client_handle_t)mcip, flent); + if (err != 0) { + mac_flow_remove(ft, flent, B_FALSE); + if (ft_created) + mac_flow_tab_destroy(ft); + return (err); + } + } else { + FLOW_MARK(flent, FE_UF_NO_DATAPATH); + } + if (ft_created) { + ASSERT(mcip->mci_subflow_tab == NULL); + ft->ft_mcip = mcip; + mcip->mci_subflow_tab = ft; + if (instantiate_flow) + mac_client_update_classifier(mcip, B_TRUE); + } + return (0); +} + +/* + * Remove flow entry from flow table. + */ +void +mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp) +{ + flow_entry_t **fp; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + if (!(flent->fe_flags & FE_FLOW_TAB)) + return; + + rw_enter(&ft->ft_lock, RW_WRITER); + /* + * If this is a permanent removal from the flow table, mark it + * CONDEMNED to prevent future references. If this is a temporary + * removal from the table, say to update the flow descriptor then + * we don't mark it CONDEMNED + */ + if (!temp) + FLOW_MARK(flent, FE_CONDEMNED); + /* + * Locate the specified flent. + */ + fp = &ft->ft_table[flent->fe_index]; + while (*fp != flent) + fp = &(*fp)->fe_next; + + /* + * The flent must exist. Otherwise it's a bug. + */ + ASSERT(fp != NULL); + *fp = flent->fe_next; + flent->fe_next = NULL; + + /* + * Reset fe_index to -1 so any attempt to call mac_flow_remove() + * on a flent that is supposed to be in the table (FE_FLOW_TAB) + * will panic. + */ + flent->fe_index = -1; + FLOW_UNMARK(flent, FE_FLOW_TAB); + ft->ft_flow_count--; + rw_exit(&ft->ft_lock); +} + +/* + * This is the flow lookup routine used by the mac sw classifier engine. + */ +int +mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp) +{ + flow_state_t s; + flow_entry_t *flent; + flow_ops_t *ops = &ft->ft_ops; + boolean_t retried = B_FALSE; + int i, err; + + s.fs_flags = flags; + s.fs_mp = mp; +retry: + + /* + * Walk the list of predeclared accept functions. + * Each of these would accumulate enough state to allow the next + * accept routine to make progress. + */ + for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) { + if ((err = (ops->fo_accept[i])(ft, &s)) != 0) { + /* + * ENOBUFS indicates that the mp could be too short + * and may need a pullup. + */ + if (err != ENOBUFS || retried) + return (err); + + /* + * Don't modify the mblk if there are references to it. + * Also, there is no point pulling up if b_cont is NULL. 
+ */ + if (DB_REF(mp) > 1 || mp->b_cont == NULL || + pullupmsg(mp, -1) == 0) + return (EINVAL); + + retried = B_TRUE; + DTRACE_PROBE2(need_pullup, flow_tab_t *, ft, + flow_state_t *, &s); + goto retry; + } + } + + /* + * The packet is considered sane. We may now attempt to + * find the corresponding flent. + */ + rw_enter(&ft->ft_lock, RW_READER); + flent = ft->ft_table[ops->fo_hash(ft, &s)]; + for (; flent != NULL; flent = flent->fe_next) { + if (flent->fe_match(ft, flent, &s)) { + FLOW_TRY_REFHOLD(flent, err); + if (err != 0) + continue; + *flentp = flent; + rw_exit(&ft->ft_lock); + return (0); + } + } + rw_exit(&ft->ft_lock); + return (ENOENT); +} + +/* + * Walk flow table. + * The caller is assumed to have proper perimeter protection. + */ +int +mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), + void *arg) +{ + int err, i, cnt = 0; + flow_entry_t *flent; + + if (ft == NULL) + return (0); + + for (i = 0; i < ft->ft_size; i++) { + for (flent = ft->ft_table[i]; flent != NULL; + flent = flent->fe_next) { + cnt++; + err = (*fn)(flent, arg); + if (err != 0) + return (err); + } + } + VERIFY(cnt == ft->ft_flow_count); + return (0); +} + +/* + * Same as the above except a mutex is used for protection here. + */ +int +mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *), + void *arg) +{ + int err; + + if (ft == NULL) + return (0); + + rw_enter(&ft->ft_lock, RW_WRITER); + err = mac_flow_walk_nolock(ft, fn, arg); + rw_exit(&ft->ft_lock); + return (err); +} + +static boolean_t mac_flow_clean(flow_entry_t *); + +/* + * Destroy a flow entry. Called when the last reference on a flow is released. + */ +void +mac_flow_destroy(flow_entry_t *flent) +{ + ASSERT(flent->fe_refcnt == 0); + + if ((flent->fe_type & FLOW_USER) != 0) { + ASSERT(mac_flow_clean(flent)); + } else { + mac_flow_cleanup(flent); + } + + mutex_destroy(&flent->fe_lock); + cv_destroy(&flent->fe_cv); + flow_stat_destroy(flent); + kmem_cache_free(flow_cache, flent); +} + +/* + * XXX eric + * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and + * mac_link_flow_modify() should really be moved/reworked into the + * two functions below. This would consolidate all the mac property + * checking in one place. I'm leaving this alone for now since it's + * out of scope of the new flows work. + */ +/* ARGSUSED */ +uint32_t +mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp) +{ + uint32_t changed_mask = 0; + mac_resource_props_t *fmrp = &flent->fe_effective_props; + int i; + + if ((mrp->mrp_mask & MRP_MAXBW) != 0 && + (fmrp->mrp_maxbw != mrp->mrp_maxbw)) { + changed_mask |= MRP_MAXBW; + fmrp->mrp_maxbw = mrp->mrp_maxbw; + if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) { + fmrp->mrp_mask &= ~MRP_MAXBW; + } else { + fmrp->mrp_mask |= MRP_MAXBW; + } + } + + if ((mrp->mrp_mask & MRP_PRIORITY) != 0) { + if (fmrp->mrp_priority != mrp->mrp_priority) + changed_mask |= MRP_PRIORITY; + if (mrp->mrp_priority == MPL_RESET) { + fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT; + fmrp->mrp_mask &= ~MRP_PRIORITY; + } else { + fmrp->mrp_priority = mrp->mrp_priority; + fmrp->mrp_mask |= MRP_PRIORITY; + } + } + + /* modify fanout */ + if ((mrp->mrp_mask & MRP_CPUS) != 0) { + if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) && + (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) { + for (i = 0; i < mrp->mrp_ncpus; i++) { + if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i]) + break; + } + if (i == mrp->mrp_ncpus) { + /* + * The new set of cpus passed is exactly + * the same as the existing set. 
+ */ + return (changed_mask); + } + } + changed_mask |= MRP_CPUS; + MAC_COPY_CPUS(mrp, fmrp); + } + return (changed_mask); +} + +void +mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp) +{ + uint32_t changed_mask; + mac_client_impl_t *mcip = flent->fe_mcip; + mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip); + + ASSERT(flent != NULL); + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + rw_enter(&ft->ft_lock, RW_WRITER); + + /* Update the cached values inside the subflow entry */ + changed_mask = mac_flow_modify_props(flent, mrp); + rw_exit(&ft->ft_lock); + /* + * Push the changed parameters to the scheduling code in the + * SRS's, to take effect right away. + */ + if (changed_mask & MRP_MAXBW) { + mac_srs_update_bwlimit(flent, mrp); + /* + * If bandwidth is changed, we may have to change + * the number of soft ring to be used for fanout. + * Call mac_flow_update_fanout() if MAC_BIND_CPU + * is not set and there is no user supplied cpu + * info. This applies only to link at this time. + */ + if (!(flent->fe_type & FLOW_USER) && + !(changed_mask & MRP_CPUS) && + !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) { + mac_fanout_setup(mcip, flent, mcip_mrp, + mac_rx_deliver, mcip, NULL); + } + } + if (mrp->mrp_mask & MRP_PRIORITY) + mac_flow_update_priority(mcip, flent); + + if (changed_mask & MRP_CPUS) + mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL); +} + +/* + * This function waits for a certain condition to be met and is generally + * used before a destructive or quiescing operation. + */ +void +mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event) +{ + mutex_enter(&flent->fe_lock); + flent->fe_flags |= FE_WAITER; + + switch (event) { + case FLOW_DRIVER_UPCALL: + /* + * We want to make sure the driver upcalls have finished before + * we signal the Rx SRS worker to quit. + */ + while (flent->fe_refcnt != 1) + cv_wait(&flent->fe_cv, &flent->fe_lock); + break; + + case FLOW_USER_REF: + /* + * Wait for the fe_user_refcnt to drop to 0. The flow has + * been removed from the global flow hash. + */ + ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH)); + while (flent->fe_user_refcnt != 0) + cv_wait(&flent->fe_cv, &flent->fe_lock); + break; + + default: + ASSERT(0); + } + + flent->fe_flags &= ~FE_WAITER; + mutex_exit(&flent->fe_lock); +} + +static boolean_t +mac_flow_clean(flow_entry_t *flent) +{ + ASSERT(flent->fe_next == NULL); + ASSERT(flent->fe_tx_srs == NULL); + ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL); + ASSERT(flent->fe_mbg == NULL); + + return (B_TRUE); +} + +void +mac_flow_cleanup(flow_entry_t *flent) +{ + if ((flent->fe_type & FLOW_USER) == 0) { + ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) || + (flent->fe_mbg != NULL && flent->fe_mcip == NULL)); + ASSERT(flent->fe_refcnt == 0); + } else { + ASSERT(flent->fe_refcnt == 1); + } + + if (flent->fe_mbg != NULL) { + ASSERT(flent->fe_tx_srs == NULL); + /* This is a multicast or broadcast flow entry */ + mac_bcast_grp_free(flent->fe_mbg); + flent->fe_mbg = NULL; + } + + if (flent->fe_tx_srs != NULL) { + ASSERT(flent->fe_mbg == NULL); + mac_srs_free(flent->fe_tx_srs); + flent->fe_tx_srs = NULL; + } + + /* + * In the normal case fe_rx_srs_cnt is 1. However in the error case + * when mac_unicast_add fails we may not have set up any SRS + * in which case fe_rx_srs_cnt will be zero. 
+	 */
+	if (flent->fe_rx_srs_cnt != 0) {
+		ASSERT(flent->fe_rx_srs_cnt == 1);
+		mac_srs_free(flent->fe_rx_srs[0]);
+		flent->fe_rx_srs[0] = NULL;
+		flent->fe_rx_srs_cnt = 0;
+	}
+	ASSERT(flent->fe_rx_srs[0] == NULL);
+}
+
+void
+mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
+{
+	/*
+	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
+	 * Updates to the fe_flow_desc happen under the fe_lock
+	 * after removing the flent from the flow table.
+	 */
+	mutex_enter(&flent->fe_lock);
+	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
+	mutex_exit(&flent->fe_lock);
+}
+
+/*
+ * Update a field of a flow entry. The mac perimeter ensures that
+ * this is the only thread doing a modify operation on this mac end point.
+ * So the flow table can't change or disappear. The ft_lock protects access
+ * to the flow entry, and holding the lock ensures that there isn't any thread
+ * accessing the flow entry or attempting a flow table lookup. However,
+ * data threads that are using the flow entry based on the old descriptor
+ * will continue to use the flow entry. If strong coherence is required
+ * then the flow will have to be quiesced before the descriptor can be
+ * changed.
+ */
+void
+mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
+{
+	flow_tab_t *ft = flent->fe_flow_tab;
+	flow_desc_t old_desc;
+	int err;
+
+	if (ft == NULL) {
+		/*
+		 * The flow hasn't yet been inserted into the table,
+		 * so only the caller knows about this flow; however, for
+		 * uniformity we grab the fe_lock here.
+		 */
+		mutex_enter(&flent->fe_lock);
+		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
+		mutex_exit(&flent->fe_lock);
+		return;
+	}
+
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+
+	/*
+	 * Need to remove the flow entry from the table and reinsert it,
+	 * into a potentially different hash line. The hash depends on
+	 * the new descriptor fields. However, access to fe_flow_desc itself
+	 * is always under the fe_lock. This helps log and stat functions
+	 * see a self-consistent fe_flow_desc.
+	 */
+	mac_flow_remove(ft, flent, B_TRUE);
+	old_desc = flent->fe_flow_desc;
+
+	mutex_enter(&flent->fe_lock);
+	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
+	mutex_exit(&flent->fe_lock);
+
+	if (mac_flow_add(ft, flent) != 0) {
+		/*
+		 * The add failed, say due to an invalid flow descriptor.
+		 * Undo the update.
+		 */
+		flent->fe_flow_desc = old_desc;
+		err = mac_flow_add(ft, flent);
+		ASSERT(err == 0);
+	}
+}
+
+void
+mac_flow_set_name(flow_entry_t *flent, const char *name)
+{
+	flow_tab_t *ft = flent->fe_flow_tab;
+
+	if (ft == NULL) {
+		/*
+		 * The flow hasn't yet been inserted into the table,
+		 * so only the caller knows about this flow.
+		 */
+		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME);
+	} else {
+		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
+	}
+
+	mutex_enter(&flent->fe_lock);
+	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAME);
+	mutex_exit(&flent->fe_lock);
+}
+
+/*
+ * Return the client-private cookie that was associated with
+ * the flow when it was created.
+ */
+void *
+mac_flow_get_client_cookie(flow_entry_t *flent)
+{
+	return (flent->fe_client_cookie);
+}
+
+/*
+ * Forward declarations.
+ */
+static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *);
+static int flow_l2_accept(flow_tab_t *, flow_state_t *);
+static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *);
+static int flow_ether_accept(flow_tab_t *, flow_state_t *);
+
+/*
+ * Create flow table.
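+ * For example, the L2 table used by mac_flow_l2tab_create() below is
+ * built as:
+ *
+ *	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
+ *	    1024, mip, &ft);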
+ */ +void +mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size, + mac_impl_t *mip, flow_tab_t **ftp) +{ + flow_tab_t *ft; + flow_ops_t *new_ops; + + ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP); + bzero(ft, sizeof (*ft)); + + ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP); + + /* + * We make a copy of the ops vector instead of just pointing to it + * because we might want to customize the ops vector on a per table + * basis (e.g. for optimization). + */ + new_ops = &ft->ft_ops; + bcopy(ops, new_ops, sizeof (*ops)); + ft->ft_mask = mask; + ft->ft_size = size; + ft->ft_mip = mip; + + /* + * Optimization for DL_ETHER media. + */ + if (mip->mi_info.mi_nativemedia == DL_ETHER) { + if (new_ops->fo_hash == flow_l2_hash) + new_ops->fo_hash = flow_ether_hash; + + if (new_ops->fo_accept[0] == flow_l2_accept) + new_ops->fo_accept[0] = flow_ether_accept; + + } + *ftp = ft; +} + +void +mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp) +{ + mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID, + 1024, mip, ftp); +} + +/* + * Destroy flow table. + */ +void +mac_flow_tab_destroy(flow_tab_t *ft) +{ + if (ft == NULL) + return; + + ASSERT(ft->ft_flow_count == 0); + kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *)); + bzero(ft, sizeof (*ft)); + kmem_cache_free(flow_tab_cache, ft); +} + +/* + * Add a new flow entry to the global flow hash table + */ +int +mac_flow_hash_add(flow_entry_t *flent) +{ + int err; + + rw_enter(&flow_tab_lock, RW_WRITER); + err = mod_hash_insert(flow_hash, + (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent); + if (err != 0) { + rw_exit(&flow_tab_lock); + return (EEXIST); + } + /* Mark as inserted into the global flow hash table */ + FLOW_MARK(flent, FE_G_FLOW_HASH); + rw_exit(&flow_tab_lock); + return (err); +} + +/* + * Remove a flow entry from the global flow hash table + */ +void +mac_flow_hash_remove(flow_entry_t *flent) +{ + mod_hash_val_t val; + + rw_enter(&flow_tab_lock, RW_WRITER); + VERIFY(mod_hash_remove(flow_hash, + (mod_hash_key_t)flent->fe_flow_name, &val) == 0); + + /* Clear the mark that says inserted into the global flow hash table */ + FLOW_UNMARK(flent, FE_G_FLOW_HASH); + rw_exit(&flow_tab_lock); +} + +/* + * Retrieve a flow entry from the global flow hash table. + */ +int +mac_flow_lookup_byname(char *name, flow_entry_t **flentp) +{ + int err; + flow_entry_t *flent; + + rw_enter(&flow_tab_lock, RW_READER); + err = mod_hash_find(flow_hash, (mod_hash_key_t)name, + (mod_hash_val_t *)&flent); + if (err != 0) { + rw_exit(&flow_tab_lock); + return (ENOENT); + } + ASSERT(flent != NULL); + FLOW_USER_REFHOLD(flent); + rw_exit(&flow_tab_lock); + + *flentp = flent; + return (0); +} + +/* + * Initialize or release mac client flows by walking the subflow table. + * These are typically invoked during plumb/unplumb of links. 
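+ *
+ * Illustrative pairing (hypothetical caller):
+ *
+ *	mac_link_init_flows(mch);	... link plumbed, subflows activated
+ *	...
+ *	mac_link_release_flows(mch);	... link unplumbed, subflows quiesced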
+ */
+
+static int
+mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
+{
+	mac_client_impl_t *mcip = arg;
+
+	if (mac_link_flow_init(arg, flent) != 0) {
+		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
+		    flent->fe_flow_name, mcip->mci_name);
+	} else {
+		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
+	}
+	return (0);
+}
+
+void
+mac_link_init_flows(mac_client_handle_t mch)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+	    mac_link_init_flows_cb, mcip);
+	/*
+	 * If the mac client had subflow(s) configured before plumb, change
+	 * its callback function to mac_rx_srs_subflow_process and, in case
+	 * of hardware classification, disable polling.
+	 */
+	mac_client_update_classifier(mcip, B_TRUE);
+}
+
+boolean_t
+mac_link_has_flows(mac_client_handle_t mch)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+static int
+mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
+{
+	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
+	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
+	mac_link_flow_clean(arg, flent);
+	return (0);
+}
+
+void
+mac_link_release_flows(mac_client_handle_t mch)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+
+	/*
+	 * Change the mci_flent callback back to mac_rx_srs_process()
+	 * because flows are about to be deactivated.
+	 */
+	mac_client_update_classifier(mcip, B_FALSE);
+	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
+	    mac_link_release_flows_cb, mcip);
+}
+
+void
+mac_rename_flow(flow_entry_t *fep, const char *new_name)
+{
+	mac_flow_set_name(fep, new_name);
+	if (fep->fe_ksp != NULL) {
+		flow_stat_destroy(fep);
+		flow_stat_create(fep);
+	}
+}
+
+/*
+ * mac_link_flow_init()
+ * Internal flow interface used for allocating SRSs and related
+ * data structures. Not meant to be used by mac clients.
+ */
+int
+mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+	mac_impl_t *mip = mcip->mci_mip;
+	int err;
+
+	ASSERT(mch != NULL);
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
+		return (err);
+
+	sub_flow->fe_mcip = mcip;
+
+	return (0);
+}
+
+/*
+ * mac_link_flow_add()
+ * Used by flowadm(1M) or kernel mac clients for creating flows.
+ */
+int
+mac_link_flow_add(datalink_id_t linkid, char *flow_name,
+    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
+{
+	flow_entry_t *flent = NULL;
+	int err;
+	dls_dl_handle_t dlh;
+	dls_link_t *dlp;
+	boolean_t link_held = B_FALSE;
+	boolean_t hash_added = B_FALSE;
+	mac_perim_handle_t mph;
+
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err == 0) {
+		FLOW_USER_REFRELE(flent);
+		return (EEXIST);
+	}
+
+	/*
+	 * First create a flow entry given the description provided
+	 * by the caller.
+	 */
+	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
+	    FLOW_USER | FLOW_OTHER, &flent);
+
+	if (err != 0)
+		return (err);
+
+	/*
+	 * We've got a local variable referencing this flow now, so we need
+	 * to hold it. We'll release this flow before returning.
+	 * All failures until we return will undo any action that may have
+	 * internally taken a hold on the flow, so the last REFRELE will
+	 * assure a clean freeing of resources.
+	 */
+	FLOW_REFHOLD(flent);
+
+	flent->fe_link_id = linkid;
+	FLOW_MARK(flent, FE_INCIPIENT);
+
+	err = mac_perim_enter_by_linkid(linkid, &mph);
+	if (err != 0) {
+		FLOW_FINAL_REFRELE(flent);
+		return (err);
+	}
+
+	/*
+	 * dls will eventually be merged with mac so it's ok
+	 * to call dls' internal functions.
+	 */
+	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
+	if (err != 0)
+		goto bail;
+
+	link_held = B_TRUE;
+
+	/*
+	 * Add the flow to the global flow table; this table will be per
+	 * exclusive zone so each zone can have its own flow namespace.
+	 * RFE 6625651 will fix this.
+	 */
+	if ((err = mac_flow_hash_add(flent)) != 0)
+		goto bail;
+
+	hash_added = B_TRUE;
+
+	/*
+	 * Do not allow flows to be configured on an anchor VNIC.
+	 */
+	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
+		err = ENOTSUP;
+		goto bail;
+	}
+
+	/*
+	 * Save the zoneid of the underlying link in the flow entry;
+	 * this is needed to prevent a non-global zone from getting
+	 * statistics information of the global zone.
+	 */
+	flent->fe_zoneid = dlp->dl_zid;
+
+	/*
+	 * Add the subflow to the subflow table. Also instantiate the flow
+	 * in the mac if there is an active DLS user. The dl_mah is set when
+	 * dls_active_set() is called, typically during interface plumb.
+	 */
+	err = mac_flow_add_subflow(dlp->dl_mch, flent, dlp->dl_mah != NULL);
+	if (err != 0)
+		goto bail;
+
+	FLOW_UNMARK(flent, FE_INCIPIENT);
+	dls_devnet_rele_link(dlh, dlp);
+	mac_perim_exit(mph);
+	return (0);
+
+bail:
+	if (hash_added)
+		mac_flow_hash_remove(flent);
+
+	if (link_held)
+		dls_devnet_rele_link(dlh, dlp);
+
+	/*
+	 * Wait for any transient global flow hash refs to clear
+	 * and then release the creation reference on the flow.
+	 */
+	mac_flow_wait(flent, FLOW_USER_REF);
+	FLOW_FINAL_REFRELE(flent);
+	mac_perim_exit(mph);
+	return (err);
+}
+
+/*
+ * mac_link_flow_clean()
+ * Internal flow interface used for freeing SRSs and related
+ * data structures. Not meant to be used by mac clients.
+ */
+void
+mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+	mac_impl_t *mip = mcip->mci_mip;
+	boolean_t last_subflow;
+
+	ASSERT(mch != NULL);
+	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
+
+	/*
+	 * This sub flow entry may fail to be fully initialized by
+	 * mac_link_flow_init(). If so, simply return.
+	 */
+	if (sub_flow->fe_mcip == NULL)
+		return;
+
+	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
+	/*
+	 * Tear down the data path.
+	 */
+	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
+	sub_flow->fe_mcip = NULL;
+
+	/*
+	 * Delete the SRSs associated with this subflow. If this is being
+	 * driven by flowadm(1M) then the subflow will be deleted by
+	 * dls_rem_flow. However, if this is a result of the interface being
+	 * unplumbed then the subflow itself won't be deleted.
+	 */
+	mac_flow_cleanup(sub_flow);
+
+	/*
+	 * If all the subflows are gone, re-enable some of the things
+	 * we disabled when adding a subflow, polling etc.
+	 */
+	if (last_subflow) {
+		/*
+		 * The subflow table itself is not protected by any locks or
+		 * refcnts. Hence quiesce the client upfront before clearing
+		 * mci_subflow_tab.
+		 */
+		mac_client_quiesce(mcip);
+		mac_client_update_classifier(mcip, B_FALSE);
+		mac_flow_tab_destroy(mcip->mci_subflow_tab);
+		mcip->mci_subflow_tab = NULL;
+		mac_client_restart(mcip);
+	}
+}
+
+/*
+ * mac_link_flow_remove()
+ * Used by flowadm(1M) or kernel mac clients for removing flows.
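+ *
+ * A hedged usage sketch (hypothetical names and values; see
+ * mac_link_flow_add() above for the creation side):
+ *
+ *	flow_desc_t fd;
+ *
+ *	bzero(&fd, sizeof (fd));
+ *	fd.fd_mask = FLOW_LINK_DST | FLOW_LINK_VID;
+ *	fd.fd_mac_len = ETHERADDRL;
+ *	bcopy(dst_mac, fd.fd_dst_mac, ETHERADDRL);
+ *	fd.fd_vid = 5;
+ *	if (mac_link_flow_add(linkid, "vid5flow", &fd, mrp) == 0)
+ *		(void) mac_link_flow_remove("vid5flow");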
+ */
+int
+mac_link_flow_remove(char *flow_name)
+{
+	flow_entry_t *flent;
+	mac_perim_handle_t mph;
+	int err;
+	datalink_id_t linkid;
+
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err != 0)
+		return (err);
+
+	linkid = flent->fe_link_id;
+	FLOW_USER_REFRELE(flent);
+
+	/*
+	 * The perim must be acquired before acquiring any other references
+	 * to maintain the lock and perimeter hierarchy. Please note the
+	 * FLOW_USER_REFRELE above.
+	 */
+	err = mac_perim_enter_by_linkid(linkid, &mph);
+	if (err != 0)
+		return (err);
+
+	/*
+	 * Note the second lookup of the flow, because a concurrent thread
+	 * may have removed it already while we were waiting to enter the
+	 * link's perimeter.
+	 */
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err != 0) {
+		mac_perim_exit(mph);
+		return (err);
+	}
+	FLOW_USER_REFRELE(flent);
+
+	/*
+	 * Remove the flow from the subflow table and deactivate the flow
+	 * by quiescing and removing its SRSs.
+	 */
+	mac_flow_rem_subflow(flent);
+
+	/*
+	 * Finally, remove the flow from the global table.
+	 */
+	mac_flow_hash_remove(flent);
+
+	/*
+	 * Wait for any transient global flow hash refs to clear
+	 * and then release the creation reference on the flow.
+	 */
+	mac_flow_wait(flent, FLOW_USER_REF);
+	FLOW_FINAL_REFRELE(flent);
+
+	mac_perim_exit(mph);
+
+	return (0);
+}
+
+/*
+ * mac_link_flow_modify()
+ * Modifies the properties of a flow identified by its name.
+ */
+int
+mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
+{
+	flow_entry_t *flent;
+	mac_client_impl_t *mcip;
+	int err = 0;
+	mac_perim_handle_t mph;
+	datalink_id_t linkid;
+	flow_tab_t *flow_tab;
+
+	err = mac_validate_props(mrp);
+	if (err != 0)
+		return (err);
+
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err != 0)
+		return (err);
+
+	linkid = flent->fe_link_id;
+	FLOW_USER_REFRELE(flent);
+
+	/*
+	 * The perim must be acquired before acquiring any other references
+	 * to maintain the lock and perimeter hierarchy. Please note the
+	 * FLOW_USER_REFRELE above.
+	 */
+	err = mac_perim_enter_by_linkid(linkid, &mph);
+	if (err != 0)
+		return (err);
+
+	/*
+	 * Note the second lookup of the flow, because a concurrent thread
+	 * may have removed it already while we were waiting to enter the
+	 * link's perimeter.
+	 */
+	err = mac_flow_lookup_byname(flow_name, &flent);
+	if (err != 0) {
+		mac_perim_exit(mph);
+		return (err);
+	}
+	FLOW_USER_REFRELE(flent);
+
+	/*
+	 * If this flow is attached to a MAC client, then pass the request
+	 * along to the client.
+	 * Otherwise, just update the cached values.
+	 */
+	mcip = flent->fe_mcip;
+	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
+	if (mcip != NULL) {
+		if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
+			err = ENOENT;
+		} else {
+			mac_flow_modify(flow_tab, flent, mrp);
+		}
+	} else {
+		(void) mac_flow_modify_props(flent, mrp);
+	}
+
+done:
+	mac_perim_exit(mph);
+	return (err);
+}
+
+
+/*
+ * State structure and misc functions used by mac_link_flow_walk().
+ */
+ */ +typedef struct { + int (*ws_func)(mac_flowinfo_t *, void *); + void *ws_arg; +} flow_walk_state_t; + +static void +mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent) +{ + (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name, MAXNAMELEN); + finfop->fi_link_id = flent->fe_link_id; + finfop->fi_flow_desc = flent->fe_flow_desc; + finfop->fi_resource_props = flent->fe_resource_props; +} + +static int +mac_link_flow_walk_cb(flow_entry_t *flent, void *arg) +{ + flow_walk_state_t *statep = arg; + mac_flowinfo_t finfo; + + mac_link_flowinfo_copy(&finfo, flent); + return (statep->ws_func(&finfo, statep->ws_arg)); +} + +/* + * mac_link_flow_walk() + * Invokes callback 'func' for all flows belonging to the specified link. + */ +int +mac_link_flow_walk(datalink_id_t linkid, + int (*func)(mac_flowinfo_t *, void *), void *arg) +{ + mac_client_impl_t *mcip; + mac_perim_handle_t mph; + flow_walk_state_t state; + dls_dl_handle_t dlh; + dls_link_t *dlp; + int err; + + err = mac_perim_enter_by_linkid(linkid, &mph); + if (err != 0) + return (err); + + err = dls_devnet_hold_link(linkid, &dlh, &dlp); + if (err != 0) { + mac_perim_exit(mph); + return (err); + } + + mcip = (mac_client_impl_t *)dlp->dl_mch; + state.ws_func = func; + state.ws_arg = arg; + + err = mac_flow_walk_nolock(mcip->mci_subflow_tab, + mac_link_flow_walk_cb, &state); + + dls_devnet_rele_link(dlh, dlp); + mac_perim_exit(mph); + return (err); +} + +/* + * mac_link_flow_info() + * Retrieves information about a specific flow. + */ +int +mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo) +{ + flow_entry_t *flent; + int err; + + err = mac_flow_lookup_byname(flow_name, &flent); + if (err != 0) + return (err); + + mac_link_flowinfo_copy(finfo, flent); + FLOW_USER_REFRELE(flent); + return (0); +} + +#define HASH_MAC_VID(a, v, s) \ + ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s)) + +#define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end)) + +/* ARGSUSED */ +static boolean_t +flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l2info_t *l2 = &s->fs_l2info; + flow_desc_t *fd = &flent->fe_flow_desc; + + return (l2->l2_vid == fd->fd_vid && + bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0); +} + +/* + * Layer 2 hash function. + * Must be paired with flow_l2_accept() within a set of flow_ops + * because it assumes the dest address is already extracted. + */ +static uint32_t +flow_l2_hash(flow_tab_t *ft, flow_state_t *s) +{ + flow_l2info_t *l2 = &s->fs_l2info; + + return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size)); +} + +/* + * This is the generic layer 2 accept function. + * It makes use of mac_header_info() to extract the header length, + * sap, vlan ID and destination address. 
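+ *
+ * As a worked example (illustrative only): once a destination MAC of
+ * 00:00:00:aa:bb:cc and VID 5 have been extracted, the L2 hash above
+ * computes
+ *
+ *	HASH_MAC_VID(daddr, 5, 1024)
+ *	    = ((0xaa + 0xbb + 0xcc) ^ 5) % 1024
+ *	    = (561 ^ 5) % 1024 = 564
+ *
+ * so the packet is looked up in hash line 564 of a 1024-entry table.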
+ */ +static int +flow_l2_accept(flow_tab_t *ft, flow_state_t *s) +{ + boolean_t is_ether; + flow_l2info_t *l2 = &s->fs_l2info; + mac_header_info_t mhi; + int err; + + is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER); + if ((err = mac_header_info((mac_handle_t)ft->ft_mip, + s->fs_mp, &mhi)) != 0) { + if (err == EINVAL) + err = ENOBUFS; + + return (err); + } + + l2->l2_start = s->fs_mp->b_rptr; + l2->l2_daddr = (uint8_t *)mhi.mhi_daddr; + + if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN && + ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { + struct ether_vlan_header *evhp = + (struct ether_vlan_header *)l2->l2_start; + + if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) + return (ENOBUFS); + + l2->l2_sap = ntohs(evhp->ether_type); + l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); + l2->l2_hdrsize = sizeof (*evhp); + } else { + l2->l2_sap = mhi.mhi_bindsap; + l2->l2_vid = 0; + l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize; + } + return (0); +} + +/* + * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/ + * accept(). The notable difference is that dest address is now extracted + * by hash() rather than by accept(). This saves a few memory references + * for flow tables that do not care about mac addresses. + */ +static uint32_t +flow_ether_hash(flow_tab_t *ft, flow_state_t *s) +{ + flow_l2info_t *l2 = &s->fs_l2info; + struct ether_vlan_header *evhp; + + evhp = (struct ether_vlan_header *)l2->l2_start; + l2->l2_daddr = evhp->ether_dhost.ether_addr_octet; + return (HASH_MAC_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size)); +} + +/* ARGSUSED */ +static int +flow_ether_accept(flow_tab_t *ft, flow_state_t *s) +{ + flow_l2info_t *l2 = &s->fs_l2info; + struct ether_vlan_header *evhp; + uint16_t sap; + + evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr; + l2->l2_start = (uchar_t *)evhp; + + if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header))) + return (ENOBUFS); + + if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN && + ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) { + if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp))) + return (ENOBUFS); + + l2->l2_sap = ntohs(evhp->ether_type); + l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci)); + l2->l2_hdrsize = sizeof (struct ether_vlan_header); + } else { + l2->l2_sap = sap; + l2->l2_vid = 0; + l2->l2_hdrsize = sizeof (struct ether_header); + } + return (0); +} + +/* + * Validates a layer 2 flow entry. + */ +static int +flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent) +{ + int i; + flow_desc_t *fd = &flent->fe_flow_desc; + + /* + * Dest address is mandatory. + */ + if ((fd->fd_mask & FLOW_LINK_DST) == 0) + return (EINVAL); + + for (i = 0; i < fd->fd_mac_len; i++) { + if (fd->fd_dst_mac[i] != 0) + break; + } + if (i == fd->fd_mac_len || fd->fd_mac_len < ETHERADDRL) + return (EINVAL); + + if ((fd->fd_mask & FLOW_LINK_VID) != 0) { + /* + * VLAN flows are only supported over ethernet macs. + */ + if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER) + return (EINVAL); + + if (fd->fd_vid == 0) + return (EINVAL); + + } + flent->fe_match = flow_l2_match; + return (0); +} + +/* + * Calculates hash index of flow entry. + */ +static uint32_t +flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent) +{ + flow_desc_t *fd = &flent->fe_flow_desc; + + ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0); + return (HASH_MAC_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size)); +} + +/* + * This is used for duplicate flow checking. 
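+ * Two L2 flow entries are considered duplicates iff both the
+ * destination MAC address and the VID compare equal, i.e. at most one
+ * flow can be created over a given <dst-mac, vid> pair on a link.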
+ */ +/* ARGSUSED */ +static boolean_t +flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) +{ + flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; + + ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0); + return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac, + fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid); +} + +/* + * Generic flow entry insertion function. + * Used by flow tables that do not have ordering requirements. + */ +/* ARGSUSED */ +static int +flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp, + flow_entry_t *flent) +{ + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + if (*headp != NULL) { + ASSERT(flent->fe_next == NULL); + flent->fe_next = *headp; + } + *headp = flent; + return (0); +} + +/* + * IP version independent DSField matching function. + */ +/* ARGSUSED */ +static boolean_t +flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_desc_t *fd = &flent->fe_flow_desc; + + switch (l3info->l3_version) { + case IPV4_VERSION: { + ipha_t *ipha = (ipha_t *)l3info->l3_start; + + return ((ipha->ipha_type_of_service & + fd->fd_dsfield_mask) == fd->fd_dsfield); + } + case IPV6_VERSION: { + ip6_t *ip6h = (ip6_t *)l3info->l3_start; + + return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) & + fd->fd_dsfield_mask) == fd->fd_dsfield); + } + default: + return (B_FALSE); + } +} + +/* + * IP v4 and v6 address matching. + * The netmask only needs to be applied on the packet but not on the + * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets. + */ + +/* ARGSUSED */ +static boolean_t +flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_desc_t *fd = &flent->fe_flow_desc; + ipha_t *ipha = (ipha_t *)l3info->l3_start; + in_addr_t addr; + + addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src); + if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { + return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) == + V4_PART_OF_V6(fd->fd_local_addr)); + } + return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) == + V4_PART_OF_V6(fd->fd_remote_addr)); +} + +/* ARGSUSED */ +static boolean_t +flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_desc_t *fd = &flent->fe_flow_desc; + ip6_t *ip6h = (ip6_t *)l3info->l3_start; + in6_addr_t *addrp; + + addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src); + if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) { + return (V6_MASK_EQ(*addrp, fd->fd_local_netmask, + fd->fd_local_addr)); + } + return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr)); +} + +/* ARGSUSED */ +static boolean_t +flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_desc_t *fd = &flent->fe_flow_desc; + + return (l3info->l3_protocol == fd->fd_protocol); +} + +static uint32_t +flow_ip_hash(flow_tab_t *ft, flow_state_t *s) +{ + flow_l3info_t *l3info = &s->fs_l3info; + flow_mask_t mask = ft->ft_mask; + + if ((mask & FLOW_IP_LOCAL) != 0) { + l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0); + } else if ((mask & FLOW_IP_REMOTE) != 0) { + l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0); + } else if ((mask & FLOW_IP_DSFIELD) != 0) { + /* + * DSField flents are arranged as a single list. + */ + return (0); + } + /* + * IP addr flents are hashed into two lists, v4 or v6. 
+	 */
+	ASSERT(ft->ft_size >= 2);
+	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
+}
+
+static uint32_t
+flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+
+	return (l3info->l3_protocol % ft->ft_size);
+}
+
+/* ARGSUSED */
+static int
+flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
+{
+	flow_l2info_t	*l2info = &s->fs_l2info;
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	uint16_t	sap = l2info->l2_sap;
+	uchar_t		*l3_start;
+
+	l3info->l3_start = l3_start = l2info->l2_start + l2info->l2_hdrsize;
+	if (!OK_32PTR(l3_start))
+		return (EINVAL);
+
+	switch (sap) {
+	case ETHERTYPE_IP: {
+		ipha_t	*ipha = (ipha_t *)l3_start;
+
+		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
+			return (ENOBUFS);
+
+		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
+		l3info->l3_protocol = ipha->ipha_protocol;
+		l3info->l3_version = IPV4_VERSION;
+		l3info->l3_fragmented =
+		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
+		break;
+	}
+	case ETHERTYPE_IPV6: {
+		ip6_t		*ip6h = (ip6_t *)l3_start;
+		uint16_t	ip6_hdrlen;
+		uint8_t		nexthdr;
+
+		if (!mac_ip_hdr_length_v6(s->fs_mp, ip6h, &ip6_hdrlen,
+		    &nexthdr)) {
+			return (ENOBUFS);
+		}
+		l3info->l3_hdrsize = ip6_hdrlen;
+		l3info->l3_protocol = nexthdr;
+		l3info->l3_version = IPV6_VERSION;
+		l3info->l3_fragmented = B_FALSE;
+		break;
+	}
+	default:
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	switch (fd->fd_protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_SCTP:
+	case IPPROTO_ICMP:
+	case IPPROTO_ICMPV6:
+		flent->fe_match = flow_ip_proto_match;
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
+/* ARGSUSED */
+static int
+flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+	flow_mask_t	mask;
+	uint8_t		version;
+	in6_addr_t	*addr, *netmask;
+
+	/*
+	 * DSField does not require an IP version.
+	 */
+	if (fd->fd_mask == FLOW_IP_DSFIELD) {
+		if (fd->fd_dsfield_mask == 0)
+			return (EINVAL);
+
+		flent->fe_match = flow_ip_dsfield_match;
+		return (0);
+	}
+
+	/*
+	 * IP addresses must come with a version to avoid ambiguity.
+	 */
+	if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
+		return (EINVAL);
+
+	version = fd->fd_ipversion;
+	if (version != IPV4_VERSION && version != IPV6_VERSION)
+		return (EINVAL);
+
+	mask = fd->fd_mask & ~FLOW_IP_VERSION;
+	switch (mask) {
+	case FLOW_IP_LOCAL:
+		addr = &fd->fd_local_addr;
+		netmask = &fd->fd_local_netmask;
+		break;
+	case FLOW_IP_REMOTE:
+		addr = &fd->fd_remote_addr;
+		netmask = &fd->fd_remote_netmask;
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	/*
+	 * Apply the netmask to the specified address.
+	 */
+	V6_MASK_COPY(*addr, *netmask, *addr);
+	if (version == IPV4_VERSION) {
+		ipaddr_t	v4addr = V4_PART_OF_V6((*addr));
+		ipaddr_t	v4mask = V4_PART_OF_V6((*netmask));
+
+		if (v4addr == 0 || v4mask == 0)
+			return (EINVAL);
+		flent->fe_match = flow_ip_v4_match;
+	} else {
+		if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
+		    IN6_IS_ADDR_UNSPECIFIED(netmask))
+			return (EINVAL);
+		flent->fe_match = flow_ip_v6_match;
+	}
+	return (0);
+}
+
+static uint32_t
+flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	return (fd->fd_protocol % ft->ft_size);
+}
+
+static uint32_t
+flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	/*
+	 * DSField flents are arranged as a single list.
+ */ + if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) + return (0); + + /* + * IP addr flents are hashed into two lists, v4 or v6. + */ + ASSERT(ft->ft_size >= 2); + return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1); +} + +/* ARGSUSED */ +static boolean_t +flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) +{ + flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; + + return (fd1->fd_protocol == fd2->fd_protocol); +} + +/* ARGSUSED */ +static boolean_t +flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2) +{ + flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc; + in6_addr_t *a1, *m1, *a2, *m2; + + ASSERT(fd1->fd_mask == fd2->fd_mask); + if (fd1->fd_mask == FLOW_IP_DSFIELD) { + return (fd1->fd_dsfield == fd2->fd_dsfield && + fd1->fd_dsfield_mask == fd2->fd_dsfield_mask); + } + + /* + * flow_ip_accept_fe() already validated the version. + */ + ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0); + if (fd1->fd_ipversion != fd2->fd_ipversion) + return (B_FALSE); + + switch (fd1->fd_mask & ~FLOW_IP_VERSION) { + case FLOW_IP_LOCAL: + a1 = &fd1->fd_local_addr; + m1 = &fd1->fd_local_netmask; + a2 = &fd2->fd_local_addr; + m2 = &fd2->fd_local_netmask; + break; + case FLOW_IP_REMOTE: + a1 = &fd1->fd_remote_addr; + m1 = &fd1->fd_remote_netmask; + a2 = &fd2->fd_remote_addr; + m2 = &fd2->fd_remote_netmask; + break; + default: + /* + * This is unreachable given the checks in + * flow_ip_accept_fe(). + */ + return (B_FALSE); + } + + if (fd1->fd_ipversion == IPV4_VERSION) { + return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) && + V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2))); + + } else { + return (IN6_ARE_ADDR_EQUAL(a1, a2) && + IN6_ARE_ADDR_EQUAL(m1, m2)); + } +} + +static int +flow_ip_mask2plen(in6_addr_t *v6mask) +{ + int bits; + int plen = IPV6_ABITS; + int i; + + for (i = 3; i >= 0; i--) { + if (v6mask->s6_addr32[i] == 0) { + plen -= 32; + continue; + } + bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; + if (bits == 0) + break; + plen -= bits; + } + return (plen); +} + +/* ARGSUSED */ +static int +flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp, + flow_entry_t *flent) +{ + flow_entry_t **p = headp; + flow_desc_t *fd0, *fd; + in6_addr_t *m0, *m; + int plen0, plen; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip)); + + /* + * No special ordering needed for dsfield. + */ + fd0 = &flent->fe_flow_desc; + if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) { + if (*p != NULL) { + ASSERT(flent->fe_next == NULL); + flent->fe_next = *p; + } + *p = flent; + return (0); + } + + /* + * IP address flows are arranged in descending prefix length order. + */ + m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ? + &fd0->fd_local_netmask : &fd0->fd_remote_netmask; + plen0 = flow_ip_mask2plen(m0); + ASSERT(plen0 != 0); + + for (; *p != NULL; p = &(*p)->fe_next) { + fd = &(*p)->fe_flow_desc; + + /* + * Normally a dsfield flent shouldn't end up on the same + * list as an IP address because flow tables are (for now) + * disjoint. If we decide to support both IP and dsfield + * in the same table in the future, this check will allow + * for that. + */ + if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0) + continue; + + /* + * We also allow for the mixing of local and remote address + * flents within one list. + */ + m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ? 
+		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
+		plen = flow_ip_mask2plen(m);
+
+		if (plen <= plen0)
+			break;
+	}
+	if (*p != NULL) {
+		ASSERT(flent->fe_next == NULL);
+		flent->fe_next = *p;
+	}
+	*p = flent;
+	return (0);
+}
+
+/*
+ * Transport layer protocol and port matching functions.
+ */
+
+/* ARGSUSED */
+static boolean_t
+flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	flow_l4info_t	*l4info = &s->fs_l4info;
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	return (fd->fd_protocol == l3info->l3_protocol &&
+	    fd->fd_local_port == l4info->l4_hash_port);
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	flow_l4info_t	*l4info = &s->fs_l4info;
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+
+	return (fd->fd_protocol == l3info->l3_protocol &&
+	    fd->fd_remote_port == l4info->l4_hash_port);
+}
+
+/*
+ * Transport hash function.
+ * Since we only support either local or remote port flows,
+ * we only need to extract one of the ports to be used for
+ * matching.
+ */
+static uint32_t
+flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	flow_l4info_t	*l4info = &s->fs_l4info;
+	uint8_t		proto = l3info->l3_protocol;
+	boolean_t	dst_or_src;
+
+	if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
+		dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
+	} else {
+		dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
+	}
+
+	l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
+	    l4info->l4_src_port;
+
+	return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
+}
+
+/*
+ * Unlike other accept() functions above, we do not need to get the header
+ * size because this is our highest layer so far. If we want to support
+ * other higher-layer protocols, we would need to save the l4_hdrsize
+ * in the code below.
+ */
+
+/* ARGSUSED */
+static int
+flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
+{
+	flow_l3info_t	*l3info = &s->fs_l3info;
+	flow_l4info_t	*l4info = &s->fs_l4info;
+	uint8_t		proto = l3info->l3_protocol;
+	uchar_t		*l4_start;
+
+	l4info->l4_start = l4_start = l3info->l3_start + l3info->l3_hdrsize;
+	if (!OK_32PTR(l4_start))
+		return (EINVAL);
+
+	if (l3info->l3_fragmented == B_TRUE)
+		return (EINVAL);
+
+	switch (proto) {
+	case IPPROTO_TCP: {
+		struct tcphdr	*tcph = (struct tcphdr *)l4_start;
+
+		if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
+			return (ENOBUFS);
+
+		l4info->l4_src_port = tcph->th_sport;
+		l4info->l4_dst_port = tcph->th_dport;
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr	*udph = (struct udphdr *)l4_start;
+
+		if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
+			return (ENOBUFS);
+
+		l4info->l4_src_port = udph->uh_sport;
+		l4info->l4_dst_port = udph->uh_dport;
+		break;
+	}
+	case IPPROTO_SCTP: {
+		sctp_hdr_t	*sctph = (sctp_hdr_t *)l4_start;
+
+		if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
+			return (ENOBUFS);
+
+		l4info->l4_src_port = sctph->sh_sport;
+		l4info->l4_dst_port = sctph->sh_dport;
+		break;
+	}
+	default:
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Validates a transport flow entry.
+ * The protocol field must be present.
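+ *
+ * For example, a descriptor with fd_mask ==
+ * (FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL), fd_protocol == IPPROTO_TCP
+ * and a non-zero fd_local_port (kept, like the packet fields it is
+ * compared against, in network byte order) passes validation, while a
+ * descriptor carrying FLOW_IP_PROTOCOL alone belongs to the protocol
+ * table and is rejected here with EOPNOTSUPP.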
+ */
+
+/* ARGSUSED */
+static int
+flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+	flow_mask_t	mask = fd->fd_mask;
+
+	if ((mask & FLOW_IP_PROTOCOL) == 0)
+		return (EINVAL);
+
+	switch (fd->fd_protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_SCTP:
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	switch (mask & ~FLOW_IP_PROTOCOL) {
+	case FLOW_ULP_PORT_LOCAL:
+		if (fd->fd_local_port == 0)
+			return (EINVAL);
+
+		flent->fe_match = flow_transport_lport_match;
+		break;
+	case FLOW_ULP_PORT_REMOTE:
+		if (fd->fd_remote_port == 0)
+			return (EINVAL);
+
+		flent->fe_match = flow_transport_rport_match;
+		break;
+	case 0:
+		/*
+		 * transport-only flows conflict with our table type.
+		 */
+		return (EOPNOTSUPP);
+	default:
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+static uint32_t
+flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
+{
+	flow_desc_t	*fd = &flent->fe_flow_desc;
+	uint16_t	port = 0;
+
+	port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
+	    fd->fd_local_port : fd->fd_remote_port;
+
+	return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
+}
+
+/* ARGSUSED */
+static boolean_t
+flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
+{
+	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
+
+	if (fd1->fd_protocol != fd2->fd_protocol)
+		return (B_FALSE);
+
+	if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
+		return (fd1->fd_local_port == fd2->fd_local_port);
+
+	return (fd1->fd_remote_port == fd2->fd_remote_port);
+}
+
+static flow_ops_t flow_l2_ops = {
+	flow_l2_accept_fe,
+	flow_l2_hash_fe,
+	flow_l2_match_fe,
+	flow_generic_insert_fe,
+	flow_l2_hash,
+	{flow_l2_accept}
+};
+
+static flow_ops_t flow_ip_ops = {
+	flow_ip_accept_fe,
+	flow_ip_hash_fe,
+	flow_ip_match_fe,
+	flow_ip_insert_fe,
+	flow_ip_hash,
+	{flow_l2_accept, flow_ip_accept}
+};
+
+static flow_ops_t flow_ip_proto_ops = {
+	flow_ip_proto_accept_fe,
+	flow_ip_proto_hash_fe,
+	flow_ip_proto_match_fe,
+	flow_generic_insert_fe,
+	flow_ip_proto_hash,
+	{flow_l2_accept, flow_ip_accept}
+};
+
+static flow_ops_t flow_transport_ops = {
+	flow_transport_accept_fe,
+	flow_transport_hash_fe,
+	flow_transport_match_fe,
+	flow_generic_insert_fe,
+	flow_transport_hash,
+	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
+};
+
+static flow_tab_info_t flow_tab_info_list[] = {
+	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
+	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
+	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
+	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
+	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024}
+};
+
+#define	FLOW_MAX_TAB_INFO \
+	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
+
+static flow_tab_info_t *
+mac_flow_tab_info_get(flow_mask_t mask)
+{
+	int	i;
+
+	for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
+		if (mask == flow_tab_info_list[i].fti_mask)
+			return (&flow_tab_info_list[i]);
+	}
+	return (NULL);
+}
diff --git a/usr/src/uts/common/io/mac/mac_hio.c b/usr/src/uts/common/io/mac/mac_hio.c
new file mode 100644
index 0000000000..d930506ae7
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_hio.c
@@ -0,0 +1,182 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * MAC Hybrid I/O related code.
+ */
+
+#include <sys/types.h>
+#include <sys/sdt.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+
+
+/*
+ * Return the number of shares supported by the specified MAC.
+ */
+int
+mac_share_capable(mac_handle_t mh)
+{
+	mac_impl_t *mip = (mac_impl_t *)mh;
+
+	return (mip->mi_share_capab.ms_snum);
+}
+
+
+/*
+ * Allocate a share to the specified MAC client. Invoked when
+ * mac_client_open() is invoked with MAC_OPEN_FLAGS_SHARES_DESIRED set.
+ */
+void
+i_mac_share_alloc(mac_client_impl_t *mcip)
+{
+	mac_impl_t *mip = mcip->mci_mip;
+	int rv;
+
+	i_mac_perim_enter(mip);
+
+	ASSERT(mcip->mci_share == NULL);
+
+	if (mac_share_capable((mac_handle_t)mcip->mci_mip) == 0) {
+		DTRACE_PROBE1(i__mac__share__alloc__not__sup,
+		    mac_client_impl_t *, mcip);
+		i_mac_perim_exit(mip);
+		return;
+	}
+
+	rv = mip->mi_share_capab.ms_salloc(mip->mi_share_capab.ms_handle,
+	    &mcip->mci_share);
+	DTRACE_PROBE3(i__mac__share__alloc, mac_client_impl_t *, mcip,
+	    int, rv, mac_share_handle_t, mcip->mci_share);
+
+	mcip->mci_share_bound = B_FALSE;
+
+	i_mac_perim_exit(mip);
+}
+
+
+/*
+ * Free a share previously allocated through i_mac_share_alloc().
+ * Safely handles the case when no shares were allocated to the MAC client.
+ */
+void
+i_mac_share_free(mac_client_impl_t *mcip)
+{
+	mac_impl_t *mip = mcip->mci_mip;
+
+	i_mac_perim_enter(mip);
+
+	/* MAC clients are required to unbind their shares before freeing them */
+	ASSERT(!mcip->mci_share_bound);
+
+	if (mcip->mci_share == NULL) {
+		i_mac_perim_exit(mip);
+		return;
+	}
+
+	mip->mi_share_capab.ms_sfree(mcip->mci_share);
+	i_mac_perim_exit(mip);
+}
+
+
+/*
+ * Bind a share. After this operation the rings that were associated
+ * with the MAC client are mapped directly into the corresponding
+ * guest domain.
+ */
+int
+mac_share_bind(mac_client_handle_t mch, uint64_t cookie, uint64_t *rcookie)
+{
+	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
+	mac_impl_t *mip = mcip->mci_mip;
+	int rv;
+
+	i_mac_perim_enter(mip);
+
+	if (mcip->mci_share == NULL) {
+		i_mac_perim_exit(mip);
+		return (ENOTSUP);
+	}
+
+	ASSERT(!mcip->mci_share_bound);
+
+	/*
+	 * Temporarily suspend the TX traffic for that client to make sure
+	 * there are no in-flight packets through a transmit ring
+	 * which is being bound to another domain.
+	 */
+	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
+
+	/*
+	 * For the receive path, no traffic will be sent up through
+	 * the rings to the IO domain. For TX, we need to ensure
+	 * that traffic sent by the MAC client is sent through
+	 * the default ring.
+	 *
+	 * For TX XXX will ensure that packets are sent through the
+	 * default ring if the share of the MAC client is bound.
+	 */
+
+	rv = mip->mi_share_capab.ms_sbind(mcip->mci_share, cookie, rcookie);
+	if (rv == 0)
+		mcip->mci_share_bound = B_TRUE;
+
+	/*
+	 * Resume TX traffic for the MAC client.
Since mci_share_bound is set + * to B_TRUE, mac_tx_send() will not send traffic to individual TX + * rings until the share is unbound. + */ + mac_tx_client_restart(mcip); + + i_mac_perim_exit(mip); + + return (rv); +} + + +/* + * Unbind a share. + */ +void +mac_share_unbind(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_impl_t *mip = mcip->mci_mip; + + i_mac_perim_enter(mip); + + if (mcip->mci_share == NULL) { + i_mac_perim_exit(mip); + return; + } + + mip->mi_share_capab.ms_sunbind(mcip->mci_share); + + mcip->mci_share_bound = B_FALSE; + + i_mac_perim_exit(mip); +} diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c new file mode 100644 index 0000000000..714fb79afb --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -0,0 +1,1031 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/id_space.h> +#include <sys/esunddi.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/dlpi.h> +#include <sys/modhash.h> +#include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_impl.h> +#include <sys/mac_client_impl.h> +#include <sys/mac_client_priv.h> +#include <sys/mac_soft_ring.h> +#include <sys/modctl.h> +#include <sys/fs/dv_node.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/callb.h> +#include <sys/cpuvar.h> +#include <sys/atomic.h> +#include <sys/sdt.h> +#include <sys/mac_flow.h> +#include <sys/ddi_intr_impl.h> +#include <sys/disp.h> +#include <sys/sdt.h> + +/* + * MAC Provider Interface. + * + * Interface for GLDv3 compatible NIC drivers. + */ + +static void i_mac_notify_thread(void *); + +typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *); + +typedef struct mac_notify_default_cb_s { + mac_notify_type_t mac_notify_type; + mac_notify_default_cb_fn_t mac_notify_cb_fn; +}mac_notify_default_cb_t; + +mac_notify_default_cb_t mac_notify_cb_list[] = { + { MAC_NOTE_LINK, mac_fanout_recompute}, + { MAC_NOTE_PROMISC, NULL}, + { MAC_NOTE_UNICST, NULL}, + { MAC_NOTE_TX, NULL}, + { MAC_NOTE_RESOURCE, NULL}, + { MAC_NOTE_DEVPROMISC, NULL}, + { MAC_NOTE_FASTPATH_FLUSH, NULL}, + { MAC_NOTE_SDU_SIZE, NULL}, + { MAC_NOTE_MARGIN, NULL}, + { MAC_NOTE_CAPAB_CHG, NULL}, + { MAC_NNOTE, NULL}, +}; + +/* + * Driver support functions. + */ + +/* REGISTRATION */ + +mac_register_t * +mac_alloc(uint_t mac_version) +{ + mac_register_t *mregp; + + /* + * Make sure there isn't a version mismatch between the driver and + * the framework. 
In the future, if multiple versions are + * supported, this check could become more sophisticated. + */ + if (mac_version != MAC_VERSION) + return (NULL); + + mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP); + mregp->m_version = mac_version; + return (mregp); +} + +void +mac_free(mac_register_t *mregp) +{ + kmem_free(mregp, sizeof (mac_register_t)); +} + +/* + * mac_register() is how drivers register new MACs with the GLDv3 + * framework. The mregp argument is allocated by drivers using the + * mac_alloc() function, and can be freed using mac_free() immediately upon + * return from mac_register(). Upon success (0 return value), the mhp + * opaque pointer becomes the driver's handle to its MAC interface, and is + * the argument to all other mac module entry points. + */ +/* ARGSUSED */ +int +mac_register(mac_register_t *mregp, mac_handle_t *mhp) +{ + mac_impl_t *mip; + mactype_t *mtype; + int err = EINVAL; + struct devnames *dnp = NULL; + uint_t instance; + boolean_t style1_created = B_FALSE; + boolean_t style2_created = B_FALSE; + mac_capab_legacy_t legacy; + char *driver; + minor_t minor = 0; + + /* Find the required MAC-Type plugin. */ + if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL) + return (EINVAL); + + /* Create a mac_impl_t to represent this MAC. */ + mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP); + + /* + * The mac is not ready for open yet. + */ + mip->mi_state_flags |= MIS_DISABLED; + + /* + * When a mac is registered, the m_instance field can be set to: + * + * 0: Get the mac's instance number from m_dip. + * This is usually used for physical device dips. + * + * [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number. + * For example, when an aggregation is created with the key option, + * "key" will be used as the instance number. + * + * -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1]. + * This is often used when a MAC of a virtual link is registered + * (e.g., aggregation when "key" is not specified, or vnic). + * + * Note that the instance number is used to derive the mi_minor field + * of mac_impl_t, which will then be used to derive the name of kstats + * and the devfs nodes. The first 2 cases are needed to preserve + * backward compatibility. + */ + switch (mregp->m_instance) { + case 0: + instance = ddi_get_instance(mregp->m_dip); + break; + case ((uint_t)-1): + minor = mac_minor_hold(B_TRUE); + if (minor == 0) { + err = ENOSPC; + goto fail; + } + instance = minor - 1; + break; + default: + instance = mregp->m_instance; + if (instance >= MAC_MAX_MINOR) { + err = EINVAL; + goto fail; + } + break; + } + + mip->mi_minor = (minor_t)(instance + 1); + mip->mi_dip = mregp->m_dip; + mip->mi_clients_list = NULL; + mip->mi_nclients = 0; + + driver = (char *)ddi_driver_name(mip->mi_dip); + + /* Construct the MAC name as <drvname><instance> */ + (void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d", + driver, instance); + + mip->mi_driver = mregp->m_driver; + + mip->mi_type = mtype; + mip->mi_margin = mregp->m_margin; + mip->mi_info.mi_media = mtype->mt_type; + mip->mi_info.mi_nativemedia = mtype->mt_nativetype; + if (mregp->m_max_sdu <= mregp->m_min_sdu) + goto fail; + mip->mi_sdu_min = mregp->m_min_sdu; + mip->mi_sdu_max = mregp->m_max_sdu; + mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length; + /* + * If the media supports a broadcast address, cache a pointer to it + * in the mac_info_t so that upper layers can use it. 
+	 */
+	mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr;
+
+	mip->mi_v12n_level = mregp->m_v12n;
+
+	/*
+	 * Copy the unicast source address into the mac_info_t, but only if
+	 * the MAC-Type defines a non-zero address length. We need to
+	 * handle MAC-Types that have an address length of 0
+	 * (point-to-point protocol MACs for example).
+	 */
+	if (mip->mi_type->mt_addr_length > 0) {
+		if (mregp->m_src_addr == NULL)
+			goto fail;
+		mip->mi_info.mi_unicst_addr =
+		    kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP);
+		bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr,
+		    mip->mi_type->mt_addr_length);
+
+		/*
+		 * Copy the fixed 'factory' MAC address from the immutable
+		 * info. This is taken to be the MAC address currently in
+		 * use.
+		 */
+		bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr,
+		    mip->mi_type->mt_addr_length);
+
+		/*
+		 * At this point, we should set up the classification
+		 * rules etc but we delay it till mac_open() so that
+		 * the resource discovery has taken place and we
+		 * know someone wants to use the device. Otherwise
+		 * memory gets allocated for Rx ring structures even
+		 * during probe.
+		 */
+
+		/* Copy the destination address if one is provided. */
+		if (mregp->m_dst_addr != NULL) {
+			bcopy(mregp->m_dst_addr, mip->mi_dstaddr,
+			    mip->mi_type->mt_addr_length);
+		}
+	} else if (mregp->m_src_addr != NULL) {
+		goto fail;
+	}
+
+	/*
+	 * The format of the m_pdata is specific to the plugin. It is
+	 * passed in as an argument to all of the plugin callbacks. The
+	 * driver can update this information by calling
+	 * mac_pdata_update().
+	 */
+	if (mregp->m_pdata != NULL) {
+		/*
+		 * Verify that the plugin supports MAC plugin data and that
+		 * the supplied data is valid.
+		 */
+		if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
+			goto fail;
+		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
+		    mregp->m_pdata_size)) {
+			goto fail;
+		}
+		mip->mi_pdata = kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
+		bcopy(mregp->m_pdata, mip->mi_pdata, mregp->m_pdata_size);
+		mip->mi_pdata_size = mregp->m_pdata_size;
+	}
+
+	/*
+	 * Register the private properties.
+	 */
+	mac_register_priv_prop(mip, mregp->m_priv_props,
+	    mregp->m_priv_prop_count);
+
+	/*
+	 * Stash the driver callbacks into the mac_impl_t, but first sanity
+	 * check to make sure all mandatory callbacks are set.
+	 */
+	if (mregp->m_callbacks->mc_getstat == NULL ||
+	    mregp->m_callbacks->mc_start == NULL ||
+	    mregp->m_callbacks->mc_stop == NULL ||
+	    mregp->m_callbacks->mc_setpromisc == NULL ||
+	    mregp->m_callbacks->mc_multicst == NULL) {
+		goto fail;
+	}
+	mip->mi_callbacks = mregp->m_callbacks;
+
+	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY, &legacy))
+		mip->mi_state_flags |= MIS_LEGACY;
+
+	if (mip->mi_state_flags & MIS_LEGACY) {
+		mip->mi_unsup_note = legacy.ml_unsup_note;
+		mip->mi_phy_dev = legacy.ml_dev;
+	} else {
+		mip->mi_unsup_note = 0;
+		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
+		    ddi_get_instance(mip->mi_dip) + 1);
+	}
+
+	/*
+	 * Allocate a notification thread. thread_create() blocks for memory
+	 * if needed; it never fails.
+	 */
+	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
+	    mip, 0, &p0, TS_RUN, minclsyspri);
+
+	/*
+	 * Initialize the capabilities.
+	 */
+
+	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
+		mip->mi_state_flags |= MIS_IS_VNIC;
+
+	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
+		mip->mi_state_flags |= MIS_IS_AGGR;
+
+	mac_addr_factory_init(mip);
+
+	/*
+	 * Enforce the virtualization level registered.
+ */ + if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) { + if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 || + mac_init_rings(mip, MAC_RING_TYPE_TX) != 0) + goto fail; + + /* + * The driver needs to register at least rx rings for this + * virtualization level. + */ + if (mip->mi_rx_groups == NULL) + goto fail; + } + + /* + * The driver must set mc_unicst entry point to NULL when it advertises + * CAP_RINGS for rx groups. + */ + if (mip->mi_rx_groups != NULL) { + if (mregp->m_callbacks->mc_unicst != NULL) + goto fail; + } else { + if (mregp->m_callbacks->mc_unicst == NULL) + goto fail; + } + + /* + * The driver must set mc_tx entry point to NULL when it advertises + * CAP_RINGS for tx rings. + */ + if (mip->mi_tx_groups != NULL) { + if (mregp->m_callbacks->mc_tx != NULL) + goto fail; + } else { + if (mregp->m_callbacks->mc_tx == NULL) + goto fail; + } + + /* + * Initialize MAC addresses. Must be called after mac_init_rings(). + */ + mac_init_macaddr(mip); + + mip->mi_share_capab.ms_snum = 0; + if (mip->mi_v12n_level & MAC_VIRT_HIO) { + (void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES, + &mip->mi_share_capab); + } + + /* + * Initialize the kstats for this device. + */ + mac_stat_create(mip); + + /* Zero out any properties. */ + bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t)); + + /* set the gldv3 flag in dn_flags */ + dnp = &devnamesp[ddi_driver_major(mip->mi_dip)]; + LOCK_DEV_OPS(&dnp->dn_lock); + dnp->dn_flags |= (DN_GLDV3_DRIVER | DN_NETWORK_DRIVER); + UNLOCK_DEV_OPS(&dnp->dn_lock); + + if (mip->mi_minor < MAC_MAX_MINOR + 1) { + /* Create a style-2 DLPI device */ + if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0, + DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS) + goto fail; + style2_created = B_TRUE; + + /* Create a style-1 DLPI device */ + if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR, + mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS) + goto fail; + style1_created = B_TRUE; + } + + mac_flow_l2tab_create(mip, &mip->mi_flow_tab); + + rw_enter(&i_mac_impl_lock, RW_WRITER); + if (mod_hash_insert(i_mac_impl_hash, + (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) { + rw_exit(&i_mac_impl_lock); + err = EEXIST; + goto fail; + } + + DTRACE_PROBE2(mac__register, struct devnames *, dnp, + (mac_impl_t *), mip); + + /* + * Mark the MAC to be ready for open. 
+	 */
+	mip->mi_state_flags &= ~MIS_DISABLED;
+	rw_exit(&i_mac_impl_lock);
+
+	atomic_inc_32(&i_mac_impl_count);
+
+	cmn_err(CE_NOTE, "!%s registered", mip->mi_name);
+	*mhp = (mac_handle_t)mip;
+	return (0);
+
+fail:
+	if (style1_created)
+		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
+
+	if (style2_created)
+		ddi_remove_minor_node(mip->mi_dip, driver);
+
+	mac_addr_factory_fini(mip);
+
+	/* Clean up registered MAC addresses */
+	mac_fini_macaddr(mip);
+
+	/* Clean up registered rings */
+	mac_free_rings(mip, MAC_RING_TYPE_RX);
+	mac_free_rings(mip, MAC_RING_TYPE_TX);
+
+	/* Clean up notification thread */
+	if (mip->mi_notify_thread != NULL)
+		i_mac_notify_exit(mip);
+
+	if (mip->mi_info.mi_unicst_addr != NULL) {
+		kmem_free(mip->mi_info.mi_unicst_addr,
+		    mip->mi_type->mt_addr_length);
+		mip->mi_info.mi_unicst_addr = NULL;
+	}
+
+	mac_stat_destroy(mip);
+
+	if (mip->mi_type != NULL) {
+		atomic_dec_32(&mip->mi_type->mt_ref);
+		mip->mi_type = NULL;
+	}
+
+	if (mip->mi_pdata != NULL) {
+		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+		mip->mi_pdata = NULL;
+		mip->mi_pdata_size = 0;
+	}
+
+	if (minor != 0) {
+		ASSERT(minor > MAC_MAX_MINOR);
+		mac_minor_rele(minor);
+	}
+
+	mac_unregister_priv_prop(mip);
+
+	kmem_cache_free(i_mac_impl_cachep, mip);
+	return (err);
+}
+
+/*
+ * Unregister from the GLDv3 framework.
+ */
+int
+mac_unregister(mac_handle_t mh)
+{
+	int			err;
+	mac_impl_t		*mip = (mac_impl_t *)mh;
+	mod_hash_val_t		val;
+	mac_margin_req_t	*mmr, *nextmmr;
+
+	/* Fail the unregister if there are any open references to this mac. */
+	if ((err = mac_disable_nowait(mh)) != 0)
+		return (err);
+
+	/*
+	 * Clean up notification thread and wait for it to exit.
+	 */
+	i_mac_notify_exit(mip);
+
+	i_mac_perim_enter(mip);
+
+	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
+		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
+		ddi_remove_minor_node(mip->mi_dip,
+		    (char *)ddi_driver_name(mip->mi_dip));
+	}
+
+	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
+	    MIS_EXCLUSIVE));
+
+	mac_stat_destroy(mip);
+
+	(void) mod_hash_remove(i_mac_impl_hash,
+	    (mod_hash_key_t)mip->mi_name, &val);
+	ASSERT(mip == (mac_impl_t *)val);
+
+	ASSERT(i_mac_impl_count > 0);
+	atomic_dec_32(&i_mac_impl_count);
+
+	if (mip->mi_pdata != NULL)
+		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+	mip->mi_pdata = NULL;
+	mip->mi_pdata_size = 0;
+
+	/*
+	 * Free the list of margin requests.
+	 */
+	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
+		nextmmr = mmr->mmr_nextp;
+		kmem_free(mmr, sizeof (mac_margin_req_t));
+	}
+	mip->mi_mmrp = NULL;
+
+	mip->mi_linkstate = LINK_STATE_UNKNOWN;
+	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
+	mip->mi_info.mi_unicst_addr = NULL;
+
+	atomic_dec_32(&mip->mi_type->mt_ref);
+	mip->mi_type = NULL;
+
+	/*
+	 * Free the primary MAC address.
+	 */
+	mac_fini_macaddr(mip);
+
+	/*
+	 * free all rings
+	 */
+	mac_free_rings(mip, MAC_RING_TYPE_RX);
+	mac_free_rings(mip, MAC_RING_TYPE_TX);
+
+	mac_addr_factory_fini(mip);
+
+	bzero(mip->mi_addr, MAXMACADDRLEN);
+	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
+
+	/* and the flows */
+	mac_flow_tab_destroy(mip->mi_flow_tab);
+	mip->mi_flow_tab = NULL;
+
+	if (mip->mi_minor > MAC_MAX_MINOR)
+		mac_minor_rele(mip->mi_minor);
+
+	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);
+
+	/*
+	 * Reset the perim-related fields to default values before
+	 * kmem_cache_free
+	 */
+	i_mac_perim_exit(mip);
+	mip->mi_state_flags = 0;
+
+	mac_unregister_priv_prop(mip);
+	kmem_cache_free(i_mac_impl_cachep, mip);
+
+	return (0);
+}
+
+/* DATA RECEPTION */
+
+/*
+ * This function is invoked for packets received by the MAC driver in
+ * interrupt context. The ring generation number provided by the driver
+ * is matched with the ring generation number held in MAC. If they do not
+ * match, received packets are considered stale packets coming from an older
+ * assignment of the ring. Drop them.
+ */
+void
+mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
+    uint64_t mr_gen_num)
+{
+	mac_ring_t		*mr = (mac_ring_t *)mrh;
+
+	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
+		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
+		    mr->mr_gen_num, uint64_t, mr_gen_num);
+		freemsgchain(mp_chain);
+		return;
+	}
+	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
+}
+
+/*
+ * This function is invoked for each packet received by the underlying
+ * driver.
+ */
+void
+mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
+{
+	mac_impl_t *mip = (mac_impl_t *)mh;
+	mac_ring_t		*mr = (mac_ring_t *)mrh;
+	mac_soft_ring_set_t	*mac_srs;
+	mblk_t			*bp = mp_chain;
+	boolean_t		hw_classified = B_FALSE;
+
+	/*
+	 * If there are any promiscuous mode callbacks defined for
+	 * this MAC, pass them a copy if appropriate.
+	 */
+	if (mip->mi_promisc_list != NULL)
+		mac_promisc_dispatch(mip, mp_chain, NULL);
+
+	if (mr != NULL) {
+		/*
+		 * If the SRS teardown has started, just return. The 'mr'
+		 * continues to be valid until the driver unregisters the mac.
+		 * Hardware classified packets will not make their way up
+		 * beyond this point once the teardown has started. The driver
+		 * is never passed a pointer to a flow entry or SRS or any
+		 * structure that can be freed much before mac_unregister.
+		 */
+		mutex_enter(&mr->mr_lock);
+		if ((mr->mr_state != MR_INUSE) || (mr->mr_flag &
+		    (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) {
+			mutex_exit(&mr->mr_lock);
+			freemsgchain(mp_chain);
+			return;
+		}
+		if (mr->mr_classify_type == MAC_HW_CLASSIFIER) {
+			hw_classified = B_TRUE;
+			MR_REFHOLD_LOCKED(mr);
+		}
+		mutex_exit(&mr->mr_lock);
+
+		/*
+		 * We check if an SRS is controlling this ring.
+		 * If so, we can directly call the srs_lower_proc
+		 * routine; otherwise we need to go through mac_rx_classify
+		 * to reach the right place.
+		 */
+		if (hw_classified) {
+			mac_srs = mr->mr_srs;
+			/*
+			 * This is supposed to be the fast path.
+			 * All packets received through here were steered by
+			 * the hardware classifier, and share the same
+			 * MAC header info.
+			 */
+			mac_srs->srs_rx.sr_lower_proc(mh,
+			    (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE);
+			MR_REFRELE(mr);
+			return;
+		}
+		/* We'll fall through to software classification */
+	}
+
+	if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) {
+		if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL)
+			return;
+	}
+
+	freemsgchain(bp);
+}
+
+/* DATA TRANSMISSION */
+
+/*
+ * A driver's notification to resume transmission, in case of a provider
+ * without TX rings.
+ */
+void
+mac_tx_update(mac_handle_t mh)
+{
+	/*
+	 * Walk the list of MAC clients (mac_client_handle)
+	 * and update their TX SRSs.
+	 */
+	i_mac_tx_srs_notify((mac_impl_t *)mh, NULL);
+}
+
+/*
+ * A driver's notification to resume transmission on the specified TX ring.
+ */
+void
+mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh)
+{
+	i_mac_tx_srs_notify((mac_impl_t *)mh, rh);
+}
+
+/* LINK STATE */
+/*
+ * Notify the MAC layer about a link state change.
+ */
+void
+mac_link_update(mac_handle_t mh, link_state_t link)
+{
+	mac_impl_t	*mip = (mac_impl_t *)mh;
+
+	/*
+	 * Save the link state.
+	 */
+	mip->mi_linkstate = link;
+
+	/*
+	 * Send a MAC_NOTE_LINK notification.
+	 */
+	i_mac_notify(mip, MAC_NOTE_LINK);
+}
+
+/* OTHER CONTROL INFORMATION */
+
+/*
+ * A driver notified us that its primary MAC address has changed.
+ */
+void
+mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
+{
+	mac_impl_t	*mip = (mac_impl_t *)mh;
+
+	if (mip->mi_type->mt_addr_length == 0)
+		return;
+
+	i_mac_perim_enter(mip);
+	/*
+	 * If the address doesn't change, do nothing.
+	 */
+	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) == 0) {
+		i_mac_perim_exit(mip);
+		return;
+	}
+
+	/*
+	 * Freshen the MAC address value and update all MAC clients that
+	 * share this MAC address.
+	 */
+	mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
+	    (uint8_t *)addr);
+
+	i_mac_perim_exit(mip);
+
+	/*
+	 * Send a MAC_NOTE_UNICST notification.
+	 */
+	i_mac_notify(mip, MAC_NOTE_UNICST);
+}
+
+/*
+ * The provider's H/W resources (e.g. ring grouping) have changed.
+ * Notify the MAC framework to trigger a re-negotiation of the capabilities.
+ */
+void
+mac_resource_update(mac_handle_t mh)
+{
+	/*
+	 * Send a MAC_NOTE_RESOURCE notification.
+	 */
+	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_RESOURCE);
+}
+
+/*
+ * MAC plugin information changed.
+ */
+int
+mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
+{
+	mac_impl_t	*mip = (mac_impl_t *)mh;
+
+	/*
+	 * Verify that the plugin supports MAC plugin data and that the
+	 * supplied data is valid.
+	 */
+	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
+		return (EINVAL);
+	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
+		return (EINVAL);
+
+	if (mip->mi_pdata != NULL)
+		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
+
+	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
+	bcopy(mac_pdata, mip->mi_pdata, dsize);
+	mip->mi_pdata_size = dsize;
+
+	/*
+	 * Since the MAC plugin data is used to construct MAC headers that
+	 * were cached in fast-path headers, we need to flush fast-path
+	 * information for links associated with this mac.
+	 */
+	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
+	return (0);
+}
+
+/*
+ * Invoked by the driver as well as the framework to signal a capability
+ * change.
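+ *
+ * A driver would typically call this after an internal reconfiguration
+ * changes what it can advertise through its getcapab entry point.
+ * A minimal, hypothetical sketch (the xx_ names are made up):
+ *
+ *	static void
+ *	xx_toggle_lso(xx_softc_t *sc, boolean_t on)
+ *	{
+ *		sc->sc_lso_enabled = on;
+ *		mac_capab_update(sc->sc_mh);
+ *	}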
+ */ +void +mac_capab_update(mac_handle_t mh) +{ + /* Send MAC_NOTE_CAPAB_CHG notification */ + i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG); +} + +int +mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + if (sdu_max <= mip->mi_sdu_min) + return (EINVAL); + mip->mi_sdu_max = sdu_max; + + /* Send a MAC_NOTE_SDU_SIZE notification. */ + i_mac_notify(mip, MAC_NOTE_SDU_SIZE); + return (0); +} + +/* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */ + +/* + * Updates the mac_impl structure with the current state of the link + */ +static void +i_mac_log_link_state(mac_impl_t *mip) +{ + /* + * If no change, then it is not interesting. + */ + if (mip->mi_lastlinkstate == mip->mi_linkstate) + return; + + switch (mip->mi_linkstate) { + case LINK_STATE_UP: + if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) { + char det[200]; + + mip->mi_type->mt_ops.mtops_link_details(det, + sizeof (det), (mac_handle_t)mip, mip->mi_pdata); + + cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det); + } else { + cmn_err(CE_NOTE, "!%s link up", mip->mi_name); + } + break; + + case LINK_STATE_DOWN: + /* + * Only transitions from UP to DOWN are interesting + */ + if (mip->mi_lastlinkstate != LINK_STATE_UNKNOWN) + cmn_err(CE_NOTE, "!%s link down", mip->mi_name); + break; + + case LINK_STATE_UNKNOWN: + /* + * This case is normally not interesting. + */ + break; + } + mip->mi_lastlinkstate = mip->mi_linkstate; +} + +/* + * Main routine for the callbacks notifications thread + */ +static void +i_mac_notify_thread(void *arg) +{ + mac_impl_t *mip = arg; + callb_cpr_t cprinfo; + mac_cb_t *mcb; + mac_cb_info_t *mcbi; + mac_notify_cb_t *mncb; + + mcbi = &mip->mi_notify_cb_info; + CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr, + "i_mac_notify_thread"); + + mutex_enter(mcbi->mcbi_lockp); + + for (;;) { + uint32_t bits; + uint32_t type; + + bits = mip->mi_notify_bits; + if (bits == 0) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp); + CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp); + continue; + } + mip->mi_notify_bits = 0; + if ((bits & (1 << MAC_NNOTE)) != 0) { + /* request to quit */ + ASSERT(mip->mi_state_flags & MIS_DISABLED); + break; + } + + mutex_exit(mcbi->mcbi_lockp); + + /* + * Log link changes. + */ + if ((bits & (1 << MAC_NOTE_LINK)) != 0) + i_mac_log_link_state(mip); + + /* + * Do notification callbacks for each notification type. + */ + for (type = 0; type < MAC_NNOTE; type++) { + if ((bits & (1 << type)) == 0) { + continue; + } + + if (mac_notify_cb_list[type].mac_notify_cb_fn) + mac_notify_cb_list[type].mac_notify_cb_fn(mip); + + /* + * Walk the list of notifications. + */ + MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info); + for (mcb = mip->mi_notify_cb_list; mcb != NULL; + mcb = mcb->mcb_nextp) { + mncb = (mac_notify_cb_t *)mcb->mcb_objp; + mncb->mncb_fn(mncb->mncb_arg, type); + } + MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info, + &mip->mi_notify_cb_list); + } + + mutex_enter(mcbi->mcbi_lockp); + } + + mip->mi_state_flags |= MIS_NOTIFY_DONE; + cv_broadcast(&mcbi->mcbi_cv); + + /* CALLB_CPR_EXIT drops the lock */ + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} + +/* + * Signal the i_mac_notify_thread asking it to quit. + * Then wait till it is done. 
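+ * The handshake: set the MAC_NNOTE bit in mi_notify_bits (which the
+ * thread above interprets as a request to exit), broadcast on
+ * mcbi_cv, then block until the thread acknowledges by setting
+ * MIS_NOTIFY_DONE.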
+ */ +void +i_mac_notify_exit(mac_impl_t *mip) +{ + mac_cb_info_t *mcbi; + + mcbi = &mip->mi_notify_cb_info; + + mutex_enter(mcbi->mcbi_lockp); + mip->mi_notify_bits = (1 << MAC_NNOTE); + cv_broadcast(&mcbi->mcbi_cv); + + + while ((mip->mi_notify_thread != NULL) && + !(mip->mi_state_flags & MIS_NOTIFY_DONE)) { + cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp); + } + + /* Necessary clean up before doing kmem_cache_free */ + mip->mi_state_flags &= ~MIS_NOTIFY_DONE; + mip->mi_notify_bits = 0; + mip->mi_notify_thread = NULL; + mutex_exit(mcbi->mcbi_lockp); +} + +/* + * Entry point invoked by drivers to dynamically add a ring to an + * existing group. + */ +int +mac_group_add_ring(mac_group_handle_t gh, int index) +{ + mac_group_t *group = (mac_group_t *)gh; + mac_impl_t *mip = (mac_impl_t *)group->mrg_mh; + int ret; + + i_mac_perim_enter(mip); + + /* + * Only RX rings can be added or removed by drivers currently. + */ + ASSERT(group->mrg_type == MAC_RING_TYPE_RX); + + ret = i_mac_group_add_ring(group, NULL, index); + + i_mac_perim_exit(mip); + + return (ret); +} + +/* + * Entry point invoked by drivers to dynamically remove a ring + * from an existing group. The specified ring handle must no longer + * be used by the driver after a call to this function. + */ +void +mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh) +{ + mac_group_t *group = (mac_group_t *)gh; + mac_impl_t *mip = (mac_impl_t *)group->mrg_mh; + + i_mac_perim_enter(mip); + + /* + * Only RX rings can be added or removed by drivers currently. + */ + ASSERT(group->mrg_type == MAC_RING_TYPE_RX); + + i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE); + + i_mac_perim_exit(mip); +} diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c new file mode 100644 index 0000000000..290366f5d2 --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -0,0 +1,3819 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#include <sys/types.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ip_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
+#include <inet/ip6.h>
+
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/mac_flow_impl.h>
+
+static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
+    uintptr_t, uint16_t, mblk_t **);
+static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
+    uintptr_t, uint16_t, mblk_t **);
+static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
+    uintptr_t, uint16_t, mblk_t **);
+static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
+    uintptr_t, uint16_t, mblk_t **);
+
+typedef struct mac_tx_mode_s {
+	mac_tx_srs_mode_t	mac_tx_mode;
+	mac_tx_func_t		mac_tx_func;
+} mac_tx_mode_t;
+
+/*
+ * There are five modes of operation on the Tx side. These modes get set
+ * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
+ * none of the other modes are user configurable. They get selected by
+ * the system depending upon whether the link (or flow) has multiple Tx
+ * rings or a bandwidth configured, etc.
+ */
+mac_tx_mode_t mac_tx_mode_list[] = {
+	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
+	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
+	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
+	{SRS_TX_BW,		mac_tx_bw_mode},
+	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode}
+};
+
+/*
+ * Soft Ring Set (SRS) - the run-time code that deals with
+ * dynamic polling from the hardware, bandwidth enforcement,
+ * fanout etc.
+ *
+ * We try to use H/W classification on the NIC and assign traffic for
+ * a MAC address to a particular Rx ring or ring group. There is a
+ * 1-1 mapping between an SRS and an Rx ring. The SRS dynamically
+ * switches the underlying Rx ring between interrupt and
+ * polling mode and enforces any specified B/W control.
+ *
+ * There is always an SRS created and tied to each H/W and S/W rule.
+ * Whenever we create a H/W rule, we always add the same rule to the
+ * S/W classifier and tie an SRS to it.
+ *
+ * In case a B/W control is specified, it is broken into bytes
+ * per tick and as soon as the quota for a tick is exhausted,
+ * the underlying Rx ring is forced into poll mode for remainder of
+ * the tick. The SRS poll thread only polls for bytes that are
+ * allowed to come in the SRS. We typically let 4x the configured
+ * B/W worth of packets to come in the SRS (to prevent unnecessary
+ * drops due to bursts) but only process the specified amount.
+ *
+ * A MAC client (e.g. a VNIC or aggr) can have 1 or more
+ * Rx rings (and corresponding SRSs) assigned to it. The SRS
+ * in turn can have softrings to do protocol level fanout or
+ * softrings to do S/W based fanout or both. In case the NIC
+ * has no Rx rings, we do S/W classification to the respective SRS.
+ * The S/W classification rule is always setup and ready. This
+ * allows the MAC layer to reassign Rx rings whenever needed
+ * but packets still continue to flow via the default path and
+ * get S/W classified to the correct SRS.
+ *
+ * The SRS's are used on both Tx and Rx side. They use the same
+ * data structure but the processing routines have slightly different
+ * semantics due to the fact that Rx side needs to do dynamic
+ * polling etc.
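+ *
+ * As a worked example of the B/W control described above (numbers
+ * illustrative, not the exact internal computation): a 100 Mbps limit
+ * with hz = 100 yields a quota of roughly
+ * (100000000 / 8) / 100 = 125000 bytes per tick; once that many bytes
+ * have been processed within the current tick, the ring stays in poll
+ * mode and further pickup waits for the next tick.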
+ *
+ * Dynamic Polling Notes
+ * =====================
+ *
+ * Each Soft ring set is capable of switching its Rx ring between
+ * interrupt and poll mode and actively 'polls' for packets in
+ * poll mode. If the SRS is implementing a B/W limit, it makes
+ * sure that only the maximum allowed packets are pulled in poll mode
+ * and goes to poll mode as soon as the B/W limit is exceeded. As
+ * such, there is no added overhead to implementing B/W limits.
+ *
+ * In poll mode, it's better to keep the pipeline going where the
+ * SRS worker thread keeps processing packets and the poll thread
+ * keeps bringing more packets (especially if they get to run
+ * on different CPUs). This also prevents the overhead associated
+ * with excessive signalling (on NUMA machines, this can be
+ * pretty devastating). The exception is the latency-optimized case
+ * where the worker thread does no work and the interrupt and poll
+ * threads are allowed to do their own drain.
+ *
+ * We use the following policy to control Dynamic Polling:
+ * 1) We switch to poll mode anytime the processing
+ *    thread causes a backlog to build up in SRS and
+ *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
+ * 2) As long as the backlog stays under the low water
+ *    mark (sr_lowat), we poll the H/W for more packets.
+ * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
+ *    water mark, we stay in poll mode but don't poll
+ *    the H/W for more packets.
+ * 4) Anytime in polling mode, if we poll the H/W for
+ *    packets and find nothing plus we have an existing
+ *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
+ *    mode but don't poll the H/W for packets anymore
+ *    (let the polling thread go to sleep).
+ * 5) Once the backlog is relieved (packets are processed)
+ *    we reenable polling (by signalling the poll thread)
+ *    only when the backlog dips below sr_poll_thres.
+ * 6) sr_hiwat is used exclusively when we are not
+ *    polling capable and is used to decide when to
+ *    drop packets so the SRS queue length doesn't grow
+ *    infinitely.
+ *
+ * NOTE: Also see the block level comment on top of mac_soft_ring.c
+ */
+
+/*
+ * mac_latency_optimize
+ *
+ * Controls whether the poll thread can process the packets inline
+ * or let the SRS worker thread do the processing. This applies if
+ * the SRS was not being processed. For latency sensitive traffic,
+ * this needs to be true to allow inline processing. For throughput
+ * under load, this should be false.
+ *
+ * This (and other similar) tunable should be rolled into a link
+ * or flow specific workload hint that can be set using dladm
+ * linkprop (instead of multiple such tunables).
+ */
+boolean_t mac_latency_optimize = B_TRUE;
+
+/*
+ * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
+ *
+ * Queue an mp or chain in the soft ring set and increment the
+ * local count (srs_count) for the SRS and the shared counter
+ * (srs_poll_pkt_cnt - shared between SRS and its soft rings
+ * to track the total unprocessed packets for polling to work
+ * correctly).
+ *
+ * The size (total bytes queued) counters are incremented only
+ * if we are doing B/W control.
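+ *
+ * As a concrete illustration of the counters involved: enqueueing a
+ * 3-packet, 4500-byte chain bumps srs_count (and, on Rx,
+ * sr_poll_pkt_cnt) by 3, while srs_size and mac_bw_sz grow by 4500
+ * only when SRST_BW_CONTROL is set on the SRS.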
+ */ +#define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + if ((mac_srs)->srs_last != NULL) \ + (mac_srs)->srs_last->b_next = (head); \ + else \ + (mac_srs)->srs_first = (head); \ + (mac_srs)->srs_last = (tail); \ + (mac_srs)->srs_count += count; \ +} + +#define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ + mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ + \ + MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ + srs_rx->sr_poll_pkt_cnt += count; \ + ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \ + if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ + (mac_srs)->srs_size += (sz); \ + mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \ + (mac_srs)->srs_bw->mac_bw_sz += (sz); \ + mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \ + } \ +} + +#define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ + mac_srs->srs_state |= SRS_ENQUEUED; \ + MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ + if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ + (mac_srs)->srs_size += (sz); \ + (mac_srs)->srs_bw->mac_bw_sz += (sz); \ + } \ +} + +/* + * Turn polling on routines + */ +#define MAC_SRS_POLLING_ON(mac_srs) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + if (((mac_srs)->srs_state & \ + (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \ + (mac_srs)->srs_state |= SRS_POLLING; \ + (void) mac_hwring_disable_intr((mac_ring_handle_t) \ + (mac_srs)->srs_ring); \ + (mac_srs)->srs_rx.sr_poll_on++; \ + } \ +} + +#define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + if (((mac_srs)->srs_state & \ + (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \ + (SRS_POLLING_CAPAB|SRS_WORKER)) { \ + (mac_srs)->srs_state |= SRS_POLLING; \ + (void) mac_hwring_disable_intr((mac_ring_handle_t) \ + (mac_srs)->srs_ring); \ + (mac_srs)->srs_rx.sr_worker_poll_on++; \ + } \ +} + +/* + * MAC_SRS_POLL_RING + * + * Signal the SRS poll thread to poll the underlying H/W ring + * provided it wasn't already polling (SRS_GET_PKTS was set). + * + * Poll thread gets to run only from mac_rx_srs_drain() and only + * if the drain was being done by the worker thread. + */ +#define MAC_SRS_POLL_RING(mac_srs) { \ + mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ + \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + srs_rx->sr_poll_thr_sig++; \ + if (((mac_srs)->srs_state & \ + (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \ + (SRS_WORKER|SRS_POLLING_CAPAB)) { \ + (mac_srs)->srs_state |= SRS_GET_PKTS; \ + cv_signal(&(mac_srs)->srs_cv); \ + } else { \ + srs_rx->sr_poll_thr_busy++; \ + } \ +} + +/* + * MAC_SRS_CHECK_BW_CONTROL + * + * Check to see if next tick has started so we can reset the + * SRS_BW_ENFORCED flag and allow more packets to come in the + * system. + */ +#define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + ASSERT(((mac_srs)->srs_type & SRST_TX) || \ + MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \ + if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) { \ + (mac_srs)->srs_bw->mac_bw_curr_time = lbolt; \ + (mac_srs)->srs_bw->mac_bw_used = 0; \ + if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \ + (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \ + } \ +} + +/* + * MAC_SRS_WORKER_WAKEUP + * + * Wake up the SRS worker thread to process the queue as long as + * no one else is processing the queue. If we are optimizing for + * latency, we wake up the worker thread immediately or else we + * wait mac_srs_worker_wakeup_ticks before worker thread gets + * woken up. 
+ */ +int mac_srs_worker_wakeup_ticks = 0; +#define MAC_SRS_WORKER_WAKEUP(mac_srs) { \ + ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ + if (!((mac_srs)->srs_state & SRS_PROC) && \ + (mac_srs)->srs_tid == NULL) { \ + if (mac_latency_optimize || \ + (mac_srs_worker_wakeup_ticks == 0)) \ + cv_signal(&(mac_srs)->srs_async); \ + else \ + (mac_srs)->srs_tid = \ + timeout(mac_srs_fire, (mac_srs), \ + mac_srs_worker_wakeup_ticks); \ + } \ +} + +#define TX_SINGLE_RING_MODE(mac_srs) \ + ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \ + (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \ + (mac_srs)->srs_tx.st_mode == SRS_TX_BW) + +#define TX_BANDWIDTH_MODE(mac_srs) \ + ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \ + (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT) + +#define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \ + uint_t hash, indx; \ + hash = HASH_HINT(hint); \ + indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \ + softring = mac_srs->srs_oth_soft_rings[indx]; \ + (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \ +} + +/* + * MAC_TX_SRS_BLOCK + * + * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED + * will be set only if srs_tx_woken_up is FALSE. If + * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived + * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to + * attempt to transmit again and not setting SRS_TX_BLOCKED does + * that. + */ +#define MAC_TX_SRS_BLOCK(srs, mp) { \ + ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \ + if ((srs)->srs_tx.st_woken_up) { \ + (srs)->srs_tx.st_woken_up = B_FALSE; \ + } else { \ + ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \ + (srs)->srs_state |= SRS_TX_BLOCKED; \ + (srs)->srs_tx.st_blocked_cnt++; \ + } \ +} + +/* + * MAC_TX_SRS_TEST_HIWAT + * + * Called before queueing a packet onto Tx SRS to test and set + * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat. + */ +#define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \ + boolean_t enqueue = 1; \ + \ + if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \ + /* \ + * flow-controlled. Store srs in cookie so that it \ + * can be returned as mac_tx_cookie_t to client \ + */ \ + (srs)->srs_state |= SRS_TX_HIWAT; \ + cookie = (mac_tx_cookie_t)srs; \ + (srs)->srs_tx.st_hiwat_cnt++; \ + if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \ + /* increment freed stats */ \ + (srs)->srs_tx.st_drop_count += cnt; \ + /* \ + * b_prev may be set to the fanout hint \ + * hence can't use freemsg directly \ + */ \ + mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ + DTRACE_PROBE1(tx_queued_hiwat, \ + mac_soft_ring_set_t *, srs); \ + enqueue = 0; \ + } \ + } \ + if (enqueue) \ + MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \ +} + +/* Some utility macros */ +#define MAC_SRS_BW_LOCK(srs) \ + if (!(srs->srs_type & SRST_TX)) \ + mutex_enter(&srs->srs_bw->mac_bw_lock); + +#define MAC_SRS_BW_UNLOCK(srs) \ + if (!(srs->srs_type & SRST_TX)) \ + mutex_exit(&srs->srs_bw->mac_bw_lock); + +#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ + mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ + /* increment freed stats */ \ + mac_srs->srs_tx.st_drop_count++; \ + cookie = (mac_tx_cookie_t)srs; \ +} + +#define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ + mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \ + cookie = (mac_tx_cookie_t)srs; \ + *ret_mp = mp_chain; \ +} + +/* + * Drop the rx packet and advance to the next one in the chain. 
+ */
+static void
+mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
+{
+    mac_srs_rx_t *srs_rx = &srs->srs_rx;
+
+    ASSERT(mp->b_next == NULL);
+    mutex_enter(&srs->srs_lock);
+    MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
+    MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
+    mutex_exit(&srs->srs_lock);
+
+    srs_rx->sr_drop_count++;
+    freemsg(mp);
+}
+
+/* DATAPATH RUNTIME ROUTINES */
+
+/*
+ * mac_srs_fire
+ *
+ * Timer callback routine for waking up the SRS worker thread.
+ */
+static void
+mac_srs_fire(void *arg)
+{
+    mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
+
+    mutex_enter(&mac_srs->srs_lock);
+    if (mac_srs->srs_tid == 0) {
+        mutex_exit(&mac_srs->srs_lock);
+        return;
+    }
+
+    mac_srs->srs_tid = 0;
+    if (!(mac_srs->srs_state & SRS_PROC))
+        cv_signal(&mac_srs->srs_async);
+
+    mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * 'hint' is the fanout_hint (of type uint64_t) that is given by the
+ * TCP/IP stack, and it is used on the Tx path.
+ */
+#define HASH_HINT(hint) (((hint) << 17) | ((hint) >> 16))
+
+/*
+ * Hash based on the src address and the port information.
+ */
+#define HASH_ADDR(src, ports) \
+    (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
+    ((ports) >> 8) ^ (ports))
+
+#define COMPUTE_INDEX(key, sz) (key % sz)
+
+#define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
+    if ((tail) != NULL) { \
+        ASSERT((tail)->b_next == NULL); \
+        (tail)->b_next = (mp); \
+    } else { \
+        ASSERT((head) == NULL); \
+        (head) = (mp); \
+    } \
+    (tail) = (mp); \
+    (cnt)++; \
+    if ((bw_ctl)) \
+        (sz) += (sz0); \
+}
+
+#define MAC_FANOUT_DEFAULT 0
+#define MAC_FANOUT_RND_ROBIN 1
+int mac_fanout_type = MAC_FANOUT_DEFAULT;
+
+#define MAX_SR_TYPES 3
+/* fanout types for port based hashing */
+enum pkt_type {
+    V4_TCP = 0,
+    V4_UDP,
+    OTH,
+    UNDEF
+};
+
+/*
+ * In general we do port based hashing to spread traffic over different
+ * softrings. The tunable below allows that behavior to be overridden:
+ * setting it to B_TRUE makes the fanout use the source IPv6 address
+ * instead. Source based fanout is also applied to IPv6 packets carrying
+ * multiple optional headers and to other uncommon packet types.
+ */
+boolean_t mac_src_ipv6_fanout = B_FALSE;
+
+/*
+ * Pair of local and remote ports in the transport header
+ */
+#define PORTS_SIZE 4
+
+/*
+ * mac_rx_srs_proto_fanout
+ *
+ * This routine delivers packets destined to an SRS into one of the
+ * protocol soft rings.
+ *
+ * Given a chain of packets we need to split it up into multiple sub
+ * chains, destined for the TCP, UDP or OTH soft ring. Instead of
+ * entering the soft ring one packet at a time, we want to enter it in
+ * the form of a chain; otherwise we get a start/stop behaviour where
+ * the worker thread goes to sleep and then the next packet comes in,
+ * forcing it to wake up, etc.
+ */
+static void
+mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
+{
+    struct ether_header *ehp;
+    uint16_t etype;
+    ipha_t *ipha;
+    mac_soft_ring_t *softring;
+    size_t ether_hlen;
+    mblk_t *mp;
+    mblk_t *headmp[MAX_SR_TYPES];
+    mblk_t *tailmp[MAX_SR_TYPES];
+    int cnt[MAX_SR_TYPES];
+    size_t sz[MAX_SR_TYPES];
+    size_t sz1;
+    boolean_t bw_ctl = B_FALSE;
+    boolean_t hw_classified;
+    boolean_t dls_bypass = B_TRUE;
+    enum pkt_type type;
+    mac_client_impl_t *mcip = mac_srs->srs_mcip;
+    struct ether_vlan_header *evhp;
+
+    if (mac_srs->srs_type & SRST_BW_CONTROL)
+        bw_ctl = B_TRUE;
+
+    /*
+     * If we don't have an Rx ring, S/W classification would have done
+     * its job and it's a packet meant for us. If we were polling on
+     * the default ring (i.e.
there was a ring assigned to this SRS), + * then we need to make sure that the mac address really belongs + * to us. + */ + hw_classified = mac_srs->srs_ring != NULL && + mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; + + /* + * Special clients (eg. VLAN, non ether, etc) need DLS + * processing in the Rx path. SRST_DLS_BYPASS will be clear for + * such SRSs. + */ + if (!(mac_srs->srs_type & SRST_DLS_BYPASS)) + dls_bypass = B_FALSE; + + bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); + bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); + bzero(cnt, MAX_SR_TYPES * sizeof (int)); + bzero(sz, MAX_SR_TYPES * sizeof (size_t)); + + /* + * We got a chain from SRS that we need to send to the soft rings. + * Since squeues for TCP & IPv4 sap poll their soft rings (for + * performance reasons), we need to separate out v4_tcp, v4_udp + * and the rest goes in other. + */ + while (head != NULL) { + mp = head; + head = head->b_next; + mp->b_next = NULL; + + type = OTH; + sz1 = msgdsize(mp); + + if (!dls_bypass) { + mac_impl_t *mip = mcip->mci_mip; + + ehp = (struct ether_header *)mp->b_rptr; + + /* + * For VLAN packets, if the VLAN id doesn't belong + * to this client, we drop the packet. + */ + if (mip->mi_info.mi_nativemedia == DL_ETHER && + ntohs(ehp->ether_type) == VLAN_TPID) { + /* + * LINTED: cast may result in improper + * alignment + */ + evhp = (struct ether_vlan_header *)ehp; + if (!mac_client_check_flow_vid(mcip, + VLAN_ID(ntohs(evhp->ether_tci)))) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } + FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], + cnt[type], bw_ctl, sz[type], sz1, mp); + continue; + } + + /* + * At this point we can be sure the packet at least + * has an ether header. + */ + if (sz1 < sizeof (struct ether_header)) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + /* LINTED: cast may result in improper alignment */ + ehp = (struct ether_header *)mp->b_rptr; + + /* + * Determine if this is a VLAN or non-VLAN packet. + */ + if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) { + /* LINTED: cast may result in improper alignment */ + evhp = (struct ether_vlan_header *)mp->b_rptr; + etype = ntohs(evhp->ether_type); + ether_hlen = sizeof (struct ether_vlan_header); + /* + * Check if the VID of the packet, if any, belongs + * to this client. + */ + if (!mac_client_check_flow_vid(mcip, + VLAN_ID(ntohs(evhp->ether_tci)))) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } else { + ether_hlen = sizeof (struct ether_header); + } + + if (etype == ETHERTYPE_IP) { + /* + * If we are H/W classified, but we have promisc + * on, then we need to check for the unicast address. + */ + if (hw_classified && mcip->mci_promisc_list != NULL) { + mac_address_t *map; + + rw_enter(&mcip->mci_rw_lock, RW_READER); + map = mcip->mci_unicast; + if (bcmp(&ehp->ether_dhost, map->ma_addr, + map->ma_len) == 0) + type = UNDEF; + rw_exit(&mcip->mci_rw_lock); + } else if (((((uint8_t *)&ehp->ether_dhost)[0] & + 0x01) == 0)) { + type = UNDEF; + } + } + + /* + * This needs to become a contract with the driver for + * the fast path. + * + * In the normal case the packet will have at least the L2 + * header and the IP + Transport header in the same mblk. + * This is usually the case when the NIC driver sends up + * the packet. This is also true when the stack generates + * a packet that is looped back and when the stack uses the + * fastpath mechanism. The normal case is optimized for + * performance and may bypass DLS. All other cases go through + * the 'OTH' type path without DLS bypass. 
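+ *
+ * A sketch of the contiguity test that contract implies (the helper
+ * name is hypothetical and assumes an options-free IPv4 header; the
+ * real check used below is MBLK_RX_FANOUT_SLOWPATH):
+ *
+ *	static boolean_t
+ *	rx_fastpath_ok(mblk_t *mp, size_t l2len)
+ *	{
+ *		// The L2 header, the IP header and the first 4
+ *		// bytes of the transport header (the ports) must
+ *		// all sit in the first mblk.
+ *		return (MBLKL(mp) >= l2len + sizeof (ipha_t) +
+ *		    PORTS_SIZE);
+ *	}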
+ */ + + /* LINTED: cast may result in improper alignment */ + ipha = (ipha_t *)(mp->b_rptr + ether_hlen); + if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) + type = OTH; + + if (type == OTH) { + FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], + cnt[type], bw_ctl, sz[type], sz1, mp); + continue; + } + + ASSERT(type == UNDEF); + /* + * We look for at least 4 bytes past the IP header to get + * the port information. If we get an IP fragment, we don't + * have the port information, and we use just the protocol + * information. + */ + switch (ipha->ipha_protocol) { + case IPPROTO_TCP: + type = V4_TCP; + mp->b_rptr += ether_hlen; + break; + case IPPROTO_UDP: + type = V4_UDP; + mp->b_rptr += ether_hlen; + break; + default: + type = OTH; + break; + } + + ASSERT(type != UNDEF); + + FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], + bw_ctl, sz[type], sz1, mp); + } + + for (type = V4_TCP; type < UNDEF; type++) { + if (headmp[type] != NULL) { + ASSERT(tailmp[type]->b_next == NULL); + switch (type) { + case V4_TCP: + softring = mac_srs->srs_tcp_soft_rings[0]; + break; + case V4_UDP: + softring = mac_srs->srs_udp_soft_rings[0]; + break; + case OTH: + softring = mac_srs->srs_oth_soft_rings[0]; + } + mac_rx_soft_ring_process(mac_srs->srs_mcip, softring, + headmp[type], tailmp[type], cnt[type], sz[type]); + } + } +} + +int fanout_unalligned = 0; + +/* + * mac_rx_srs_long_fanout + * + * The fanout routine for IPv6 + */ +static int +mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, + uint16_t etype, enum pkt_type *type, uint_t *indx) +{ + ip6_t *ip6h; + uint8_t *whereptr; + uint_t hash; + uint16_t remlen; + uint8_t nexthdr; + uint16_t hdr_len; + + if (etype == ETHERTYPE_IPV6) { + boolean_t modifiable = B_TRUE; + + ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); + + ip6h = (ip6_t *)(mp->b_rptr + sizeof (struct ether_header)); + if ((unsigned char *)ip6h == mp->b_wptr) { + /* + * The first mblk_t only includes the ethernet header. + * Note that it is safe to change the mp pointer here, + * as the subsequent operation does not assume mp + * points to the start of the ethernet header. + */ + mp = mp->b_cont; + + /* + * Make sure ip6h holds the full ip6_t structure. + */ + if (mp == NULL) + return (-1); + + if (MBLKL(mp) < IPV6_HDR_LEN) { + modifiable = (DB_REF(mp) == 1); + + if (modifiable && + !pullupmsg(mp, IPV6_HDR_LEN)) { + return (-1); + } + } + + ip6h = (ip6_t *)mp->b_rptr; + } + + if (!modifiable || !(OK_32PTR((char *)ip6h)) || + ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { + /* + * If either ip6h is not alligned, or ip6h does not + * hold the complete ip6_t structure (a pullupmsg() + * is not an option since it would result in an + * unalligned ip6h), fanout to the default ring. Note + * that this may cause packets reordering. + */ + *indx = 0; + *type = OTH; + fanout_unalligned++; + return (0); + } + + remlen = ntohs(ip6h->ip6_plen); + nexthdr = ip6h->ip6_nxt; + + if (remlen < MIN_EHDR_LEN) + return (-1); + /* + * Do src based fanout if below tunable is set to B_TRUE or + * when mac_ip_hdr_length_v6() fails because of malformed + * packets or because mblk's need to be concatenated using + * pullupmsg(). 
+ */ + if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h, + &hdr_len, &nexthdr)) { + goto src_based_fanout; + } + whereptr = (uint8_t *)ip6h + hdr_len; + + /* If the transport is one of below, we do port based fanout */ + switch (nexthdr) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + /* + * If the ports in the transport header is not part of + * the mblk, do src_based_fanout, instead of calling + * pullupmsg(). + */ + if (mp->b_cont != NULL && + whereptr + PORTS_SIZE > mp->b_wptr) { + goto src_based_fanout; + } + break; + default: + break; + } + + switch (nexthdr) { + case IPPROTO_TCP: + hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), + *(uint32_t *)whereptr); + *indx = COMPUTE_INDEX(hash, + mac_srs->srs_tcp_ring_count); + *type = OTH; + break; + + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + if (mac_fanout_type == MAC_FANOUT_DEFAULT) { + hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), + *(uint32_t *)whereptr); + *indx = COMPUTE_INDEX(hash, + mac_srs->srs_udp_ring_count); + } else { + *indx = mac_srs->srs_ind % + mac_srs->srs_udp_ring_count; + mac_srs->srs_ind++; + } + *type = OTH; + break; + + /* For all other protocol, do source based fanout */ + default: + goto src_based_fanout; + } + } else { + *indx = 0; + *type = OTH; + } + return (0); + +src_based_fanout: + hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); + *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); + *type = OTH; + return (0); +} + +/* + * mac_rx_srs_fanout + * + * This routine delivers packets destined to an SRS into a soft ring member + * of the set. + * + * Given a chain of packets we need to split it up into multiple sub chains + * destined for one of the TCP, UDP or OTH soft rings. Instead of entering + * the soft ring one packet at a time, we want to enter it in the form of a + * chain otherwise we get this start/stop behaviour where the worker thread + * goes to sleep and then next packets comes in forcing it to wake up etc. + * + * Note: + * Since we know what is the maximum fanout possible, we create a 2D array + * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz + * variables so that we can enter the softrings with chain. We need the + * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc + * for each packet would be expensive). If we ever want to have the + * ability to have unlimited fanout, we should probably declare a head, + * tail, cnt, sz with each soft ring (a data struct which contains a softring + * along with these members) and create an array of this uber struct so we + * don't have to do kmem_alloc. 
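+ *
+ * To make the hashing step concrete, this is how a packet picks its
+ * soft ring (a sketch; the helper name and values are hypothetical,
+ * while HASH_ADDR and COMPUTE_INDEX are the macros defined above):
+ *
+ *	static uint_t
+ *	fanout_index(uint32_t src, uint32_t ports, uint_t nrings)
+ *	{
+ *		uint_t hash = HASH_ADDR(src, ports);
+ *
+ *		return (COMPUTE_INDEX(hash, nrings));
+ *	}
+ *
+ * e.g. with src 192.168.1.5, ports 0xC3500050 (50000/80) and 8 rings,
+ * every packet of that flow maps to the same index, which keeps a
+ * flow ordered while spreading distinct flows across the softrings.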
+ */ +int fanout_oth1 = 0; +int fanout_oth2 = 0; +int fanout_oth3 = 0; +int fanout_oth4 = 0; +int fanout_oth5 = 0; + +static void +mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) +{ + struct ether_header *ehp; + uint16_t etype; + ipha_t *ipha; + uint_t indx; + int ports_offset = -1; + int ipha_len; + uint_t hash; + mac_soft_ring_t *softring; + size_t ether_hlen; + uint16_t frag_offset_flags; + mblk_t *mp; + mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT]; + mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT]; + int cnt[MAX_SR_TYPES][MAX_SR_FANOUT]; + size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT]; + size_t sz1; + boolean_t bw_ctl = B_FALSE; + boolean_t hw_classified; + boolean_t dls_bypass = B_TRUE; + int i; + int fanout_cnt; + enum pkt_type type; + mac_client_impl_t *mcip = mac_srs->srs_mcip; + struct ether_vlan_header *evhp; + + if (mac_srs->srs_type & SRST_BW_CONTROL) + bw_ctl = B_TRUE; + + /* + * If we don't have a Rx ring, S/W classification would have done + * its job and its a packet meant for us. If we were polling on + * the default ring (i.e. there was a ring assigned to this SRS), + * then we need to make sure that the mac address really belongs + * to us. + */ + hw_classified = mac_srs->srs_ring != NULL && + mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; + + /* + * Special clients (eg. VLAN, non ether, etc) need DLS + * processing in the Rx path. SRST_DLS_BYPASS will be clear for + * such SRSs. + */ + if (!(mac_srs->srs_type & SRST_DLS_BYPASS)) + dls_bypass = B_FALSE; + + /* + * Since the softrings are never destroyed and we always + * create equal number of softrings for TCP, UDP and rest, + * its OK to check one of them for count and use it without + * any lock. In future, if soft rings get destroyed because + * of reduction in fanout, we will need to ensure that happens + * behind the SRS_PROC. + */ + fanout_cnt = mac_srs->srs_tcp_ring_count; + + bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); + bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); + bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int)); + bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t)); + + /* + * We got a chain from SRS that we need to send to the soft rings. + * Since squeues for TCP & IPv4 sap poll their soft rings (for + * performance reasons), we need to separate out v4_tcp, v4_udp + * and the rest goes in other. + */ + while (head != NULL) { + mp = head; + head = head->b_next; + mp->b_next = NULL; + + type = OTH; + sz1 = msgdsize(mp); + + if (!dls_bypass) { + mac_impl_t *mip = mcip->mci_mip; + + indx = 0; + if (mip->mi_info.mi_nativemedia == DL_ETHER) { + ehp = (struct ether_header *)mp->b_rptr; + etype = ntohs(ehp->ether_type); + /* + * For VLAN packets, if the VLAN id doesn't + * belong to this client, we drop the packet. + */ + if (etype == VLAN_TPID) { + /* + * LINTED: cast may result in improper + * alignment + */ + evhp = (struct ether_vlan_header *) + mp->b_rptr; + if (!mac_client_check_flow_vid(mcip, + VLAN_ID(ntohs(evhp->ether_tci)))) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } + if (mac_rx_srs_long_fanout(mac_srs, mp, etype, + &type, &indx) == -1) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } + + FANOUT_ENQUEUE_MP(headmp[type][indx], + tailmp[type][indx], cnt[type][indx], bw_ctl, + sz[type][indx], sz1, mp); + continue; + } + + /* + * At this point we can be sure the packet at least + * has an ether header. On the outbound side, GLD/stack + * ensure this. On the inbound side, the driver needs + * to ensure this. 
+ */ + if (sz1 < sizeof (struct ether_header)) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + /* LINTED: cast may result in improper alignment */ + ehp = (struct ether_header *)mp->b_rptr; + + /* + * Determine if this is a VLAN or non-VLAN packet. + */ + if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) { + /* LINTED: cast may result in improper alignment */ + evhp = (struct ether_vlan_header *)mp->b_rptr; + etype = ntohs(evhp->ether_type); + ether_hlen = sizeof (struct ether_vlan_header); + /* + * Check if the VID of the packet, if any, belongs + * to this client. + */ + if (!mac_client_check_flow_vid(mcip, + VLAN_ID(ntohs(evhp->ether_tci)))) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + } else { + ether_hlen = sizeof (struct ether_header); + } + + + /* + * If we are using the default Rx ring where H/W or S/W + * classification has not happened, we need to verify if + * this unicast packet really belongs to us. + */ + if (etype == ETHERTYPE_IP) { + /* + * If we are H/W classified, but we have promisc + * on, then we need to check for the unicast address. + */ + if (hw_classified && mcip->mci_promisc_list != NULL) { + mac_address_t *map; + + rw_enter(&mcip->mci_rw_lock, RW_READER); + map = mcip->mci_unicast; + if (bcmp(&ehp->ether_dhost, map->ma_addr, + map->ma_len) == 0) + type = UNDEF; + rw_exit(&mcip->mci_rw_lock); + } else if (((((uint8_t *)&ehp->ether_dhost)[0] & + 0x01) == 0)) { + type = UNDEF; + } + } + + /* + * This needs to become a contract with the driver for + * the fast path. + */ + + /* LINTED: cast may result in improper alignment */ + ipha = (ipha_t *)(mp->b_rptr + ether_hlen); + if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) { + type = OTH; + fanout_oth1++; + } + + if (type != OTH) { + switch (ipha->ipha_protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + ipha_len = IPH_HDR_LENGTH(ipha); + if ((uchar_t *)ipha + ipha_len + PORTS_SIZE > + mp->b_wptr) { + type = OTH; + break; + } + frag_offset_flags = + ntohs(ipha->ipha_fragment_offset_and_flags); + if ((frag_offset_flags & + (IPH_MF | IPH_OFFSET)) != 0) { + type = OTH; + fanout_oth3++; + break; + } + ports_offset = ether_hlen + ipha_len; + break; + default: + type = OTH; + fanout_oth4++; + break; + } + } + + if (type == OTH) { + if (mac_rx_srs_long_fanout(mac_srs, mp, etype, + &type, &indx) == -1) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + + FANOUT_ENQUEUE_MP(headmp[type][indx], + tailmp[type][indx], cnt[type][indx], bw_ctl, + sz[type][indx], sz1, mp); + continue; + } + + ASSERT(type == UNDEF); + + /* + * XXX-Sunay: We should hold srs_lock since ring_count + * below can change. But if we are always called from + * mac_rx_srs_drain and SRS_PROC is set, then we can + * enforce that ring_count can't be changed i.e. + * to change fanout type or ring count, the calling + * thread needs to be behind SRS_PROC. + */ + switch (ipha->ipha_protocol) { + case IPPROTO_TCP: + /* + * Note that for ESP, we fanout on SPI and it is at the + * same offset as the 2x16-bit ports. So it is clumped + * along with TCP, UDP and SCTP. 
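+ *
+ * For example (hypothetical values), an ESP packet whose SPI
+ * is 0x12345678 hashes exactly like a TCP packet whose two
+ * 16-bit ports pack into the same 32 bits, since both are
+ * read as the 4 bytes at ports_offset:
+ *
+ *	hash = HASH_ADDR(ipha->ipha_src,
+ *	    *(uint32_t *)(mp->b_rptr + ports_offset));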
+ */ + hash = HASH_ADDR(ipha->ipha_src, + *(uint32_t *)(mp->b_rptr + ports_offset)); + indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); + type = V4_TCP; + mp->b_rptr += ether_hlen; + break; + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_ESP: + if (mac_fanout_type == MAC_FANOUT_DEFAULT) { + hash = HASH_ADDR(ipha->ipha_src, + *(uint32_t *)(mp->b_rptr + ports_offset)); + indx = COMPUTE_INDEX(hash, + mac_srs->srs_udp_ring_count); + } else { + indx = mac_srs->srs_ind % + mac_srs->srs_udp_ring_count; + mac_srs->srs_ind++; + } + type = V4_UDP; + mp->b_rptr += ether_hlen; + break; + } + + ASSERT(type != UNDEF); + + FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], + cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp); + } + + for (type = V4_TCP; type < UNDEF; type++) { + for (i = 0; i < fanout_cnt; i++) { + if (headmp[type][i] != NULL) { + ASSERT(tailmp[type][i]->b_next == NULL); + switch (type) { + case V4_TCP: + softring = + mac_srs->srs_tcp_soft_rings[i]; + break; + case V4_UDP: + softring = + mac_srs->srs_udp_soft_rings[i]; + break; + case OTH: + softring = + mac_srs->srs_oth_soft_rings[i]; + break; + } + mac_rx_soft_ring_process(mac_srs->srs_mcip, + softring, headmp[type][i], tailmp[type][i], + cnt[type][i], sz[type][i]); + } + } + } +} + +#define SRS_BYTES_TO_PICKUP 150000 +ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP; + +/* + * mac_rx_srs_poll_ring + * + * This SRS Poll thread uses this routine to poll the underlying hardware + * Rx ring to get a chain of packets. It can inline process that chain + * if mac_latency_optimize is set (default) or signal the SRS worker thread + * to do the remaining processing. + * + * Since packets come in the system via interrupt or poll path, we also + * update the stats and deal with promiscous clients here. + */ +void +mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs) +{ + kmutex_t *lock = &mac_srs->srs_lock; + kcondvar_t *async = &mac_srs->srs_cv; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + mblk_t *head, *tail, *mp; + callb_cpr_t cprinfo; + ssize_t bytes_to_pickup; + size_t sz; + int count; + mac_client_impl_t *smcip; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll"); + mutex_enter(lock); + +start: + for (;;) { + if (mac_srs->srs_state & SRS_PAUSE) + goto done; + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(async, lock); + CALLB_CPR_SAFE_END(&cprinfo, lock); + + if (mac_srs->srs_state & SRS_PAUSE) + goto done; + +check_again: + if (mac_srs->srs_type & SRST_BW_CONTROL) { + /* + * We pick as many bytes as we are allowed to queue. + * Its possible that we will exceed the total + * packets queued in case this SRS is part of the + * Rx ring group since > 1 poll thread can be pulling + * upto the max allowed packets at the same time + * but that should be OK. + */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + bytes_to_pickup = + mac_srs->srs_bw->mac_bw_drop_threshold - + mac_srs->srs_bw->mac_bw_sz; + /* + * We shouldn't have been signalled if we + * have 0 or less bytes to pick but since + * some of the bytes accounting is driver + * dependant, we do the safety check. + */ + if (bytes_to_pickup < 0) + bytes_to_pickup = 0; + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + } else { + /* + * ToDO: Need to change the polling API + * to add a packet count and a flag which + * tells the driver whether we want packets + * based on a count, or bytes, or all the + * packets queued in the driver/HW. This + * way, we never have to check the limits + * on poll path. 
We truly let only as many + * packets enter the system as we are willing + * to process or queue. + * + * Something along the lines of + * pkts_to_pickup = mac_soft_ring_max_q_cnt - + * mac_srs->srs_poll_pkt_cnt + */ + + /* + * Since we are not doing B/W control, pick + * as many packets as allowed. + */ + bytes_to_pickup = max_bytes_to_pickup; + } + + /* Poll the underlying Hardware */ + mutex_exit(lock); + head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup); + mutex_enter(lock); + + ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) == + SRS_POLL_THR_OWNER); + + mp = tail = head; + count = 0; + sz = 0; + while (mp != NULL) { + tail = mp; + sz += msgdsize(mp); + mp = mp->b_next; + count++; + } + + if (head != NULL) { + tail->b_next = NULL; + smcip = mac_srs->srs_mcip; + + if ((mac_srs->srs_type & SRST_FLOW) || + (smcip == NULL)) { + FLOW_STAT_UPDATE(mac_srs->srs_flent, + rbytes, sz); + FLOW_STAT_UPDATE(mac_srs->srs_flent, + ipackets, count); + } + + /* + * If there are any promiscuous mode callbacks + * defined for this MAC client, pass them a copy + * if appropriate and also update the counters. + */ + if (smcip != NULL) { + smcip->mci_stat_ibytes += sz; + smcip->mci_stat_ipackets += count; + + if (smcip->mci_mip->mi_promisc_list != NULL) { + mutex_exit(lock); + mac_promisc_dispatch(smcip->mci_mip, + head, NULL); + mutex_enter(lock); + } + } + if (mac_srs->srs_type & SRST_BW_CONTROL) { + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + mac_srs->srs_bw->mac_bw_polled += sz; + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + } + srs_rx->sr_poll_count += count; + MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, + count, sz); + if (count <= 10) + srs_rx->sr_chain_cnt_undr10++; + else if (count > 10 && count <= 50) + srs_rx->sr_chain_cnt_10to50++; + else + srs_rx->sr_chain_cnt_over50++; + } + + /* + * We are guaranteed that SRS_PROC will be set if we + * are here. Also, poll thread gets to run only if + * the drain was being done by a worker thread although + * its possible that worker thread is still running + * and poll thread was sent down to keep the pipeline + * going instead of doing a complete drain and then + * trying to poll the NIC. + * + * So we need to check SRS_WORKER flag to make sure + * that the worker thread is not processing the queue + * in parallel to us. The flags and conditions are + * protected by the srs_lock to prevent any race. We + * ensure that we don't drop the srs_lock from now + * till the end and similarly we don't drop the srs_lock + * in mac_rx_srs_drain() till similar condition check + * are complete. The mac_rx_srs_drain() needs to ensure + * that SRS_WORKER flag remains set as long as its + * processing the queue. + */ + if (!(mac_srs->srs_state & SRS_WORKER) && + (mac_srs->srs_first != NULL)) { + /* + * We have packets to process and worker thread + * is not running. Check to see if poll thread is + * allowed to process. Let it do processing only if it + * picked up some packets from the NIC otherwise + * wakeup the worker thread. + */ + if ((mac_srs->srs_state & SRS_LATENCY_OPT) && + (head != NULL)) { + mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); + if (srs_rx->sr_poll_pkt_cnt <= + srs_rx->sr_lowat) { + srs_rx->sr_poll_again++; + goto check_again; + } else { + /* + * We are already above low water mark + * so stay in the polling mode but no + * need to poll. 
Once we dip below + * the polling threshold, the processing + * thread (soft ring) will signal us + * to poll again (MAC_UPDATE_SRS_COUNT) + */ + srs_rx->sr_poll_drain_no_poll++; + mac_srs->srs_state &= + ~(SRS_PROC|SRS_GET_PKTS); + /* + * In B/W control case, its possible + * that the backlog built up due to + * B/W limit being reached and packets + * are queued only in SRS. In this case, + * we should schedule worker thread + * since no one else will wake us up. + */ + if ((mac_srs->srs_type & + SRST_BW_CONTROL) && + (mac_srs->srs_tid == NULL)) { + mac_srs->srs_tid = + timeout(mac_srs_fire, + mac_srs, 1); + srs_rx->sr_poll_worker_wakeup++; + } + } + } else { + /* + * Wakeup the worker thread for more processing. + * We optimize for throughput in this case. + */ + mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); + MAC_SRS_WORKER_WAKEUP(mac_srs); + srs_rx->sr_poll_sig_worker++; + } + } else if ((mac_srs->srs_first == NULL) && + !(mac_srs->srs_state & SRS_WORKER)) { + /* + * There is nothing queued in SRS and + * no worker thread running. Plus we + * didn't get anything from the H/W + * as well (head == NULL); + */ + ASSERT(head == NULL); + mac_srs->srs_state &= + ~(SRS_PROC|SRS_GET_PKTS); + + /* + * If we have a packets in soft ring, don't allow + * more packets to come into this SRS by keeping the + * interrupts off but not polling the H/W. The + * poll thread will get signaled as soon as + * srs_poll_pkt_cnt dips below poll threshold. + */ + if (srs_rx->sr_poll_pkt_cnt == 0) { + srs_rx->sr_poll_intr_enable++; + MAC_SRS_POLLING_OFF(mac_srs); + } else { + /* + * We know nothing is queued in SRS + * since we are here after checking + * srs_first is NULL. The backlog + * is entirely due to packets queued + * in Soft ring which will wake us up + * and get the interface out of polling + * mode once the backlog dips below + * sr_poll_thres. + */ + srs_rx->sr_poll_no_poll++; + } + } else { + /* + * Worker thread is already running. + * Nothing much to do. If the polling + * was enabled, worker thread will deal + * with that. + */ + mac_srs->srs_state &= ~SRS_GET_PKTS; + srs_rx->sr_poll_goto_sleep++; + } + } +done: + mac_srs->srs_state |= SRS_POLL_THR_QUIESCED; + cv_signal(&mac_srs->srs_async); + /* + * If this is a temporary quiesce then wait for the restart signal + * from the srs worker. Then clear the flags and signal the srs worker + * to ensure a positive handshake and go back to start. + */ + while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART))) + cv_wait(async, lock); + if (mac_srs->srs_state & SRS_POLL_THR_RESTART) { + ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); + mac_srs->srs_state &= + ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART); + cv_signal(&mac_srs->srs_async); + goto start; + } else { + mac_srs->srs_state |= SRS_POLL_THR_EXITED; + cv_signal(&mac_srs->srs_async); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); + } +} + +/* + * mac_srs_pick_chain + * + * In Bandwidth control case, checks how many packets can be processed + * and return them in a sub chain. 
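+ *
+ * The budgeted walk below can be summarized by this sketch
+ * (hypothetical names; the real code also maintains srs_count,
+ * srs_size and the SRS_BW_ENFORCED state under the bw lock):
+ *
+ *	static mblk_t *
+ *	pick_within_budget(mblk_t **qp, size_t budget, size_t *picked)
+ *	{
+ *		mblk_t *head = *qp, *mp = *qp, *prev = NULL;
+ *
+ *		*picked = 0;
+ *		while (mp != NULL && *picked + msgdsize(mp) <= budget) {
+ *			*picked += msgdsize(mp);
+ *			prev = mp;
+ *			mp = mp->b_next;
+ *		}
+ *		if (prev == NULL)
+ *			return (NULL);	// not even one packet fits
+ *		prev->b_next = NULL;	// split the chain
+ *		*qp = mp;		// remainder stays queued
+ *		return (head);
+ *	}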
+ */ +static mblk_t * +mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail, + size_t *chain_sz, int *chain_cnt) +{ + mblk_t *head = NULL; + mblk_t *tail = NULL; + size_t sz; + size_t tsz = 0; + int cnt = 0; + mblk_t *mp; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <= + mac_srs->srs_bw->mac_bw_limit) || + (mac_srs->srs_bw->mac_bw_limit == 0)) { + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + head = mac_srs->srs_first; + mac_srs->srs_first = NULL; + *chain_tail = mac_srs->srs_last; + mac_srs->srs_last = NULL; + *chain_sz = mac_srs->srs_size; + *chain_cnt = mac_srs->srs_count; + mac_srs->srs_count = 0; + mac_srs->srs_size = 0; + return (head); + } + + /* + * Can't clear the entire backlog. + * Need to find how many packets to pick + */ + ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock)); + while ((mp = mac_srs->srs_first) != NULL) { + sz = msgdsize(mp); + if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) > + mac_srs->srs_bw->mac_bw_limit) { + if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) + mac_srs->srs_bw->mac_bw_state |= + SRS_BW_ENFORCED; + break; + } + + /* + * The _size & cnt is decremented from the softrings + * when they send up the packet for polling to work + * properly. + */ + tsz += sz; + cnt++; + mac_srs->srs_count--; + mac_srs->srs_size -= sz; + if (tail != NULL) + tail->b_next = mp; + else + head = mp; + tail = mp; + mac_srs->srs_first = mac_srs->srs_first->b_next; + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + if (mac_srs->srs_first == NULL) + mac_srs->srs_last = NULL; + + if (tail != NULL) + tail->b_next = NULL; + *chain_tail = tail; + *chain_cnt = cnt; + *chain_sz = tsz; + + return (head); +} + +/* + * mac_rx_srs_drain + * + * The SRS drain routine. Gets to run to clear the queue. Any thread + * (worker, interrupt, poll) can call this based on processing model. + * The first thing we do is disable interrupts if possible and then + * drain the queue. we also try to poll the underlying hardware if + * there is a dedicated hardware Rx ring assigned to this SRS. + * + * There is a equivalent drain routine in bandwidth control mode + * mac_rx_srs_drain_bw. There is some code duplication between the two + * routines but they are highly performance sensitive and are easier + * to read/debug if they stay separate. Any code changes here might + * also apply to mac_rx_srs_drain_bw as well. + */ +void +mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) +{ + mblk_t *head; + mblk_t *tail; + timeout_id_t tid; + int cnt = 0; + mac_client_impl_t *mcip = mac_srs->srs_mcip; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL)); +again: + /* If we are blanked i.e. 
can't do upcalls, then we are done */ + if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { + ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || + (mac_srs->srs_state & SRS_PAUSE)); + goto out; + } + + if (mac_srs->srs_first == NULL) + goto out; + + head = mac_srs->srs_first; + mac_srs->srs_first = NULL; + tail = mac_srs->srs_last; + mac_srs->srs_last = NULL; + cnt = mac_srs->srs_count; + mac_srs->srs_count = 0; + + ASSERT(head != NULL); + ASSERT(tail != NULL); + + if ((tid = mac_srs->srs_tid) != 0) + mac_srs->srs_tid = 0; + + mac_srs->srs_state |= (SRS_PROC|proc_type); + + /* Switch to polling mode */ + MAC_SRS_WORKER_POLLING_ON(mac_srs); + if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) + MAC_SRS_POLL_RING(mac_srs); + /* + * mcip is NULL for broadcast and multicast flows. The promisc + * callbacks for broadcast and multicast packets are delivered from + * mac_rx() and we don't need to worry about that case in this path + */ + if (mcip != NULL && mcip->mci_promisc_list != NULL) { + mutex_exit(&mac_srs->srs_lock); + mac_promisc_client_dispatch(mcip, head); + mutex_enter(&mac_srs->srs_lock); + } + + /* + * Check if SRS itself is doing the processing + * This direct path does not apply when subflows are present. In this + * case, packets need to be dispatched to a soft ring according to the + * flow's bandwidth and other resources contraints. + */ + if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { + mac_direct_rx_t proc; + void *arg1; + mac_resource_handle_t arg2; + + /* + * This is the case when a Rx is directly + * assigned and we have a fully classified + * protocol chain. We can deal with it in + * one shot. + */ + proc = srs_rx->sr_func; + arg1 = srs_rx->sr_arg1; + arg2 = srs_rx->sr_arg2; + + mac_srs->srs_state |= SRS_CLIENT_PROC; + mutex_exit(&mac_srs->srs_lock); + if (tid != 0) { + (void) untimeout(tid); + tid = 0; + } + + proc(arg1, arg2, head, NULL); + /* + * Decrement the size and count here itelf + * since the packet has been processed. + */ + mutex_enter(&mac_srs->srs_lock); + MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); + if (mac_srs->srs_state & SRS_CLIENT_WAIT) + cv_signal(&mac_srs->srs_client_cv); + mac_srs->srs_state &= ~SRS_CLIENT_PROC; + } else { + /* Some kind of softrings based fanout is required */ + mutex_exit(&mac_srs->srs_lock); + if (tid != 0) { + (void) untimeout(tid); + tid = 0; + } + + /* + * Since the fanout routines can deal with chains, + * shoot the entire chain up. + */ + if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) + mac_rx_srs_fanout(mac_srs, head); + else + mac_rx_srs_proto_fanout(mac_srs, head); + mutex_enter(&mac_srs->srs_lock); + } + + /* + * Send the poll thread to pick up any packets arrived + * so far. This also serves as the last check in case + * nothing else is queued in the SRS. The poll thread + * is signalled only in the case the drain was done + * by the worker thread and SRS_WORKER is set. The + * worker thread can run in parallel as long as the + * SRS_WORKER flag is set. We we have nothing else to + * process, we can exit while leaving SRS_PROC set + * which gives the poll thread control to process and + * cleanup once it returns from the NIC. + * + * If we have nothing else to process, we need to + * ensure that we keep holding the srs_lock till + * all the checks below are done and control is + * handed to the poll thread if it was running. 
+ */
+    if (mac_srs->srs_first != NULL) {
+        if (proc_type == SRS_WORKER) {
+            if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
+                MAC_SRS_POLL_RING(mac_srs);
+            srs_rx->sr_drain_again++;
+            goto again;
+        } else {
+            srs_rx->sr_drain_worker_sig++;
+            cv_signal(&mac_srs->srs_async);
+        }
+    }
+
+out:
+
+    if (mac_srs->srs_state & SRS_GET_PKTS) {
+        /*
+         * Poll thread is already running. Leave the
+         * SRS_PROC set and hand over the control to
+         * the poll thread.
+         */
+        mac_srs->srs_state &= ~proc_type;
+        srs_rx->sr_drain_poll_running++;
+        return;
+    }
+
+    /*
+     * Even if there are no packets queued in the SRS, we
+     * need to make sure that the shared counter is
+     * clear and any associated softrings have cleared
+     * all the backlog. Otherwise, leave the interface
+     * in polling mode and the poll thread will get
+     * signalled once the count goes down to zero.
+     *
+     * If someone is already draining the queue (SRS_PROC is
+     * set) when the srs_poll_pkt_cnt goes down to zero,
+     * then it means that drain is already running and we
+     * will turn off polling at that time if there is
+     * no backlog.
+     *
+     * As long as there are packets queued either
+     * in the soft ring set or its soft rings, we will leave
+     * the interface in polling mode (even if the drain
+     * was done by the interrupt thread). We signal
+     * the poll thread as well if we have dipped below
+     * the low water mark.
+     *
+     * NOTE: We can't use the MAC_SRS_POLLING_ON macro
+     * since that turns polling on only for the worker thread.
+     * It's not worth turning polling on for the interrupt
+     * thread (since the NIC will not issue another interrupt)
+     * unless a backlog builds up.
+     */
+    if ((srs_rx->sr_poll_pkt_cnt > 0) &&
+        (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
+        mac_srs->srs_state &= ~(SRS_PROC|proc_type);
+        srs_rx->sr_drain_keep_polling++;
+        MAC_SRS_POLLING_ON(mac_srs);
+        if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
+            MAC_SRS_POLL_RING(mac_srs);
+        return;
+    }
+
+    /* Nothing else to do. Get out of poll mode */
+    MAC_SRS_POLLING_OFF(mac_srs);
+    mac_srs->srs_state &= ~(SRS_PROC|proc_type);
+    srs_rx->sr_drain_finish_intr++;
+}
+
+/*
+ * mac_rx_srs_drain_bw
+ *
+ * The SRS BW drain routine. Gets to run to clear the queue. Any thread
+ * (worker, interrupt, poll) can call this based on processing model.
+ * The first thing we do is disable interrupts if possible and then
+ * drain the queue. We also try to poll the underlying hardware if
+ * there is a dedicated hardware Rx ring assigned to this SRS.
+ *
+ * There is an equivalent drain routine in non bandwidth control mode,
+ * mac_rx_srs_drain. There is some code duplication between the two
+ * routines but they are highly performance sensitive and are easier
+ * to read/debug if they stay separate. Any code changes here might
+ * also apply to mac_rx_srs_drain as well.
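+ *
+ * The per-tick gate at the top of this routine can be summarized
+ * with the following sketch (hypothetical helper; lbolt is the
+ * system tick counter and the fields mirror mac_bw_ctl_t):
+ *
+ *	static boolean_t
+ *	bw_gate_open(mac_bw_ctl_t *bw, clock_t now)
+ *	{
+ *		if (bw->mac_bw_curr_time != now) {
+ *			// New tick: reset the budget and lift any
+ *			// enforcement left from the previous tick.
+ *			bw->mac_bw_curr_time = now;
+ *			bw->mac_bw_used = 0;
+ *			bw->mac_bw_state &= ~SRS_BW_ENFORCED;
+ *		}
+ *		if (bw->mac_bw_used > bw->mac_bw_limit)
+ *			bw->mac_bw_state |= SRS_BW_ENFORCED;
+ *		return (!(bw->mac_bw_state & SRS_BW_ENFORCED));
+ *	}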
+ */ +void +mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type) +{ + mblk_t *head; + mblk_t *tail; + timeout_id_t tid; + size_t sz = 0; + int cnt = 0; + mac_client_impl_t *mcip = mac_srs->srs_mcip; + mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; + + ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); + ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); +again: + /* Check if we are doing B/W control */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { + mac_srs->srs_bw->mac_bw_curr_time = lbolt; + mac_srs->srs_bw->mac_bw_used = 0; + if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) + mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; + } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) { + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + goto done; + } else if (mac_srs->srs_bw->mac_bw_used > + mac_srs->srs_bw->mac_bw_limit) { + mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + goto done; + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + + /* If we are blanked i.e. can't do upcalls, then we are done */ + if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { + ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || + (mac_srs->srs_state & SRS_PAUSE)); + goto done; + } + + sz = 0; + cnt = 0; + if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) { + /* + * We couldn't pick up a single packet. + */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if ((mac_srs->srs_bw->mac_bw_used == 0) && + (mac_srs->srs_size != 0) && + !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { + /* + * Seems like configured B/W doesn't + * even allow processing of 1 packet + * per tick. + * + * XXX: raise the limit to processing + * at least 1 packet per tick. + */ + mac_srs->srs_bw->mac_bw_limit += + mac_srs->srs_bw->mac_bw_limit; + mac_srs->srs_bw->mac_bw_drop_threshold += + mac_srs->srs_bw->mac_bw_drop_threshold; + cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " + "raised B/W limit to %d since not even a " + "single packet can be processed per " + "tick %d\n", (void *)mac_srs, + (int)mac_srs->srs_bw->mac_bw_limit, + (int)msgdsize(mac_srs->srs_first)); + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + goto done; + } + + ASSERT(head != NULL); + ASSERT(tail != NULL); + + /* zero bandwidth: drop all and return to interrupt mode */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if (mac_srs->srs_bw->mac_bw_limit == 0) { + srs_rx->sr_drop_count += cnt; + ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); + mac_srs->srs_bw->mac_bw_sz -= sz; + mac_srs->srs_bw->mac_bw_drop_bytes += sz; + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + mac_pkt_drop(NULL, NULL, head, B_FALSE); + goto leave_poll; + } else { + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + } + + /* + * We can continue processing the queue. + * We need to figure out if there is a fanout needed or + * we can just process this here. + */ + + if ((tid = mac_srs->srs_tid) != 0) + mac_srs->srs_tid = 0; + + mac_srs->srs_state |= (SRS_PROC|proc_type); + MAC_SRS_WORKER_POLLING_ON(mac_srs); + + /* + * mcip is NULL for broadcast and multicast flows. 
The promisc + * callbacks for broadcast and multicast packets are delivered from + * mac_rx() and we don't need to worry about that case in this path + */ + if (mcip != NULL && mcip->mci_promisc_list != NULL) { + mutex_exit(&mac_srs->srs_lock); + mac_promisc_client_dispatch(mcip, head); + mutex_enter(&mac_srs->srs_lock); + } + + /* + * Check if SRS itself is doing the processing + * This direct path does not apply when subflows are present. In this + * case, packets need to be dispatched to a soft ring according to the + * flow's bandwidth and other resources contraints. + */ + if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { + mac_direct_rx_t proc; + void *arg1; + mac_resource_handle_t arg2; + + /* + * This is the case when a Rx is directly + * assigned and we have a fully classified + * protocol chain. We can deal with it in + * one shot. + */ + proc = srs_rx->sr_func; + arg1 = srs_rx->sr_arg1; + arg2 = srs_rx->sr_arg2; + + mac_srs->srs_state |= SRS_CLIENT_PROC; + mutex_exit(&mac_srs->srs_lock); + if (tid != 0) { + (void) untimeout(tid); + tid = 0; + } + + proc(arg1, arg2, head, NULL); + /* + * Decrement the size and count here itelf + * since the packet has been processed. + */ + mutex_enter(&mac_srs->srs_lock); + MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); + MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); + + if (mac_srs->srs_state & SRS_CLIENT_WAIT) + cv_signal(&mac_srs->srs_client_cv); + mac_srs->srs_state &= ~SRS_CLIENT_PROC; + } else { + /* Some kind of softrings based fanout is required */ + mutex_exit(&mac_srs->srs_lock); + if (tid != 0) { + (void) untimeout(tid); + tid = 0; + } + + /* + * Since the fanout routines can deal with chains, + * shoot the entire chain up. + */ + if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) + mac_rx_srs_fanout(mac_srs, head); + else + mac_rx_srs_proto_fanout(mac_srs, head); + mutex_enter(&mac_srs->srs_lock); + } + + /* + * Send the poll thread to pick up any packets arrived + * so far. This also serves as the last check in case + * nothing else is queued in the SRS. The poll thread + * is signalled only in the case the drain was done + * by the worker thread and SRS_WORKER is set. The + * worker thread can run in parallel as long as the + * SRS_WORKER flag is set. We we have nothing else to + * process, we can exit while leaving SRS_PROC set + * which gives the poll thread control to process and + * cleanup once it returns from the NIC. + * + * If we have nothing else to process, we need to + * ensure that we keep holding the srs_lock till + * all the checks below are done and control is + * handed to the poll thread if it was running. + */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { + if (mac_srs->srs_first != NULL) { + if (proc_type == SRS_WORKER) { + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + if (srs_rx->sr_poll_pkt_cnt <= + srs_rx->sr_lowat) + MAC_SRS_POLL_RING(mac_srs); + goto again; + } else { + cv_signal(&mac_srs->srs_async); + } + } + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + +done: + + if (mac_srs->srs_state & SRS_GET_PKTS) { + /* + * Poll thread is already running. Leave the + * SRS_RPOC set and hand over the control to + * poll thread. + */ + mac_srs->srs_state &= ~proc_type; + return; + } + + /* + * If we can't process packets because we have exceeded + * B/W limit for this tick, just set the timeout + * and leave. 
+ * + * Even if there are no packets queued in SRS, we + * need to make sure that the shared counter is + * clear and any associated softrings have cleared + * all the backlog. Otherwise, leave the interface + * in polling mode and the poll thread will get + * signalled once the count goes down to zero. + * + * If someone is already draining the queue (SRS_PROC is + * set) when the srs_poll_pkt_cnt goes down to zero, + * then it means that drain is already running and we + * will turn off polling at that time if there is + * no backlog. As long as there are packets queued either + * is soft ring set or its soft rings, we will leave + * the interface in polling mode. + */ + mutex_enter(&mac_srs->srs_bw->mac_bw_lock); + if ((mac_srs->srs_state & SRS_POLLING_CAPAB) && + ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) || + (srs_rx->sr_poll_pkt_cnt > 0))) { + MAC_SRS_POLLING_ON(mac_srs); + mac_srs->srs_state &= ~(SRS_PROC|proc_type); + if ((mac_srs->srs_first != NULL) && + (mac_srs->srs_tid == NULL)) + mac_srs->srs_tid = timeout(mac_srs_fire, + mac_srs, 1); + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + return; + } + mutex_exit(&mac_srs->srs_bw->mac_bw_lock); + +leave_poll: + + /* Nothing else to do. Get out of poll mode */ + MAC_SRS_POLLING_OFF(mac_srs); + mac_srs->srs_state &= ~(SRS_PROC|proc_type); +} + +/* + * mac_srs_worker + * + * The SRS worker routine. Drains the queue when no one else is + * processing it. + */ +void +mac_srs_worker(mac_soft_ring_set_t *mac_srs) +{ + kmutex_t *lock = &mac_srs->srs_lock; + kcondvar_t *async = &mac_srs->srs_async; + callb_cpr_t cprinfo; + boolean_t bw_ctl_flag; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker"); + mutex_enter(lock); + +start: + for (;;) { + bw_ctl_flag = B_FALSE; + if (mac_srs->srs_type & SRST_BW_CONTROL) { + MAC_SRS_BW_LOCK(mac_srs); + MAC_SRS_CHECK_BW_CONTROL(mac_srs); + if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) + bw_ctl_flag = B_TRUE; + MAC_SRS_BW_UNLOCK(mac_srs); + } + /* + * The SRS_BW_ENFORCED flag may change since we have dropped + * the mac_bw_lock. However the drain function can handle both + * a drainable SRS or a bandwidth controlled SRS, and the + * effect of scheduling a timeout is to wakeup the worker + * thread which in turn will call the drain function. Since + * we release the srs_lock atomically only in the cv_wait there + * isn't a fear of waiting for ever. + */ + while (((mac_srs->srs_state & SRS_PROC) || + (mac_srs->srs_first == NULL) || bw_ctl_flag || + (mac_srs->srs_state & SRS_TX_BLOCKED)) && + !(mac_srs->srs_state & SRS_PAUSE)) { + /* + * If we have packets queued and we are here + * because B/W control is in place, we better + * schedule the worker wakeup after 1 tick + * to see if bandwidth control can be relaxed. + */ + if (bw_ctl_flag && mac_srs->srs_tid == NULL) { + /* + * We need to ensure that a timer is already + * scheduled or we force schedule one for + * later so that we can continue processing + * after this quanta is over. 
+ */ + mac_srs->srs_tid = timeout(mac_srs_fire, + mac_srs, 1); + } +wait: + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(async, lock); + CALLB_CPR_SAFE_END(&cprinfo, lock); + + if (mac_srs->srs_state & SRS_PAUSE) + goto done; + if (mac_srs->srs_state & SRS_PROC) + goto wait; + + if (mac_srs->srs_first != NULL && + mac_srs->srs_type & SRST_BW_CONTROL) { + MAC_SRS_BW_LOCK(mac_srs); + if (mac_srs->srs_bw->mac_bw_state & + SRS_BW_ENFORCED) { + MAC_SRS_CHECK_BW_CONTROL(mac_srs); + } + bw_ctl_flag = mac_srs->srs_bw->mac_bw_state & + SRS_BW_ENFORCED; + MAC_SRS_BW_UNLOCK(mac_srs); + } + } + + if (mac_srs->srs_state & SRS_PAUSE) + goto done; + mac_srs->srs_drain_func(mac_srs, SRS_WORKER); + } +done: + /* + * The Rx SRS quiesce logic first cuts off packet supply to the SRS + * from both hard and soft classifications and waits for such threads + * to finish before signaling the worker. So at this point the only + * thread left that could be competing with the worker is the poll + * thread. In the case of Tx, there shouldn't be any thread holding + * SRS_PROC at this point. + */ + if (!(mac_srs->srs_state & SRS_PROC)) { + mac_srs->srs_state |= SRS_PROC; + } else { + ASSERT((mac_srs->srs_type & SRST_TX) == 0); + /* + * Poll thread still owns the SRS and is still running + */ + ASSERT((mac_srs->srs_poll_thr == NULL) || + ((mac_srs->srs_state & SRS_POLL_THR_OWNER) == + SRS_POLL_THR_OWNER)); + } + mac_srs_worker_quiesce(mac_srs); + /* + * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator + * of the quiesce operation + */ + while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART))) + cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock); + + if (mac_srs->srs_state & SRS_RESTART) { + ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); + mac_srs_worker_restart(mac_srs); + mac_srs->srs_state &= ~SRS_PROC; + goto start; + } + + if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE)) + mac_srs_worker_quiesce(mac_srs); + + mac_srs->srs_state &= ~SRS_PROC; + /* The macro drops the srs_lock */ + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} + +/* + * mac_rx_srs_subflow_process + * + * Receive side routine called from interrupt path when there are + * sub flows present on this SRS. + */ +/* ARGSUSED */ +void +mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, + mblk_t *mp_chain, boolean_t loopback) +{ + flow_entry_t *flent = NULL; + flow_entry_t *prev_flent = NULL; + mblk_t *mp = NULL; + mblk_t *tail = NULL; + mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs; + mac_client_impl_t *mcip; + + mcip = mac_srs->srs_mcip; + ASSERT(mcip != NULL); + + /* + * We need to determine the SRS for every packet + * by walking the flow table, if we don't get any, + * then we proceed using the SRS we came with. + */ + mp = tail = mp_chain; + while (mp != NULL) { + + /* + * We will increment the stats for the mactching subflow. + * when we get the bytes/pkt count for the classified packets + * later in mac_rx_srs_process. + */ + (void) mac_flow_lookup(mcip->mci_subflow_tab, mp, + FLOW_INBOUND, &flent); + + if (mp == mp_chain || flent == prev_flent) { + if (prev_flent != NULL) + FLOW_REFRELE(prev_flent); + prev_flent = flent; + flent = NULL; + tail = mp; + mp = mp->b_next; + continue; + } + tail->b_next = NULL; + /* + * A null indicates, this is for the mac_srs itself. + * XXX-venu : probably assert for fe_rx_srs_cnt == 0. 
+ */
+ if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
+ mac_rx_srs_process(arg,
+ (mac_resource_handle_t)mac_srs, mp_chain,
+ loopback);
+ } else {
+ (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
+ prev_flent->fe_cb_arg2, mp_chain, loopback);
+ FLOW_REFRELE(prev_flent);
+ }
+ prev_flent = flent;
+ flent = NULL;
+ mp_chain = mp;
+ tail = mp;
+ mp = mp->b_next;
+ }
+ /* Last chain */
+ ASSERT(mp_chain != NULL);
+ if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
+ mac_rx_srs_process(arg,
+ (mac_resource_handle_t)mac_srs, mp_chain, loopback);
+ } else {
+ (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
+ prev_flent->fe_cb_arg2, mp_chain, loopback);
+ FLOW_REFRELE(prev_flent);
+ }
+}
+
+/*
+ * mac_rx_srs_process
+ *
+ * Receive side routine called from the interrupt path.
+ *
+ * loopback is set to force a context switch on the loopback
+ * path between MAC clients.
+ */
+/* ARGSUSED */
+void
+mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
+ boolean_t loopback)
+{
+ mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)srs;
+ mblk_t *mp, *tail, *head;
+ int count = 0;
+ int count1;
+ size_t sz = 0;
+ size_t chain_sz, sz1;
+ mac_bw_ctl_t *mac_bw;
+ mac_client_impl_t *smcip;
+ mac_srs_rx_t *srs_rx = &mac_srs->srs_rx;
+
+ /*
+ * Set the tail, count and sz. We set the sz irrespective
+ * of whether we are doing B/W control or not for the
+ * purpose of updating the stats.
+ */
+ mp = tail = mp_chain;
+ while (mp != NULL) {
+ tail = mp;
+ count++;
+ sz += msgdsize(mp);
+ mp = mp->b_next;
+ }
+
+ mutex_enter(&mac_srs->srs_lock);
+ smcip = mac_srs->srs_mcip;
+
+ if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
+ FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
+ FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
+ }
+ if (smcip != NULL) {
+ smcip->mci_stat_ibytes += sz;
+ smcip->mci_stat_ipackets += count;
+ }
+
+ /*
+ * If the SRS is already being processed; has been blanked;
+ * can be processed by worker thread only; or the B/W limit
+ * has been reached, then queue the chain and check if
+ * the worker thread needs to be awakened.
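+ *
+ * For the bandwidth controlled case below, the
+ * outline is (a sketch, not the exact code):
+ *
+ *	bw limit == 0			drop the whole chain
+ *	queued + sz <= drop threshold	enqueue the whole chain
+ *	otherwise			enqueue what fits under
+ *					the threshold, drop the rest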
+ */
+ if (mac_srs->srs_type & SRST_BW_CONTROL) {
+ mac_bw = mac_srs->srs_bw;
+ ASSERT(mac_bw != NULL);
+ mutex_enter(&mac_bw->mac_bw_lock);
+ /* Count the packets and bytes via interrupt */
+ srs_rx->sr_intr_count += count;
+ mac_bw->mac_bw_intr += sz;
+ if (mac_bw->mac_bw_limit == 0) {
+ /* zero bandwidth: drop all */
+ srs_rx->sr_drop_count += count;
+ mac_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ mutex_exit(&mac_srs->srs_lock);
+ mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ return;
+ } else {
+ if ((mac_bw->mac_bw_sz + sz) <=
+ mac_bw->mac_bw_drop_threshold) {
+ mutex_exit(&mac_bw->mac_bw_lock);
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
+ tail, count, sz);
+ } else {
+ mp = mp_chain;
+ chain_sz = 0;
+ count1 = 0;
+ tail = NULL;
+ head = NULL;
+ while (mp != NULL) {
+ sz1 = msgdsize(mp);
+ if (mac_bw->mac_bw_sz + chain_sz + sz1 >
+ mac_bw->mac_bw_drop_threshold)
+ break;
+ chain_sz += sz1;
+ count1++;
+ tail = mp;
+ mp = mp->b_next;
+ }
+ mutex_exit(&mac_bw->mac_bw_lock);
+ if (tail != NULL) {
+ head = tail->b_next;
+ tail->b_next = NULL;
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, count1, chain_sz);
+ sz -= chain_sz;
+ count -= count1;
+ } else {
+ /* Can't pick up any */
+ head = mp_chain;
+ }
+ if (head != NULL) {
+ /* Drop any packet over the threshold */
+ srs_rx->sr_drop_count += count;
+ mutex_enter(&mac_bw->mac_bw_lock);
+ mac_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ freemsgchain(head);
+ }
+ }
+ MAC_SRS_WORKER_WAKEUP(mac_srs);
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+ }
+
+ /*
+ * If the total number of packets queued in the SRS and
+ * its associated soft rings exceeds the max allowed,
+ * then drop the chain. If we are polling capable, this
+ * shouldn't be happening.
+ */
+ if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
+ (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
+ mac_bw = mac_srs->srs_bw;
+ srs_rx->sr_drop_count += count;
+ mutex_enter(&mac_bw->mac_bw_lock);
+ mac_bw->mac_bw_drop_bytes += sz;
+ mutex_exit(&mac_bw->mac_bw_lock);
+ freemsgchain(mp_chain);
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+
+ MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
+ /* Count the packets entering via interrupt path */
+ srs_rx->sr_intr_count += count;
+
+ if (!(mac_srs->srs_state & SRS_PROC)) {
+ /*
+ * If we are coming via loopback or if we are not
+ * optimizing for latency, we should signal the
+ * worker thread.
+ */
+ if (loopback || ((count > 1) &&
+ !(mac_srs->srs_state & SRS_LATENCY_OPT))) {
+ /*
+ * For loopback, we need to let the worker take
+ * over as we don't want to continue in the same
+ * thread even if we can. This could lead to stack
+ * overflows and may also end up using
+ * resources (cpu) incorrectly.
+ */
+ cv_signal(&mac_srs->srs_async);
+ } else {
+ /*
+ * Seems like no one is processing the SRS and
+ * there is no backlog. We also inline process
+ * our packet if it's a single packet in the non
+ * latency optimized case (in the latency optimized
+ * case, we inline process chains of any size).
+ */
+ mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
+ }
+ }
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+/* TX SIDE ROUTINES (RUNTIME) */
+
+/*
+ * mac_tx_srs_no_desc
+ *
+ * This routine is called in Tx single ring default mode
+ * when the Tx ring runs out of descs.
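+ *
+ * The caller's flag picks the policy (in outline):
+ *
+ *	MAC_DROP_ON_NO_DESC	drop the chain on the spot
+ *	MAC_TX_NO_ENQUEUE	queue until TX_QUEUED is set, then
+ *				hand the chain back via ret_mp
+ *	neither			enqueue and test the Tx hiwat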
+ */
+mac_tx_cookie_t
+mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uint16_t flag, mblk_t **ret_mp)
+{
+ mac_tx_cookie_t cookie = NULL;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+ boolean_t wakeup_worker = B_TRUE;
+ uint32_t tx_mode = srs_tx->st_mode;
+ int cnt, sz;
+ mblk_t *tail;
+
+ ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
+ if (flag & MAC_DROP_ON_NO_DESC) {
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ } else {
+ if (mac_srs->srs_first != NULL)
+ wakeup_worker = B_FALSE;
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (flag & MAC_TX_NO_ENQUEUE) {
+ /*
+ * If TX_QUEUED is not set, queue the
+ * packet and let mac_tx_srs_drain()
+ * set the TX_BLOCKED bit for the
+ * reasons explained above. Otherwise,
+ * return the mblks.
+ */
+ if (wakeup_worker) {
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ } else {
+ MAC_TX_SET_NO_ENQUEUE(mac_srs,
+ mp_chain, ret_mp, cookie);
+ }
+ } else {
+ MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
+ tail, cnt, sz, cookie);
+ }
+ if (wakeup_worker)
+ cv_signal(&mac_srs->srs_async);
+ }
+ return (cookie);
+}
+
+/*
+ * mac_tx_srs_enqueue
+ *
+ * This routine is called when Tx SRS is operating in either serializer
+ * or bandwidth mode. In serializer mode, a packet will get enqueued
+ * when a thread cannot enter SRS exclusively. In bandwidth mode,
+ * packets get queued if the allowed byte-count limit for a tick is
+ * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
+ * MAC_TX_NO_ENQUEUE is set is different than when operating in either
+ * the default mode or fanout mode. Here packets get dropped or
+ * returned to the caller only after hi-watermark worth of data
+ * is queued.
+ */
+static mac_tx_cookie_t
+mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
+{
+ mac_tx_cookie_t cookie = NULL;
+ int cnt, sz;
+ mblk_t *tail;
+ boolean_t wakeup_worker = B_TRUE;
+
+ if (mac_srs->srs_first != NULL)
+ wakeup_worker = B_FALSE;
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (flag & MAC_DROP_ON_NO_DESC) {
+ if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ } else {
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ }
+ } else if (flag & MAC_TX_NO_ENQUEUE) {
+ if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
+ (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
+ MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
+ ret_mp, cookie);
+ } else {
+ mp_chain->b_prev = (mblk_t *)fanout_hint;
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ }
+ } else {
+ /*
+ * If you are BW_ENFORCED, just enqueue the
+ * packet. srs_worker will drain it at the
+ * prescribed rate. Before enqueueing, save
+ * the fanout hint.
+ */
+ mp_chain->b_prev = (mblk_t *)fanout_hint;
+ MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
+ tail, cnt, sz, cookie);
+ }
+ if (wakeup_worker)
+ cv_signal(&mac_srs->srs_async);
+ return (cookie);
+}
+
+/*
+ * There are five tx modes:
+ *
+ * 1) Default mode (SRS_TX_DEFAULT)
+ * 2) Serialization mode (SRS_TX_SERIALIZE)
+ * 3) Fanout mode (SRS_TX_FANOUT)
+ * 4) Bandwidth mode (SRS_TX_BW)
+ * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
+ *
+ * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
+ * based on the number of Tx rings requested for an SRS and whether
+ * bandwidth control is requested or not.
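+ *
+ * Roughly (a sketch of that decision, not the exact code):
+ *
+ *	Tx rings	B/W control	mode
+ *	--------	-----------	----
+ *	one		no		SRS_TX_DEFAULT
+ *	one		yes		SRS_TX_BW
+ *	many		no		SRS_TX_FANOUT
+ *	many		yes		SRS_TX_BW_FANOUT
+ *
+ * (SRS_TX_SERIALIZE is special-cased; see mac_tx_serializer_mode()
+ * below.)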
+ *
+ * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
+ * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
+ * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
+ * When flow-control is relieved, the srs_worker drains the queued
+ * packets and informs blocked clients to restart sending packets.
+ *
+ * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
+ *
+ * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
+ * Tx rings. Each Tx ring will have a soft ring associated with it.
+ * These soft rings will be hung off the Tx SRS. Queueing, if it happens
+ * due to lack of Tx descs, will be in the individual soft ring (and not
+ * the SRS) associated with the Tx ring.
+ *
+ * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
+ * only if bw is available. Otherwise the packets will be queued in
+ * SRS. If fanout to multiple Tx rings is configured, the packets will
+ * be fanned out among the soft rings associated with the Tx rings.
+ *
+ * Three flags are used in srs_state for indicating flow control
+ * conditions: SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
+ * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
+ * driver below.
+ * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
+ * and flow-control pressure is applied back to clients. The clients expect
+ * wakeup when flow-control is relieved.
+ * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and mblks
+ * got returned to the client either due to lack of Tx descs or due to bw
+ * control reasons. The clients expect a wakeup when the condition is
+ * relieved.
+ *
+ * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
+ * some clients set the following values too: MAC_DROP_ON_NO_DESC,
+ * MAC_TX_NO_ENQUEUE
+ * Mac clients that do not want packets to be enqueued in the mac layer set
+ * the MAC_DROP_ON_NO_DESC flag. The packets won't be queued in the Tx SRS or
+ * Tx soft rings but instead get dropped when the NIC runs out of descs. The
+ * behaviour of this flag is different when the Tx is running in serializer
+ * or bandwidth mode. Under these (serializer, bandwidth) modes, the packets
+ * get dropped when the Tx high watermark is reached.
+ * There are some mac clients like vsw, aggr that want the mblks to be
+ * returned to the clients instead of being queued in Tx SRS (or Tx soft
+ * rings) under flow-control (i.e., out of desc or exceeding bw limits)
+ * conditions. These clients call mac_tx() with the MAC_TX_NO_ENQUEUE flag
+ * set. In the default and Tx fanout mode, the un-transmitted mblks will be
+ * returned to the clients when the driver runs out of Tx descs.
+ * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in the SRS (or
+ * soft ring) so that the clients can be woken up when Tx descs become
+ * available. When running in serializer or bandwidth mode,
+ * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
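+ *
+ * In short:
+ *	SRS_TX_BLOCKED		- out of Tx descs; the driver's wakeup
+ *				  clears it
+ *	SRS_TX_HIWAT		- the Tx SRS queue crossed the hiwat;
+ *				  clients are flow-controlled
+ *	SRS_TX_WAKEUP_CLIENT	- mblks were handed back under
+ *				  MAC_TX_NO_ENQUEUE; wake the client
+ *				  when relieved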
+ */
+
+mac_tx_func_t
+mac_tx_get_func(uint32_t mode)
+{
+ return (mac_tx_mode_list[mode].mac_tx_func);
+}
+
+/* ARGSUSED */
+static mac_tx_cookie_t
+mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_tx_cookie_t cookie = NULL;
+
+ ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
+
+ /* Regular case with a single Tx ring */
+ /*
+ * SRS_TX_BLOCKED is set when underlying NIC runs
+ * out of Tx descs and messages start getting
+ * queued. It won't get reset until
+ * tx_srs_drain() completely drains out the
+ * messages.
+ */
+ if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
+ /* Tx descs/resources not available */
+ mutex_enter(&mac_srs->srs_lock);
+ if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
+ cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ /*
+ * While we were computing mblk count, the
+ * flow control condition got relieved.
+ * Continue with the transmission.
+ */
+ mutex_exit(&mac_srs->srs_lock);
+ }
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ mp_chain, (is_subflow ? &stats : NULL));
+
+ /*
+ * Multiple threads could be here sending packets.
+ * Under such conditions, it is not possible to
+ * atomically set the SRS_TX_BLOCKED bit to indicate
+ * out of tx desc condition. To atomically set
+ * this, we queue the returned packet and do
+ * the setting of SRS_TX_BLOCKED in
+ * mac_tx_srs_drain().
+ */
+ if (mp_chain != NULL) {
+ mutex_enter(&mac_srs->srs_lock);
+ cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+
+ if (is_subflow)
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+
+ return (NULL);
+}
+
+/*
+ * mac_tx_serializer_mode
+ *
+ * This is an experimental mode implemented as per the request of PAE.
+ * In this mode, all callers attempting to send a packet to the NIC
+ * will get serialized. Only one thread at any time will access the
+ * NIC to send the packet out.
+ */
+/* ARGSUSED */
+static mac_tx_cookie_t
+mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_tx_cookie_t cookie = NULL;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ /* Single ring, serialize below */
+ ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
+ mutex_enter(&mac_srs->srs_lock);
+ if ((mac_srs->srs_first != NULL) ||
+ (mac_srs->srs_state & SRS_PROC)) {
+ /*
+ * In serialization mode, queue all packets until
+ * TX_HIWAT is set.
+ * If drop bit is set, drop if TX_HIWAT is set.
+ * If no_enqueue is set, still enqueue until hiwat
+ * is set and return mblks after TX_HIWAT is set.
+ */
+ cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
+ flag, NULL, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ /*
+ * No packets queued, nothing on proc and no flow
+ * control condition. Fast-path, ok. Do inline
+ * processing.
+ */
+ mac_srs->srs_state |= SRS_PROC;
+ mutex_exit(&mac_srs->srs_lock);
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ mp_chain, (is_subflow ?
&stats : NULL));
+
+ mutex_enter(&mac_srs->srs_lock);
+ mac_srs->srs_state &= ~SRS_PROC;
+ if (mp_chain != NULL) {
+ cookie = mac_tx_srs_enqueue(mac_srs,
+ mp_chain, flag, NULL, ret_mp);
+ }
+ if (mac_srs->srs_first != NULL) {
+ /*
+ * We processed our packet inline and a new
+ * packet(s) got queued while we were
+ * processing. Wake up the srs worker.
+ */
+ cv_signal(&mac_srs->srs_async);
+ }
+ mutex_exit(&mac_srs->srs_lock);
+
+ if (is_subflow && cookie == NULL)
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+
+ return (cookie);
+}
+
+/*
+ * mac_tx_fanout_mode
+ *
+ * In this mode, the SRS will have access to multiple Tx rings to send
+ * the packet out. The fanout hint that is passed as an argument is
+ * used to find an appropriate ring to fanout the traffic. Each Tx
+ * ring, in turn, will have a soft ring associated with it. If a Tx
+ * ring runs out of Tx descs, the returned packets will be queued in
+ * the soft ring associated with that Tx ring. The srs itself will not
+ * queue any packets.
+ */
+static mac_tx_cookie_t
+mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ mac_soft_ring_t *softring;
+ uint_t indx, hash;
+
+ ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT);
+ hash = HASH_HINT(fanout_hint);
+ indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
+ softring = mac_srs->srs_oth_soft_rings[indx];
+ return (mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp));
+}
+
+/*
+ * mac_tx_bw_mode
+ *
+ * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
+ * only if bw is available. Otherwise the packets will be queued in
+ * SRS. If the SRS has multiple Tx rings, then packets will get fanned
+ * out to the Tx rings.
+ */
+static mac_tx_cookie_t
+mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
+ uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
+{
+ int cnt, sz;
+ mblk_t *tail;
+ mac_tx_cookie_t cookie = NULL;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ ASSERT(TX_BANDWIDTH_MODE(mac_srs));
+ ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
+ mutex_enter(&mac_srs->srs_lock);
+ if (mac_srs->srs_bw->mac_bw_limit == 0) {
+ /* zero bandwidth: drop all */
+ MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ } else if ((mac_srs->srs_first != NULL) ||
+ (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
+ cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
+ fanout_hint, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = 0;
+ } else if (mac_srs->srs_bw->mac_bw_used >
+ mac_srs->srs_bw->mac_bw_limit) {
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
+ mp_chain, tail, cnt, sz);
+ /*
+ * Wake up the worker thread. Note that the worker
+ * thread has to be woken up so that it
+ * can fire up the timer to be woken up
+ * on the next tick. Also once
+ * BW_ENFORCED is set, it can only be
+ * reset by the srs_worker thread. Until then
+ * all packets will get queued up in SRS
+ * and hence this code path won't be
+ * entered until BW_ENFORCED is reset.
+ */
+ cv_signal(&mac_srs->srs_async);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+
+ mac_srs->srs_bw->mac_bw_used += sz;
+ mutex_exit(&mac_srs->srs_lock);
+
+ if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
+ mac_soft_ring_t *softring;
+ uint_t indx, hash;
+
+ hash = HASH_HINT(fanout_hint);
+ indx = COMPUTE_INDEX(hash,
+ mac_srs->srs_oth_ring_count);
+ softring = mac_srs->srs_oth_soft_rings[indx];
+ return (mac_tx_soft_ring_process(softring, mp_chain, flag,
+ ret_mp));
+ } else {
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ mp_chain, (is_subflow ? &stats : NULL));
+
+ if (mp_chain != NULL) {
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (mac_srs->srs_bw->mac_bw_used > sz)
+ mac_srs->srs_bw->mac_bw_used -= sz;
+ else
+ mac_srs->srs_bw->mac_bw_used = 0;
+ cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
+ fanout_hint, ret_mp);
+ mutex_exit(&mac_srs->srs_lock);
+ return (cookie);
+ }
+ if (is_subflow)
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+
+ return (NULL);
+ }
+}
+
+/* ARGSUSED */
+void
+mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
+{
+ mblk_t *head, *tail;
+ size_t sz;
+ uint32_t tx_mode;
+ uint_t saved_pkt_count;
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ saved_pkt_count = 0;
+ ASSERT(mutex_owned(&mac_srs->srs_lock));
+ ASSERT(!(mac_srs->srs_state & SRS_PROC));
+
+ mac_srs->srs_state |= SRS_PROC;
+
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+ tx_mode = srs_tx->st_mode;
+ if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
+ if (mac_srs->srs_first != NULL) {
+ head = mac_srs->srs_first;
+ tail = mac_srs->srs_last;
+ saved_pkt_count = mac_srs->srs_count;
+ mac_srs->srs_first = NULL;
+ mac_srs->srs_last = NULL;
+ mac_srs->srs_count = 0;
+ mutex_exit(&mac_srs->srs_lock);
+
+ head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ head, &stats);
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (head != NULL) {
+ /* Device out of tx desc, set block */
+ if (head->b_next == NULL)
+ VERIFY(head == tail);
+ tail->b_next = mac_srs->srs_first;
+ mac_srs->srs_first = head;
+ mac_srs->srs_count +=
+ (saved_pkt_count - stats.ts_opackets);
+ if (mac_srs->srs_last == NULL)
+ mac_srs->srs_last = tail;
+ MAC_TX_SRS_BLOCK(mac_srs, head);
+ } else {
+ srs_tx->st_woken_up = B_FALSE;
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(
+ mac_srs->srs_flent, &stats);
+ }
+ }
+ }
+ } else if (tx_mode == SRS_TX_BW) {
+ /*
+ * We are here because the timer fired and we have some data
+ * to transmit.
Also, mac_tx_srs_worker should have reset the
+ * SRS_BW_ENFORCED flag.
+ */
+ ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
+ head = tail = mac_srs->srs_first;
+ while (mac_srs->srs_first != NULL) {
+ tail = mac_srs->srs_first;
+ tail->b_prev = NULL;
+ mac_srs->srs_first = tail->b_next;
+ if (mac_srs->srs_first == NULL)
+ mac_srs->srs_last = NULL;
+ mac_srs->srs_count--;
+ sz = msgdsize(tail);
+ mac_srs->srs_size -= sz;
+ saved_pkt_count++;
+ MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
+
+ if (mac_srs->srs_bw->mac_bw_used <
+ mac_srs->srs_bw->mac_bw_limit)
+ continue;
+
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = sz;
+ continue;
+ }
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ break;
+ }
+
+ ASSERT((head == NULL && tail == NULL) ||
+ (head != NULL && tail != NULL));
+ if (tail != NULL) {
+ tail->b_next = NULL;
+ mutex_exit(&mac_srs->srs_lock);
+
+ head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
+ head, &stats);
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (head != NULL) {
+ uint_t size_sent;
+
+ /* Device out of tx desc, set block */
+ if (head->b_next == NULL)
+ VERIFY(head == tail);
+ tail->b_next = mac_srs->srs_first;
+ mac_srs->srs_first = head;
+ mac_srs->srs_count +=
+ (saved_pkt_count - stats.ts_opackets);
+ if (mac_srs->srs_last == NULL)
+ mac_srs->srs_last = tail;
+ size_sent = sz - stats.ts_obytes;
+ mac_srs->srs_size += size_sent;
+ mac_srs->srs_bw->mac_bw_sz += size_sent;
+ if (mac_srs->srs_bw->mac_bw_used > size_sent) {
+ mac_srs->srs_bw->mac_bw_used -=
+ size_sent;
+ } else {
+ mac_srs->srs_bw->mac_bw_used = 0;
+ }
+ MAC_TX_SRS_BLOCK(mac_srs, head);
+ } else {
+ srs_tx->st_woken_up = B_FALSE;
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(
+ mac_srs->srs_flent, &stats);
+ }
+ }
+ }
+ } else if (tx_mode == SRS_TX_BW_FANOUT) {
+ mblk_t *prev;
+ mac_soft_ring_t *softring;
+ uint64_t hint;
+
+ /*
+ * We are here because the timer fired and we
+ * have some quota to transmit.
+ */
+ prev = NULL;
+ head = tail = mac_srs->srs_first;
+ while (mac_srs->srs_first != NULL) {
+ tail = mac_srs->srs_first;
+ mac_srs->srs_first = tail->b_next;
+ if (mac_srs->srs_first == NULL)
+ mac_srs->srs_last = NULL;
+ mac_srs->srs_count--;
+ sz = msgdsize(tail);
+ mac_srs->srs_size -= sz;
+ mac_srs->srs_bw->mac_bw_used += sz;
+ if (prev == NULL)
+ hint = (ulong_t)tail->b_prev;
+ if (hint != (ulong_t)tail->b_prev) {
+ prev->b_next = NULL;
+ mutex_exit(&mac_srs->srs_lock);
+ TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
+ head = tail;
+ hint = (ulong_t)tail->b_prev;
+ mutex_enter(&mac_srs->srs_lock);
+ }
+
+ prev = tail;
+ tail->b_prev = NULL;
+ if (mac_srs->srs_bw->mac_bw_used <
+ mac_srs->srs_bw->mac_bw_limit)
+ continue;
+
+ if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
+ mac_srs->srs_bw->mac_bw_curr_time = lbolt;
+ mac_srs->srs_bw->mac_bw_used = 0;
+ continue;
+ }
+ mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
+ break;
+ }
+ ASSERT((head == NULL && tail == NULL) ||
+ (head != NULL && tail != NULL));
+ if (tail != NULL) {
+ tail->b_next = NULL;
+ mutex_exit(&mac_srs->srs_lock);
+ TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
+ mutex_enter(&mac_srs->srs_lock);
+ }
+ }
+ /*
+ * SRS_TX_FANOUT case not considered here because packets
+ * won't be queued in the SRS for this case. Packets will
+ * be sent directly to soft rings underneath and if there
+ * is any queueing at all, it would be in Tx side soft
+ * rings.
+ */
+
+ /*
+ * When srs_count becomes 0, reset SRS_TX_HIWAT and
+ * SRS_TX_WAKEUP_CLIENT and wake up registered clients.
+ */
+ if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
+ (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
+ mac_tx_notify_cb_t *mtnfp;
+ mac_cb_t *mcb;
+ mac_client_impl_t *mcip = mac_srs->srs_mcip;
+ boolean_t wakeup_required = B_FALSE;
+
+ if (mac_srs->srs_state &
+ (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
+ wakeup_required = B_TRUE;
+ }
+ mac_srs->srs_state &= ~(SRS_TX_HIWAT |
+ SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
+ mutex_exit(&mac_srs->srs_lock);
+ if (wakeup_required) {
+ /* Wake up callback registered clients */
+ MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
+ for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
+ mcb = mcb->mcb_nextp) {
+ mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
+ mtnfp->mtnf_fn(mtnfp->mtnf_arg,
+ (mac_tx_cookie_t)mac_srs);
+ }
+ MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
+ &mcip->mci_tx_notify_cb_list);
+ /*
+ * If the client is not the primary MAC client, then we
+ * need to send the notification to the client's upper
+ * MAC, i.e. mci_upper_mip.
+ */
+ mac_tx_notify(mcip->mci_upper_mip != NULL ?
+ mcip->mci_upper_mip : mcip->mci_mip);
+ }
+ mutex_enter(&mac_srs->srs_lock);
+ }
+ mac_srs->srs_state &= ~SRS_PROC;
+}
+
+/*
+ * Given a packet, get the flow_entry that identifies the flow
+ * to which that packet belongs. The flow_entry will contain
+ * the transmit function to be used to send the packet. If the
+ * function returns NULL, the packet should be sent using the
+ * underlying NIC.
+ */
+static flow_entry_t *
+mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
+{
+ flow_entry_t *flent = NULL;
+ mac_client_impl_t *mcip;
+ int err;
+
+ /*
+ * Do classification on the packet.
+ */
+ err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
+ if (err != 0)
+ return (NULL);
+
+ /*
+ * This flent might just be an additional one on the MAC client,
+ * i.e. for classification purposes (different fdesc), however
+ * the resources, SRS et al., are in the mci_flent, so if
+ * this isn't the mci_flent, we need to get it.
+ */
+ if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
+ FLOW_REFRELE(flent);
+ flent = mcip->mci_flent;
+ FLOW_TRY_REFHOLD(flent, err);
+ if (err != 0)
+ return (NULL);
+ }
+
+ return (flent);
+}
+
+/*
+ * This macro is only meant to be used by mac_tx_send().
+ */
+#define CHECK_VID_AND_ADD_TAG(mp) { \
+ if (vid_check) { \
+ int err = 0; \
+ \
+ MAC_VID_CHECK(src_mcip, (mp), err); \
+ if (err != 0) { \
+ freemsg((mp)); \
+ (mp) = next; \
+ oerrors++; \
+ continue; \
+ } \
+ } \
+ if (add_tag) { \
+ (mp) = mac_add_vlan_tag((mp), 0, vid); \
+ if ((mp) == NULL) { \
+ (mp) = next; \
+ oerrors++; \
+ continue; \
+ } \
+ } \
+}
+
+mblk_t *
+mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
+ mac_tx_stats_t *stats)
+{
+ mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
+ mac_impl_t *mip = src_mcip->mci_mip;
+ uint_t obytes = 0, opackets = 0, oerrors = 0;
+ mblk_t *mp = NULL, *next;
+ boolean_t vid_check, add_tag;
+ uint16_t vid = 0;
+
+ if (mip->mi_nclients > 1) {
+ vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
+ add_tag = MAC_TAG_NEEDED(src_mcip);
+ if (add_tag)
+ vid = mac_client_vid(mch);
+ } else {
+ ASSERT(mip->mi_nclients == 1);
+ vid_check = add_tag = B_FALSE;
+ }
+
+ /*
+ * Fastpath: if there's only one client, and there are no
+ * multicast listeners, we simply send the packet down to the
+ * underlying NIC.
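+ * (That is the mi_nactiveclients == 1 &&
+ * mi_promisc_list == NULL test below.)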
+ */ + if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) { + DTRACE_PROBE2(fastpath, + mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); + + mp = mp_chain; + while (mp != NULL) { + next = mp->b_next; + mp->b_next = NULL; + opackets++; + obytes += (mp->b_cont == NULL ? MBLKL(mp) : + msgdsize(mp)); + + CHECK_VID_AND_ADD_TAG(mp); + MAC_TX(mip, ring, mp, src_mcip); + + /* + * If the driver is out of descriptors and does a + * partial send it will return a chain of unsent + * mblks. Adjust the accounting stats. + */ + if (mp != NULL) { + opackets--; + obytes -= msgdsize(mp); + mp->b_next = next; + break; + } + mp = next; + } + goto done; + } + + /* + * No fastpath, we either have more than one MAC client + * defined on top of the same MAC, or one or more MAC + * client promiscuous callbacks. + */ + DTRACE_PROBE3(slowpath, mac_client_impl_t *, + src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); + + if (mip->mi_promisc_list != NULL) + mac_promisc_dispatch(mip, mp_chain, src_mcip); + + mp = mp_chain; + while (mp != NULL) { + flow_entry_t *dst_flow_ent; + void *flow_cookie; + size_t pkt_size; + mblk_t *mp1; + + next = mp->b_next; + mp->b_next = NULL; + opackets++; + pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); + obytes += pkt_size; + CHECK_VID_AND_ADD_TAG(mp); + + /* + * Find the destination. + */ + dst_flow_ent = mac_tx_classify(mip, mp); + + if (dst_flow_ent != NULL) { + size_t hdrsize; + int err = 0; + + if (mip->mi_info.mi_nativemedia == DL_ETHER) { + struct ether_vlan_header *evhp = + (struct ether_vlan_header *)mp->b_rptr; + + if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) + hdrsize = sizeof (*evhp); + else + hdrsize = sizeof (struct ether_header); + } else { + mac_header_info_t mhi; + + err = mac_header_info((mac_handle_t)mip, + mp, &mhi); + if (err == 0) + hdrsize = mhi.mhi_hdrsize; + } + + /* + * Got a matching flow. It's either another + * MAC client, or a broadcast/multicast flow. + * Make sure the packet size is within the + * allowed size. If not drop the packet and + * move to next packet. + */ + if (err != 0 || + (pkt_size - hdrsize) > mip->mi_sdu_max) { + oerrors++; + DTRACE_PROBE2(loopback__drop, size_t, pkt_size, + mblk_t *, mp); + freemsg(mp); + mp = next; + FLOW_REFRELE(dst_flow_ent); + continue; + } + flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); + if (flow_cookie != NULL) { + /* + * The vnic_bcast_send function expects + * to receive the sender MAC client + * as value for arg2. + */ + mac_bcast_send(flow_cookie, src_mcip, mp, + B_TRUE); + } else { + /* + * loopback the packet to a + * local MAC client. We force a context + * switch if both source and destination + * MAC clients are used by IP, i.e. bypass + * is set. + */ + boolean_t do_switch; + mac_client_impl_t *dst_mcip = + dst_flow_ent->fe_mcip; + + do_switch = ((src_mcip->mci_state_flags & + dst_mcip->mci_state_flags & + MCIS_CLIENT_POLL_CAPABLE) != 0); + + if ((mp1 = mac_fix_cksum(mp)) != NULL) { + (dst_flow_ent->fe_cb_fn)( + dst_flow_ent->fe_cb_arg1, + dst_flow_ent->fe_cb_arg2, + mp1, do_switch); + } + } + FLOW_REFRELE(dst_flow_ent); + } else { + /* + * Unknown destination, send via the underlying + * NIC. 
+ */
+ MAC_TX(mip, ring, mp, src_mcip);
+ if (mp != NULL) {
+ /*
+ * Adjust for the last packet that
+ * could not be transmitted
+ */
+ opackets--;
+ obytes -= pkt_size;
+ mp->b_next = next;
+ break;
+ }
+ }
+ mp = next;
+ }
+
+done:
+ src_mcip->mci_stat_obytes += obytes;
+ src_mcip->mci_stat_opackets += opackets;
+ src_mcip->mci_stat_oerrors += oerrors;
+
+ if (stats != NULL) {
+ stats->ts_opackets = opackets;
+ stats->ts_obytes = obytes;
+ stats->ts_oerrors = oerrors;
+ }
+ return (mp);
+}
+
+/*
+ * mac_tx_srs_ring_present
+ *
+ * Returns whether the specified ring is part of the specified SRS.
+ */
+boolean_t
+mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
+{
+ int i;
+ mac_soft_ring_t *soft_ring;
+
+ if (srs->srs_tx.st_arg2 == tx_ring)
+ return (B_TRUE);
+
+ for (i = 0; i < srs->srs_oth_ring_count; i++) {
+ soft_ring = srs->srs_oth_soft_rings[i];
+ if (soft_ring->s_ring_tx_arg2 == tx_ring)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * mac_tx_srs_wakeup
+ *
+ * Called when Tx descs become available. Wake up the appropriate worker
+ * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
+ * state field.
+ */
+void
+mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
+{
+ int i;
+ mac_soft_ring_t *sringp;
+ mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
+
+ mutex_enter(&mac_srs->srs_lock);
+ if (TX_SINGLE_RING_MODE(mac_srs)) {
+ if (srs_tx->st_arg2 == ring &&
+ mac_srs->srs_state & SRS_TX_BLOCKED) {
+ mac_srs->srs_state &= ~SRS_TX_BLOCKED;
+ srs_tx->st_unblocked_cnt++;
+ cv_signal(&mac_srs->srs_async);
+ }
+ /*
+ * A wakeup can come before tx_srs_drain() could
+ * grab srs lock and set SRS_TX_BLOCKED. So
+ * always set woken_up flag when we come here.
+ */
+ srs_tx->st_woken_up = B_TRUE;
+ mutex_exit(&mac_srs->srs_lock);
+ return;
+ }
+
+ /* If you are here, it is for FANOUT or BW_FANOUT case */
+ ASSERT(TX_MULTI_RING_MODE(mac_srs));
+ for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
+ sringp = mac_srs->srs_oth_soft_rings[i];
+ mutex_enter(&sringp->s_ring_lock);
+ if (sringp->s_ring_tx_arg2 == ring) {
+ if (sringp->s_ring_state & S_RING_BLOCK) {
+ sringp->s_ring_state &= ~S_RING_BLOCK;
+ sringp->s_ring_unblocked_cnt++;
+ cv_signal(&sringp->s_ring_async);
+ }
+ sringp->s_ring_tx_woken_up = B_TRUE;
+ }
+ mutex_exit(&sringp->s_ring_lock);
+ }
+ mutex_exit(&mac_srs->srs_lock);
+}
+
+/*
+ * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
+ * the blocked clients again.
+ */
+void
+mac_tx_notify(mac_impl_t *mip)
+{
+ i_mac_notify(mip, MAC_NOTE_TX);
+}
+
+/*
+ * RX SOFTRING RELATED FUNCTIONS
+ *
+ * These functions really belong in mac_soft_ring.c and are here for
+ * a short period.
+ */
+
+#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \
+ /* \
+ * Enqueue our mblk chain. \
+ */ \
+ ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \
+ \
+ if ((ringp)->s_ring_last != NULL) \
+ (ringp)->s_ring_last->b_next = (mp); \
+ else \
+ (ringp)->s_ring_first = (mp); \
+ (ringp)->s_ring_last = (tail); \
+ (ringp)->s_ring_count += (cnt); \
+ ASSERT((ringp)->s_ring_count > 0); \
+ if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \
+ (ringp)->s_ring_size += sz; \
+ } \
+}
+
+/*
+ * Default entry point to deliver a packet chain to a MAC client.
+ * If the MAC client has flows, do the classification with these
+ * flows as well.
+ */
+/* ARGSUSED */
+void
+mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
+ mac_header_info_t *arg3)
+{
+ mac_client_impl_t *mcip = arg1;
+
+ if (mcip->mci_nvids == 1 &&
+ !(mcip->mci_state_flags & MCIS_TAG_DISABLE)) {
+ /*
+ * If the client has exactly one VID associated with it
+ * and stripping of the VLAN header is not disabled,
+ * remove the VLAN tag from the packet before
+ * passing it on to the client's receive callback.
+ * Note that this needs to be done after we dispatch
+ * the packet to the promiscuous listeners of the
+ * client, since they expect to see the whole
+ * frame including the VLAN headers.
+ */
+ mp_chain = mac_strip_vlan_tag_chain(mp_chain);
+ }
+
+ mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
+}
+
+/*
+ * mac_rx_soft_ring_process
+ *
+ * Process a chain for a given soft ring. If the number of packets queued
+ * in the SRS and its associated soft rings (including this one) is
+ * very small (tracked by srs_poll_pkt_cnt), then allow the entering
+ * thread (interrupt or poll thread) to do inline processing. This
+ * helps keep the latency down under low load.
+ *
+ * The proc and arg for each mblk are already stored in the mblk in
+ * appropriate places.
+ */
+/* ARGSUSED */
+void
+mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
+ mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
+{
+ mac_direct_rx_t proc;
+ void *arg1;
+ mac_resource_handle_t arg2;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+ ASSERT(ringp != NULL);
+ ASSERT(mp_chain != NULL);
+ ASSERT(tail != NULL);
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_total_inpkt += cnt;
+ if ((ringp->s_ring_type & ST_RING_ANY) ||
+ ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
+ !mac_srs->srs_rx.sr_enqueue_always)) {
+ /* If on processor or blanking on, then enqueue and return */
+ if (ringp->s_ring_state & S_RING_BLANK ||
+ ringp->s_ring_state & S_RING_PROC) {
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ proc = ringp->s_ring_rx_func;
+ arg1 = ringp->s_ring_rx_arg1;
+ arg2 = ringp->s_ring_rx_arg2;
+ /*
+ * See if anything is already queued. If we are the
+ * first packet, do inline processing else queue the
+ * packet and do the drain.
+ */
+ if (ringp->s_ring_first == NULL) {
+ /*
+ * Fast-path, ok to process and nothing queued.
+ */
+ ringp->s_ring_run = curthread;
+ ringp->s_ring_state |= (S_RING_PROC);
+
+ mutex_exit(&ringp->s_ring_lock);
+
+ /*
+ * We have a chain of 1 packet, so
+ * go through this fast path.
+ */
+ ASSERT(mp_chain->b_next == NULL);
+
+ (*proc)(arg1, arg2, mp_chain, NULL);
+
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+ /*
+ * If we have a soft ring set which is doing
+ * bandwidth control, we need to decrement
+ * srs_size and count so that the SRS can have an
+ * accurate idea of the real data
+ * queued between the SRS and its soft rings. We
+ * decrement the counters only when the packet
+ * gets processed by both SRS and the soft ring.
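+ * (That is what the MAC_UPDATE_SRS_COUNT_LOCKED and
+ * MAC_UPDATE_SRS_SIZE_LOCKED calls under srs_lock,
+ * just below, do.)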
+ */
+ mutex_enter(&mac_srs->srs_lock);
+ MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+ MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+ mutex_exit(&mac_srs->srs_lock);
+
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_run = NULL;
+ ringp->s_ring_state &= ~S_RING_PROC;
+ if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
+ cv_signal(&ringp->s_ring_client_cv);
+
+ if ((ringp->s_ring_first == NULL) ||
+ (ringp->s_ring_state & S_RING_BLANK)) {
+ /*
+ * We processed inline our packet and
+ * nothing new has arrived or our
+ * receiver doesn't want to receive
+ * any packets. We are done.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+ } else {
+ SOFT_RING_ENQUEUE_CHAIN(ringp,
+ mp_chain, tail, cnt, sz);
+ }
+
+ /*
+ * We are here because either we couldn't do inline
+ * processing (because something was already
+ * queued), or we had a chain of more than one
+ * packet, or something else arrived after we were
+ * done with inline processing.
+ */
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ ASSERT(ringp->s_ring_first != NULL);
+
+ ringp->s_ring_drain_func(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ } else {
+ /* ST_RING_WORKER_ONLY case */
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
+ mac_soft_ring_worker_wakeup(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+ }
+}
+
+/*
+ * TX SOFTRING RELATED FUNCTIONS
+ *
+ * These functions really belong in mac_soft_ring.c and are here for
+ * a short period.
+ */
+
+#define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \
+ ringp->s_ring_state |= S_RING_ENQUEUED; \
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \
+}
+
+/*
+ * mac_tx_sring_enqueue
+ *
+ * When we are out of transmit descriptors and we already have a
+ * queue that exceeds hiwat (or the client called us with
+ * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
+ * soft ring pointer as the opaque cookie for the client to enable
+ * flow control.
+ */
+static mac_tx_cookie_t
+mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
+ mblk_t **ret_mp)
+{
+ int cnt;
+ size_t sz;
+ mblk_t *tail;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+ mac_tx_cookie_t cookie = NULL;
+ boolean_t wakeup_worker = B_TRUE;
+
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ if (flag & MAC_DROP_ON_NO_DESC) {
+ mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
+ /* increment freed stats */
+ ringp->s_ring_drops += cnt;
+ cookie = (mac_tx_cookie_t)ringp;
+ } else {
+ if (ringp->s_ring_first != NULL)
+ wakeup_worker = B_FALSE;
+
+ if (flag & MAC_TX_NO_ENQUEUE) {
+ /*
+ * If QUEUED is not set, queue the packet
+ * and let mac_tx_soft_ring_drain() set
+ * the TX_BLOCKED bit for the reasons
+ * explained above. Otherwise, return the
+ * mblks.
+ */
+ if (wakeup_worker) {
+ TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
+ mp_chain, tail, cnt, sz);
+ } else {
+ ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
+ cookie = (mac_tx_cookie_t)ringp;
+ *ret_mp = mp_chain;
+ }
+ } else {
+ boolean_t enqueue = B_TRUE;
+
+ if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
+ /*
+ * flow-controlled.
Store ringp in cookie
+ * so that it can be returned as
+ * mac_tx_cookie_t to client
+ */
+ ringp->s_ring_state |= S_RING_TX_HIWAT;
+ cookie = (mac_tx_cookie_t)ringp;
+ ringp->s_ring_hiwat_cnt++;
+ if (ringp->s_ring_count >
+ ringp->s_ring_tx_max_q_cnt) {
+ /* increment freed stats */
+ ringp->s_ring_drops += cnt;
+ /*
+ * b_prev may be set to the fanout hint
+ * hence can't use freemsg directly
+ */
+ mac_pkt_drop(NULL, NULL,
+ mp_chain, B_FALSE);
+ DTRACE_PROBE1(tx_queued_hiwat,
+ mac_soft_ring_t *, ringp);
+ enqueue = B_FALSE;
+ }
+ }
+ if (enqueue) {
+ TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
+ tail, cnt, sz);
+ }
+ }
+ if (wakeup_worker)
+ cv_signal(&ringp->s_ring_async);
+ }
+ return (cookie);
+}
+
+
+/*
+ * mac_tx_soft_ring_process
+ *
+ * This routine is called when fanning out outgoing traffic among
+ * multiple Tx rings.
+ * Note that a soft ring is associated with a h/w Tx ring.
+ */
+mac_tx_cookie_t
+mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
+ uint16_t flag, mblk_t **ret_mp)
+{
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+ int cnt;
+ size_t sz;
+ mblk_t *tail;
+ mac_tx_cookie_t cookie = NULL;
+
+ ASSERT(ringp != NULL);
+ ASSERT(mp_chain != NULL);
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+ /*
+ * Only two modes can come here; either it can be
+ * SRS_TX_BW_FANOUT or SRS_TX_FANOUT
+ */
+ ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
+ mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
+
+ if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
+ /* Serialization mode */
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
+ cookie = mac_tx_sring_enqueue(ringp, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
+ TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
+ if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
+ /*
+ * If ring is blocked due to lack of Tx
+ * descs, just return. Worker thread
+ * will get scheduled when Tx descs
+ * become available.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ mac_soft_ring_worker_wakeup(ringp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ } else {
+ /* Default fanout mode */
+ /*
+ * S_RING_BLOCKED is set when underlying NIC runs
+ * out of Tx descs and messages start getting
+ * queued. It won't get reset until
+ * tx_srs_drain() completely drains out the
+ * messages.
+ */
+ boolean_t is_subflow;
+ mac_tx_stats_t stats;
+
+ if (ringp->s_ring_state & S_RING_ENQUEUED) {
+ /* Tx descs/resources not available */
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_state & S_RING_ENQUEUED) {
+ cookie = mac_tx_sring_enqueue(ringp, mp_chain,
+ flag, ret_mp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ /*
+ * While we were computing mblk count, the
+ * flow control condition got relieved.
+ * Continue with the transmission.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ }
+ is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
+
+ mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
+ ringp->s_ring_tx_arg2, mp_chain,
+ (is_subflow ? &stats : NULL));
+
+ /*
+ * Multiple threads could be here sending packets.
+ * Under such conditions, it is not possible to
+ * atomically set the S_RING_BLOCKED bit to indicate
+ * out of tx desc condition. To atomically set
+ * this, we queue the returned packet and do
+ * the setting of S_RING_BLOCKED in
+ * mac_tx_soft_ring_drain().
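+ * (This mirrors the single Tx ring case in
+ * mac_tx_single_ring_mode() above.)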
+ */
+ if (mp_chain != NULL) {
+ mutex_enter(&ringp->s_ring_lock);
+ cookie =
+ mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
+ mutex_exit(&ringp->s_ring_lock);
+ return (cookie);
+ }
+ if (is_subflow) {
+ FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
+ }
+ return (NULL);
+ }
+}
diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c
new file mode 100644
index 0000000000..ff6991ada2
--- /dev/null
+++ b/usr/src/uts/common/io/mac/mac_soft_ring.c
@@ -0,0 +1,732 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * General Soft rings - Simulating Rx rings in S/W.
+ *
+ * Soft ring is a data abstraction containing a queue and a worker
+ * thread and represents a hardware Rx ring in software. Each soft
+ * ring set can have a collection of soft rings for separating
+ * L3/L4 specific traffic (IPv4 from IPv6 or TCP from UDP) or for
+ * allowing a higher degree of parallelism by sending traffic to
+ * one of the soft rings for an SRS (using a hash on src IP or port).
+ * Each soft ring worker thread can be bound to a different CPU,
+ * allowing the processing for each soft ring to happen in parallel
+ * and independently of the others.
+ *
+ * Protocol soft rings:
+ *
+ * Each SRS has at a minimum 3 softrings. One each for IPv4 TCP,
+ * IPv4 UDP and the rest (OTH - for IPv6 and everything else). The
+ * SRS does dynamic polling and enforces link level bandwidth but
+ * it does so for all traffic (IPv4 and IPv6 and all protocols) on
+ * that link. However, each protocol layer wants a different
+ * behaviour. For instance IPv4 TCP has per CPU squeues which
+ * enforce their own polling and flow control so IPv4 TCP traffic
+ * needs to go to a separate soft ring which can be polled by the
+ * TCP squeue. It also allows the TCP squeue to push back flow control
+ * all the way to NIC hardware (if it puts its corresponding soft
+ * ring in the poll mode and the soft ring queue builds up, the
+ * shared srs_poll_pkt_cnt goes up and SRS automatically stops
+ * more packets from entering the system).
+ *
+ * Similarly, UDP benefits from a DLS bypass and packet chaining
+ * so sending it to a separate soft ring is desired. All the rest of
+ * the traffic (including IPv6) is sent to the OTH softring. The IPv6
+ * traffic currently goes through the OTH softring and via DLS because
+ * it needs more processing to be done. Irrespective of the sap
+ * (IPv4 or IPv6) or the transport, the dynamic polling, B/W enforcement,
+ * cpu assignment, fanout, etc. apply to all traffic since they
+ * are implemented by the SRS, which is agnostic to sap or transport.
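+ *
+ * A rough picture of the minimal protocol fanout for one SRS:
+ *
+ *	NIC Rx ring --> SRS --+--> TCP soft ring --> TCP squeue
+ *	                      +--> UDP soft ring --> DLS bypass
+ *	                      +--> OTH soft ring --> DLS (IPv6 etc.)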
+ *
+ * Fanout soft rings:
+ *
+ * On a multithreaded system, we can assign more CPUs and multithread
+ * the stack by creating a soft ring per CPU and spreading traffic
+ * based on a hash computed on src IP etc. Since we still need to
+ * keep the protocol separation, we create a set of 3 soft rings per
+ * CPU (specified by cpu list or degree of fanout).
+ *
+ * NOTE: See the block level comment on top of mac_sched.c
+ */
+
+#include <sys/types.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ip_impl.h>
+#include <inet/sadb.h>
+#include <inet/ipsecesp.h>
+#include <inet/ipsecah.h>
+
+#include <sys/mac_impl.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/mac_flow_impl.h>
+
+static void mac_rx_soft_ring_drain(mac_soft_ring_t *);
+static void mac_soft_ring_fire(void *);
+static void mac_soft_ring_worker(mac_soft_ring_t *);
+static void mac_tx_soft_ring_drain(mac_soft_ring_t *);
+
+uint32_t mac_tx_soft_ring_max_q_cnt = 100000;
+uint32_t mac_tx_soft_ring_hiwat = 1000;
+
+extern kmem_cache_t *mac_soft_ring_cache;
+
+#define ADD_SOFTRING_TO_SET(mac_srs, softring) { \
+ if (mac_srs->srs_soft_ring_head == NULL) { \
+ mac_srs->srs_soft_ring_head = softring; \
+ mac_srs->srs_soft_ring_tail = softring; \
+ } else { \
+ /* ADD to the list */ \
+ softring->s_ring_prev = \
+ mac_srs->srs_soft_ring_tail; \
+ mac_srs->srs_soft_ring_tail->s_ring_next = softring; \
+ mac_srs->srs_soft_ring_tail = softring; \
+ } \
+ mac_srs->srs_soft_ring_count++; \
+}
+
+/*
+ * mac_soft_ring_worker_wakeup
+ *
+ * Wake up the soft ring worker thread to process the queue as long
+ * as no one else is processing it and the upper layer (client) is still
+ * ready to receive packets.
+ */
+void
+mac_soft_ring_worker_wakeup(mac_soft_ring_t *ringp)
+{
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ if (!(ringp->s_ring_state & S_RING_PROC) &&
+ !(ringp->s_ring_state & S_RING_BLANK) &&
+ (ringp->s_ring_tid == NULL)) {
+ if (ringp->s_ring_wait != 0) {
+ ringp->s_ring_tid =
+ timeout(mac_soft_ring_fire, ringp,
+ ringp->s_ring_wait);
+ } else {
+ /* Schedule the worker thread. */
+ cv_signal(&ringp->s_ring_async);
+ }
+ }
+}
+
+/*
+ * mac_soft_ring_create
+ *
+ * Create a soft ring, do the necessary setup and bind the worker
+ * thread to the assigned CPU.
+ */ +mac_soft_ring_t * +mac_soft_ring_create(int id, clock_t wait, void *flent, uint16_t type, + pri_t pri, mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs, + processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1, + mac_resource_handle_t x_arg2) +{ + mac_soft_ring_t *ringp; + char name[64]; + + bzero(name, 64); + ringp = kmem_cache_alloc(mac_soft_ring_cache, KM_SLEEP); + + if (type & ST_RING_TCP) { + (void) snprintf(name, sizeof (name), + "mac_tcp_soft_ring_%d_%p", id, mac_srs); + } else if (type & ST_RING_UDP) { + (void) snprintf(name, sizeof (name), + "mac_udp_soft_ring_%d_%p", id, mac_srs); + } else { + (void) snprintf(name, sizeof (name), + "mac_oth_soft_ring_%d_%p", id, mac_srs); + } + + bzero(ringp, sizeof (mac_soft_ring_t)); + (void) strncpy(ringp->s_ring_name, name, S_RING_NAMELEN + 1); + ringp->s_ring_name[S_RING_NAMELEN] = '\0'; + mutex_init(&ringp->s_ring_lock, NULL, MUTEX_DEFAULT, NULL); + ringp->s_ring_notify_cb_info.mcbi_lockp = &ringp->s_ring_lock; + + ringp->s_ring_type = type; + ringp->s_ring_wait = MSEC_TO_TICK(wait); + ringp->s_ring_mcip = mcip; + ringp->s_ring_set = mac_srs; + ringp->s_ring_flent = flent; + + /* + * Protect against access from DR callbacks (mac_walk_srs_bind/unbind) + * which can't grab the mac perimeter + */ + mutex_enter(&mac_srs->srs_lock); + ADD_SOFTRING_TO_SET(mac_srs, ringp); + mutex_exit(&mac_srs->srs_lock); + + /* + * set the bind CPU to -1 to indicate + * no thread affinity set + */ + ringp->s_ring_cpuid = ringp->s_ring_cpuid_save = -1; + ringp->s_ring_worker = thread_create(NULL, 0, + mac_soft_ring_worker, ringp, 0, &p0, TS_RUN, pri); + if (type & ST_RING_TX) { + ringp->s_ring_drain_func = mac_tx_soft_ring_drain; + ringp->s_ring_tx_arg1 = x_arg1; + ringp->s_ring_tx_arg2 = x_arg2; + ringp->s_ring_tx_max_q_cnt = mac_tx_soft_ring_max_q_cnt; + ringp->s_ring_tx_hiwat = + (mac_tx_soft_ring_hiwat > mac_tx_soft_ring_max_q_cnt) ? + mac_tx_soft_ring_max_q_cnt : mac_tx_soft_ring_hiwat; + } else { + ringp->s_ring_drain_func = mac_rx_soft_ring_drain; + ringp->s_ring_rx_func = rx_func; + ringp->s_ring_rx_arg1 = x_arg1; + ringp->s_ring_rx_arg2 = x_arg2; + } + if (cpuid != -1) + (void) mac_soft_ring_bind(ringp, cpuid); + + return (ringp); +} + +/* + * mac_soft_ring_free + * + * Free the soft ring once we are done with it. + */ +void +mac_soft_ring_free(mac_soft_ring_t *softring, boolean_t release_tx_ring) +{ + ASSERT((softring->s_ring_state & + (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) == + (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE)); + mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE); + if (release_tx_ring && softring->s_ring_tx_arg2 != NULL) { + ASSERT(softring->s_ring_type & ST_RING_TX); + mac_release_tx_ring(softring->s_ring_tx_arg2); + } + if (softring->s_ring_ksp) + kstat_delete(softring->s_ring_ksp); + mac_callback_free(softring->s_ring_notify_cb_list); + kmem_cache_free(mac_soft_ring_cache, softring); +} + +int mac_soft_ring_thread_bind = 1; + +/* + * mac_soft_ring_bind + * + * Bind a soft ring worker thread to supplied CPU. 
+ */
+cpu_t *
+mac_soft_ring_bind(mac_soft_ring_t *ringp, processorid_t cpuid)
+{
+ cpu_t *cp;
+ boolean_t clear = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ if (mac_soft_ring_thread_bind == 0) {
+ DTRACE_PROBE1(mac__soft__ring__no__cpu__bound,
+ mac_soft_ring_t *, ringp);
+ return (NULL);
+ }
+
+ cp = cpu_get(cpuid);
+ if (cp == NULL || !cpu_is_online(cp))
+ return (NULL);
+
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_state |= S_RING_BOUND;
+ if (ringp->s_ring_cpuid != -1)
+ clear = B_TRUE;
+ ringp->s_ring_cpuid = cpuid;
+ mutex_exit(&ringp->s_ring_lock);
+
+ if (clear)
+ thread_affinity_clear(ringp->s_ring_worker);
+
+ DTRACE_PROBE2(mac__soft__ring__cpu__bound, mac_soft_ring_t *,
+ ringp, processorid_t, cpuid);
+
+ thread_affinity_set(ringp->s_ring_worker, cpuid);
+
+ return (cp);
+}
+
+/*
+ * mac_soft_ring_unbind
+ *
+ * Unbind a soft ring worker thread.
+ */
+void
+mac_soft_ring_unbind(mac_soft_ring_t *ringp)
+{
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (!(ringp->s_ring_state & S_RING_BOUND)) {
+ ASSERT(ringp->s_ring_cpuid == -1);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ ringp->s_ring_cpuid = -1;
+ ringp->s_ring_state &= ~S_RING_BOUND;
+ thread_affinity_clear(ringp->s_ring_worker);
+ mutex_exit(&ringp->s_ring_lock);
+}
+
+/*
+ * PRIVATE FUNCTIONS
+ */
+
+static void
+mac_soft_ring_fire(void *arg)
+{
+ mac_soft_ring_t *ringp = arg;
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_tid == 0) {
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ ringp->s_ring_tid = 0;
+
+ if (!(ringp->s_ring_state & S_RING_PROC)) {
+ cv_signal(&ringp->s_ring_async);
+ }
+ mutex_exit(&ringp->s_ring_lock);
+}
+
+/*
+ * mac_rx_soft_ring_drain
+ *
+ * Called when the worker thread model (ST_RING_WORKER_ONLY) of processing
+ * incoming packets is used. s_ring_first contains the queued packets.
+ * s_ring_rx_func contains the upper level (client) routine where the
+ * packets are destined and s_ring_rx_arg1/s_ring_rx_arg2 are the
+ * cookies meant for the client.
+ */
+/* ARGSUSED */
+static void
+mac_rx_soft_ring_drain(mac_soft_ring_t *ringp)
+{
+ mblk_t *mp;
+ void *arg1;
+ mac_resource_handle_t arg2;
+ timeout_id_t tid;
+ mac_direct_rx_t proc;
+ size_t sz;
+ int cnt;
+ mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+ ringp->s_ring_run = curthread;
+ ASSERT(mutex_owned(&ringp->s_ring_lock));
+ ASSERT(!(ringp->s_ring_state & S_RING_PROC));
+
+ if ((tid = ringp->s_ring_tid) != 0)
+ ringp->s_ring_tid = 0;
+
+ ringp->s_ring_state |= S_RING_PROC;
+
+ proc = ringp->s_ring_rx_func;
+ arg1 = ringp->s_ring_rx_arg1;
+ arg2 = ringp->s_ring_rx_arg2;
+
+ while ((ringp->s_ring_first != NULL) &&
+ !(ringp->s_ring_state & S_RING_PAUSE)) {
+ mp = ringp->s_ring_first;
+ ringp->s_ring_first = NULL;
+ ringp->s_ring_last = NULL;
+ cnt = ringp->s_ring_count;
+ ringp->s_ring_count = 0;
+ sz = ringp->s_ring_size;
+ ringp->s_ring_size = 0;
+ mutex_exit(&ringp->s_ring_lock);
+
+ if (tid != 0) {
+ (void) untimeout(tid);
+ tid = 0;
+ }
+
+ (*proc)(arg1, arg2, mp, NULL);
+
+ /*
+ * If we have a soft ring set which is doing
+ * bandwidth control, we need to decrement its
+ * srs_size so it can have an accurate idea of
+ * the real data queued between the SRS and
+ * its soft rings. We decrement the size for a
+ * packet only when it gets processed by both
+ * the SRS and the soft ring.
+		 */
+		mutex_enter(&mac_srs->srs_lock);
+		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+		mutex_exit(&mac_srs->srs_lock);
+
+		mutex_enter(&ringp->s_ring_lock);
+	}
+	ringp->s_ring_state &= ~S_RING_PROC;
+	if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
+		cv_signal(&ringp->s_ring_client_cv);
+	ringp->s_ring_run = NULL;
+}
+
+/*
+ * mac_soft_ring_worker
+ *
+ * The soft ring worker routine to process any queued packets. In the
+ * normal case, the worker thread is bound to a CPU. If the soft ring
+ * is dealing with TCP packets, the worker thread is bound to the same
+ * CPU as the TCP squeue.
+ */
+static void
+mac_soft_ring_worker(mac_soft_ring_t *ringp)
+{
+	kmutex_t *lock = &ringp->s_ring_lock;
+	kcondvar_t *async = &ringp->s_ring_async;
+	mac_soft_ring_set_t *srs = ringp->s_ring_set;
+	callb_cpr_t cprinfo;
+
+	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_soft_ring");
+	mutex_enter(lock);
+start:
+	for (;;) {
+		while (((ringp->s_ring_first == NULL ||
+		    (ringp->s_ring_state & S_RING_BLOCK)) &&
+		    !(ringp->s_ring_state & S_RING_PAUSE)) ||
+		    (ringp->s_ring_state & S_RING_PROC)) {
+
+			CALLB_CPR_SAFE_BEGIN(&cprinfo);
+			cv_wait(async, lock);
+			CALLB_CPR_SAFE_END(&cprinfo, lock);
+		}
+
+		/*
+		 * Either we have work to do, or we have been asked to
+		 * shut down temporarily or permanently.
+		 */
+		if (ringp->s_ring_state & S_RING_PAUSE)
+			goto done;
+
+		ringp->s_ring_drain_func(ringp);
+	}
+done:
+	mutex_exit(lock);
+	mutex_enter(&srs->srs_lock);
+	mutex_enter(lock);
+
+	ringp->s_ring_state |= S_RING_QUIESCE_DONE;
+	if (!(ringp->s_ring_state & S_RING_CONDEMNED)) {
+		srs->srs_soft_ring_quiesced_count++;
+		cv_broadcast(&srs->srs_async);
+		mutex_exit(&srs->srs_lock);
+		while (!(ringp->s_ring_state &
+		    (S_RING_RESTART | S_RING_CONDEMNED)))
+			cv_wait(&ringp->s_ring_async, &ringp->s_ring_lock);
+		mutex_exit(lock);
+		mutex_enter(&srs->srs_lock);
+		mutex_enter(lock);
+		srs->srs_soft_ring_quiesced_count--;
+		if (ringp->s_ring_state & S_RING_RESTART) {
+			ASSERT(!(ringp->s_ring_state & S_RING_CONDEMNED));
+			ringp->s_ring_state &= ~(S_RING_RESTART |
+			    S_RING_QUIESCE | S_RING_QUIESCE_DONE);
+			cv_broadcast(&srs->srs_async);
+			mutex_exit(&srs->srs_lock);
+			goto start;
+		}
+	}
+	ASSERT(ringp->s_ring_state & S_RING_CONDEMNED);
+	ringp->s_ring_state |= S_RING_CONDEMNED_DONE;
+	CALLB_CPR_EXIT(&cprinfo);
+	srs->srs_soft_ring_condemned_count++;
+	cv_broadcast(&srs->srs_async);
+	mutex_exit(&srs->srs_lock);
+	thread_exit();
+}
+
+/*
+ * mac_soft_ring_intr_enable and mac_soft_ring_intr_disable
+ *
+ * These functions are called by the client to toggle the sending of
+ * packets to it. The client gets the names of these routines and the
+ * corresponding cookie (pointing to the soft ring) during capability
+ * negotiation at setup time.
+ *
+ * Enabling allows the processing thread to send packets to the
+ * client, while disabling does the opposite.
+ */
+void
+mac_soft_ring_intr_enable(void *arg)
+{
+	mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg;
+	mutex_enter(&ringp->s_ring_lock);
+	ringp->s_ring_state &= ~S_RING_BLANK;
+	if (ringp->s_ring_first != NULL)
+		mac_soft_ring_worker_wakeup(ringp);
+	mutex_exit(&ringp->s_ring_lock);
+}
+
+void
+mac_soft_ring_intr_disable(void *arg)
+{
+	mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg;
+	/*
+	 * Stop the worker thread from sending packets upstream.
+	 * The squeue will poll the soft ring when it needs packets.
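+	 * While S_RING_BLANK is set, packets accumulate on the ring and
+	 * the client is expected to pull them via mac_soft_ring_poll()
+	 * (below) rather than have the worker push them up.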
+	 */
+	mutex_enter(&ringp->s_ring_lock);
+	ringp->s_ring_state |= S_RING_BLANK;
+	mutex_exit(&ringp->s_ring_lock);
+}
+
+/*
+ * mac_soft_ring_poll
+ *
+ * This routine is called by the client to poll for packets from the
+ * soft ring. The function name and the cookie corresponding to the
+ * soft ring are exchanged during capability negotiation at setup time.
+ */
+mblk_t *
+mac_soft_ring_poll(mac_soft_ring_t *ringp, int bytes_to_pickup)
+{
+	mblk_t *head, *tail;
+	mblk_t *mp;
+	size_t sz = 0;
+	int cnt = 0;
+	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
+
+	ASSERT(mac_srs != NULL);
+
+	mutex_enter(&ringp->s_ring_lock);
+	head = tail = mp = ringp->s_ring_first;
+	if (head == NULL) {
+		mutex_exit(&ringp->s_ring_lock);
+		return (NULL);
+	}
+
+	if (ringp->s_ring_size <= bytes_to_pickup) {
+		head = ringp->s_ring_first;
+		ringp->s_ring_first = NULL;
+		ringp->s_ring_last = NULL;
+		cnt = ringp->s_ring_count;
+		ringp->s_ring_count = 0;
+		sz = ringp->s_ring_size;
+		ringp->s_ring_size = 0;
+	} else {
+		while (mp && sz <= bytes_to_pickup) {
+			sz += msgdsize(mp);
+			cnt++;
+			tail = mp;
+			mp = mp->b_next;
+		}
+		ringp->s_ring_count -= cnt;
+		ringp->s_ring_size -= sz;
+		tail->b_next = NULL;
+		if (mp == NULL) {
+			ringp->s_ring_first = NULL;
+			ringp->s_ring_last = NULL;
+			ASSERT(ringp->s_ring_count == 0);
+		} else {
+			ringp->s_ring_first = mp;
+		}
+	}
+
+	mutex_exit(&ringp->s_ring_lock);
+	/*
+	 * Update the shared count and size counters so
+	 * that the SRS has an accurate idea of the queued packets.
+	 */
+	mutex_enter(&mac_srs->srs_lock);
+	MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
+	MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
+	mutex_exit(&mac_srs->srs_lock);
+	return (head);
+}
+
+/*
+ * mac_soft_ring_dls_bypass
+ *
+ * Enable the direct client (IP) callback function from the soft rings.
+ * Callers must ensure that they don't need any DLS layer processing.
+ */
+void
+mac_soft_ring_dls_bypass(void *arg, mac_direct_rx_t rx_func, void *rx_arg1)
+{
+	mac_soft_ring_t *softring = arg;
+	mac_soft_ring_set_t *srs;
+
+	ASSERT(rx_func != NULL);
+
+	mutex_enter(&softring->s_ring_lock);
+	softring->s_ring_rx_func = rx_func;
+	softring->s_ring_rx_arg1 = rx_arg1;
+	mutex_exit(&softring->s_ring_lock);
+
+	srs = softring->s_ring_set;
+	mutex_enter(&srs->srs_lock);
+	srs->srs_type |= SRST_DLS_BYPASS;
+	mutex_exit(&srs->srs_lock);
+}
+
+/*
+ * mac_soft_ring_signal
+ *
+ * Typically used to set the soft ring state to QUIESCE, CONDEMNED, or
+ * RESTART.
+ *
+ * On the Rx side, the quiescing is done bottom up. Only after the Rx
+ * upcalls from the driver are done is the Rx SRS quiesced, and only
+ * then can we signal the soft rings. Thus this function can't be
+ * called arbitrarily without satisfying the prerequisites. On the Tx
+ * side, the threads from the top need to be quiesced first, then the
+ * Tx SRS, and only then can we signal the Tx soft rings.
+ */
+void
+mac_soft_ring_signal(mac_soft_ring_t *softring, uint_t sr_flag)
+{
+	mutex_enter(&softring->s_ring_lock);
+	softring->s_ring_state |= sr_flag;
+	cv_signal(&softring->s_ring_async);
+	mutex_exit(&softring->s_ring_lock);
+}
+
+/*
+ * mac_tx_soft_ring_drain
+ *
+ * The transmit-side drain routine, used when the soft ring is
+ * transmitting packets.
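+ *
+ * The flow-control handshake, in sketch form (see the code below):
+ *
+ *	mac_tx_send() returns leftover packets
+ *	    -> requeue them and set S_RING_BLOCK
+ *	driver wakes the ring back up (s_ring_tx_woken_up)
+ *	    -> S_RING_BLOCK is not set and draining continues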
+ */ +static void +mac_tx_soft_ring_drain(mac_soft_ring_t *ringp) +{ + mblk_t *mp; + void *arg1; + void *arg2; + mblk_t *tail; + uint_t saved_pkt_count, saved_size; + boolean_t is_subflow; + mac_tx_stats_t stats; + mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; + + saved_pkt_count = saved_size = 0; + ringp->s_ring_run = curthread; + ASSERT(mutex_owned(&ringp->s_ring_lock)); + ASSERT(!(ringp->s_ring_state & S_RING_PROC)); + + ringp->s_ring_state |= S_RING_PROC; + is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); + arg1 = ringp->s_ring_tx_arg1; + arg2 = ringp->s_ring_tx_arg2; + + while (ringp->s_ring_first != NULL) { + mp = ringp->s_ring_first; + tail = ringp->s_ring_last; + saved_pkt_count = ringp->s_ring_count; + saved_size = ringp->s_ring_size; + ringp->s_ring_first = NULL; + ringp->s_ring_last = NULL; + ringp->s_ring_count = 0; + ringp->s_ring_size = 0; + mutex_exit(&ringp->s_ring_lock); + + mp = mac_tx_send(arg1, arg2, mp, &stats); + + mutex_enter(&ringp->s_ring_lock); + if (mp != NULL) { + /* Device out of tx desc, set block */ + tail->b_next = ringp->s_ring_first; + ringp->s_ring_first = mp; + ringp->s_ring_count += + (saved_pkt_count - stats.ts_opackets); + ringp->s_ring_size += (saved_size - stats.ts_obytes); + if (ringp->s_ring_last == NULL) + ringp->s_ring_last = tail; + + if (ringp->s_ring_tx_woken_up) { + ringp->s_ring_tx_woken_up = B_FALSE; + } else { + ringp->s_ring_state |= S_RING_BLOCK; + ringp->s_ring_blocked_cnt++; + } + + ringp->s_ring_state &= ~S_RING_PROC; + ringp->s_ring_run = NULL; + return; + } else { + ringp->s_ring_tx_woken_up = B_FALSE; + if (is_subflow) { + FLOW_TX_STATS_UPDATE( + mac_srs->srs_flent, &stats); + } + } + } + + if (ringp->s_ring_count == 0 && ringp->s_ring_state & + (S_RING_TX_HIWAT | S_RING_WAKEUP_CLIENT | S_RING_ENQUEUED)) { + mac_tx_notify_cb_t *mtnfp; + mac_cb_t *mcb; + mac_client_impl_t *mcip = ringp->s_ring_mcip; + boolean_t wakeup_required = B_FALSE; + + if (ringp->s_ring_state & + (S_RING_TX_HIWAT|S_RING_WAKEUP_CLIENT)) { + wakeup_required = B_TRUE; + } + ringp->s_ring_state &= + ~(S_RING_TX_HIWAT | S_RING_WAKEUP_CLIENT | S_RING_ENQUEUED); + mutex_exit(&ringp->s_ring_lock); + if (wakeup_required) { + /* Wakeup callback registered clients */ + MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); + for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; + mcb = mcb->mcb_nextp) { + mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; + mtnfp->mtnf_fn(mtnfp->mtnf_arg, + (mac_tx_cookie_t)ringp); + } + MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, + &mcip->mci_tx_notify_cb_list); + /* + * If the client is not the primary MAC client, then we + * need to send the notification to the clients upper + * MAC, i.e. mci_upper_mip. + */ + mac_tx_notify(mcip->mci_upper_mip != NULL ? + mcip->mci_upper_mip : mcip->mci_mip); + } + mutex_enter(&ringp->s_ring_lock); + } + ringp->s_ring_state &= ~S_RING_PROC; + ringp->s_ring_run = NULL; +} diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c new file mode 100644 index 0000000000..1615060736 --- /dev/null +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -0,0 +1,823 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * MAC Services Module - misc utilities
+ */
+
+#include <sys/types.h>
+#include <sys/mac.h>
+#include <sys/mac_impl.h>
+#include <sys/mac_client_priv.h>
+#include <sys/mac_client_impl.h>
+#include <sys/mac_soft_ring.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/vlan.h>
+#include <sys/pattr.h>
+#include <sys/pci_tools.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <sys/vtrace.h>
+#include <sys/dlpi.h>
+#include <sys/sunndi.h>
+
+/*
+ * Copy an mblk, preserving its hardware checksum flags.
+ */
+static mblk_t *
+mac_copymsg_cksum(mblk_t *mp)
+{
+	mblk_t *mp1;
+	uint32_t start, stuff, end, value, flags;
+
+	mp1 = copymsg(mp);
+	if (mp1 == NULL)
+		return (NULL);
+
+	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags);
+	(void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value,
+	    flags, KM_NOSLEEP);
+
+	return (mp1);
+}
+
+/*
+ * Copy an mblk chain, preserving the hardware checksum flags of the
+ * individual mblks.
+ */
+mblk_t *
+mac_copymsgchain_cksum(mblk_t *mp)
+{
+	mblk_t *nmp = NULL;
+	mblk_t **nmpp = &nmp;
+
+	for (; mp != NULL; mp = mp->b_next) {
+		if ((*nmpp = mac_copymsg_cksum(mp)) == NULL) {
+			freemsgchain(nmp);
+			return (NULL);
+		}
+
+		nmpp = &((*nmpp)->b_next);
+	}
+
+	return (nmp);
+}
+
+/*
+ * Process the specified mblk chain for proper handling of hardware
+ * checksum offload. This routine is invoked for loopback traffic
+ * between MAC clients.
+ * The function handles a NULL mblk chain passed as an argument.
+ */
+mblk_t *
+mac_fix_cksum(mblk_t *mp_chain)
+{
+	mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1;
+	uint32_t flags, start, stuff, end, value;
+
+	for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) {
+		uint16_t len;
+		uint32_t offset;
+		struct ether_header *ehp;
+		uint16_t sap;
+
+		hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value,
+		    &flags);
+		if (flags == 0)
+			continue;
+
+		/*
+		 * Since the processing of checksum offload for loopback
+		 * traffic requires modification of the packet contents,
+		 * ensure that we are always modifying our own copy.
+		 */
+		if (DB_REF(mp) > 1) {
+			mp1 = copymsg(mp);
+			if (mp1 == NULL)
+				continue;
+			mp1->b_next = mp->b_next;
+			mp->b_next = NULL;
+			freemsg(mp);
+			if (prev != NULL)
+				prev->b_next = mp1;
+			else
+				new_chain = mp1;
+			mp = mp1;
+		}
+
+		/*
+		 * Ethernet, and optionally VLAN header.
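+		 * The two possible layouts:
+		 *
+		 *	untagged: dst[6] src[6] type[2]		  (offset 14)
+		 *	tagged:	  dst[6] src[6] tpid[2] tci[2] type[2]
+		 *		  (offset 18)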
+ */ + /* LINTED: improper alignment cast */ + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) { + struct ether_vlan_header *evhp; + + ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); + /* LINTED: improper alignment cast */ + evhp = (struct ether_vlan_header *)mp->b_rptr; + sap = ntohs(evhp->ether_type); + offset = sizeof (struct ether_vlan_header); + } else { + sap = ntohs(ehp->ether_type); + offset = sizeof (struct ether_header); + } + + if (MBLKL(mp) <= offset) { + offset -= MBLKL(mp); + if (mp->b_cont == NULL) { + /* corrupted packet, skip it */ + if (prev != NULL) + prev->b_next = mp->b_next; + else + new_chain = mp->b_next; + mp1 = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + mp = mp1; + continue; + } + mp = mp->b_cont; + } + + if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { + ipha_t *ipha = NULL; + + /* + * In order to compute the full and header + * checksums, we need to find and parse + * the IP and/or ULP headers. + */ + + sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; + + /* + * IP header. + */ + if (sap != ETHERTYPE_IP) + continue; + + ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); + /* LINTED: improper alignment cast */ + ipha = (ipha_t *)(mp->b_rptr + offset); + + if (flags & HCK_FULLCKSUM) { + ipaddr_t src, dst; + uint32_t cksum; + uint16_t *up; + uint8_t proto; + + /* + * Pointer to checksum field in ULP header. + */ + proto = ipha->ipha_protocol; + ASSERT(ipha->ipha_version_and_hdr_length == + IP_SIMPLE_HDR_VERSION); + if (proto == IPPROTO_TCP) { + /* LINTED: improper alignment cast */ + up = IPH_TCPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + } else { + ASSERT(proto == IPPROTO_UDP); + /* LINTED: improper alignment cast */ + up = IPH_UDPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + } + + /* + * Pseudo-header checksum. + */ + src = ipha->ipha_src; + dst = ipha->ipha_dst; + len = ntohs(ipha->ipha_length) - + IP_SIMPLE_HDR_LENGTH; + + cksum = (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum += htons(len); + + /* + * The checksum value stored in the packet needs + * to be correct. Compute it here. + */ + *up = 0; + cksum += (((proto) == IPPROTO_UDP) ? + IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); + cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + + offset, cksum); + *(up) = (uint16_t)(cksum ? cksum : ~cksum); + + flags |= HCK_FULLCKSUM_OK; + value = 0xffff; + } + + if (flags & HCK_IPV4_HDRCKSUM) { + ASSERT(ipha != NULL); + ipha->ipha_hdr_checksum = + (uint16_t)ip_csum_hdr(ipha); + } + } + + if (flags & HCK_PARTIALCKSUM) { + uint16_t *up, partial, cksum; + uchar_t *ipp; /* ptr to beginning of IP header */ + + if (mp->b_cont != NULL) { + mblk_t *mp1; + + mp1 = msgpullup(mp, offset + end); + if (mp1 == NULL) + continue; + mp1->b_next = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + if (prev != NULL) + prev->b_next = mp1; + else + new_chain = mp1; + mp = mp1; + } + + ipp = mp->b_rptr + offset; + /* LINTED: cast may result in improper alignment */ + up = (uint16_t *)((uchar_t *)ipp + stuff); + partial = *up; + *up = 0; + + cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, + end - start, partial); + cksum = ~cksum; + *up = cksum ? cksum : ~cksum; + + /* + * Since we already computed the whole checksum, + * indicate to the stack that it has already + * been verified by the hardware. 
+ */ + flags &= ~HCK_PARTIALCKSUM; + flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK); + value = 0xffff; + } + + (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, + value, flags, KM_NOSLEEP); + } + + return (new_chain); +} + +/* + * Add VLAN tag to the specified mblk. + */ +mblk_t * +mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) +{ + mblk_t *hmp; + struct ether_vlan_header *evhp; + struct ether_header *ehp; + uint32_t start, stuff, end, value, flags; + + ASSERT(pri != 0 || vid != 0); + + /* + * Allocate an mblk for the new tagged ethernet header, + * and copy the MAC addresses and ethertype from the + * original header. + */ + + hmp = allocb(sizeof (struct ether_vlan_header), BPRI_MED); + if (hmp == NULL) { + freemsg(mp); + return (NULL); + } + + evhp = (struct ether_vlan_header *)hmp->b_rptr; + ehp = (struct ether_header *)mp->b_rptr; + + bcopy(ehp, evhp, (ETHERADDRL * 2)); + evhp->ether_type = ehp->ether_type; + evhp->ether_tpid = htons(ETHERTYPE_VLAN); + + hmp->b_wptr += sizeof (struct ether_vlan_header); + mp->b_rptr += sizeof (struct ether_header); + + /* + * Free the original message if it's now empty. Link the + * rest of messages to the header message. + */ + hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); + (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags, + KM_NOSLEEP); + if (MBLKL(mp) == 0) { + hmp->b_cont = mp->b_cont; + freeb(mp); + } else { + hmp->b_cont = mp; + } + ASSERT(MBLKL(hmp) >= sizeof (struct ether_vlan_header)); + + /* + * Initialize the new TCI (Tag Control Information). + */ + evhp->ether_tci = htons(VLAN_TCI(pri, 0, vid)); + + return (hmp); +} + +/* + * Adds a VLAN tag with the specified VID and priority to each mblk of + * the specified chain. + */ +mblk_t * +mac_add_vlan_tag_chain(mblk_t *mp_chain, uint_t pri, uint16_t vid) +{ + mblk_t *next_mp, **prev, *mp; + + mp = mp_chain; + prev = &mp_chain; + + while (mp != NULL) { + next_mp = mp->b_next; + mp->b_next = NULL; + if ((mp = mac_add_vlan_tag(mp, pri, vid)) == NULL) { + freemsgchain(next_mp); + break; + } + *prev = mp; + prev = &mp->b_next; + mp = mp->b_next = next_mp; + } + + return (mp_chain); +} + +/* + * Strip VLAN tag + */ +mblk_t * +mac_strip_vlan_tag(mblk_t *mp) +{ + mblk_t *newmp; + struct ether_vlan_header *evhp; + + evhp = (struct ether_vlan_header *)mp->b_rptr; + if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) { + ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); + + if (DB_REF(mp) > 1) { + newmp = copymsg(mp); + if (newmp == NULL) + return (NULL); + freemsg(mp); + mp = newmp; + } + + evhp = (struct ether_vlan_header *)mp->b_rptr; + + ovbcopy(mp->b_rptr, mp->b_rptr + VLAN_TAGSZ, 2 * ETHERADDRL); + mp->b_rptr += VLAN_TAGSZ; + } + return (mp); +} + +/* + * Strip VLAN tag from each mblk of the chain. + */ +mblk_t * +mac_strip_vlan_tag_chain(mblk_t *mp_chain) +{ + mblk_t *mp, *next_mp, **prev; + + mp = mp_chain; + prev = &mp_chain; + + while (mp != NULL) { + next_mp = mp->b_next; + mp->b_next = NULL; + if ((mp = mac_strip_vlan_tag(mp)) == NULL) { + freemsgchain(next_mp); + break; + } + *prev = mp; + prev = &mp->b_next; + mp = mp->b_next = next_mp; + } + + return (mp_chain); +} + +/* + * Default callback function. Used when the datapath is not yet initialized. 
+ */
+/* ARGSUSED */
+void
+mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp,
+    boolean_t loopback)
+{
+	mblk_t *mp1 = mp;
+
+	while (mp1 != NULL) {
+		mp1->b_prev = NULL;
+		mp1->b_queue = NULL;
+		mp1 = mp1->b_next;
+	}
+	freemsgchain(mp);
+}
+
+/*
+ * Determines the IPv6 header length accounting for all the optional IPv6
+ * headers (hop-by-hop, destination, routing and fragment). The header length
+ * and next header value (a transport header) are captured.
+ *
+ * Returns B_FALSE if the IP headers are not all in the same mblk; otherwise
+ * returns B_TRUE.
+ */
+boolean_t
+mac_ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length,
+    uint8_t *next_hdr)
+{
+	uint16_t length;
+	uint_t	ehdrlen;
+	uint8_t	*whereptr;
+	uint8_t	*endptr;
+	uint8_t	*nexthdrp;
+	ip6_dest_t *desthdr;
+	ip6_rthdr_t *rthdr;
+	ip6_frag_t *fraghdr;
+
+	endptr = mp->b_wptr;
+	if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
+		return (B_FALSE);
+	ASSERT((IPH_HDR_VERSION(ip6h) & ~IP_FORWARD_PROG_BIT) == IPV6_VERSION);
+	length = IPV6_HDR_LEN;
+	whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
+
+	nexthdrp = &ip6h->ip6_nxt;
+	while (whereptr < endptr) {
+		/* Is there enough left for len + nexthdr? */
+		if (whereptr + MIN_EHDR_LEN > endptr)
+			break;
+
+		switch (*nexthdrp) {
+		case IPPROTO_HOPOPTS:
+		case IPPROTO_DSTOPTS:
+			/* Assumes the headers are identical for hbh and dst */
+			desthdr = (ip6_dest_t *)whereptr;
+			ehdrlen = 8 * (desthdr->ip6d_len + 1);
+			if ((uchar_t *)desthdr + ehdrlen > endptr)
+				return (B_FALSE);
+			nexthdrp = &desthdr->ip6d_nxt;
+			break;
+		case IPPROTO_ROUTING:
+			rthdr = (ip6_rthdr_t *)whereptr;
+			ehdrlen = 8 * (rthdr->ip6r_len + 1);
+			if ((uchar_t *)rthdr + ehdrlen > endptr)
+				return (B_FALSE);
+			nexthdrp = &rthdr->ip6r_nxt;
+			break;
+		case IPPROTO_FRAGMENT:
+			fraghdr = (ip6_frag_t *)whereptr;
+			ehdrlen = sizeof (ip6_frag_t);
+			if ((uchar_t *)&fraghdr[1] > endptr)
+				return (B_FALSE);
+			nexthdrp = &fraghdr->ip6f_nxt;
+			break;
+		case IPPROTO_NONE:
+			/* No next header means we're finished */
+		default:
+			*hdr_length = length;
+			*next_hdr = *nexthdrp;
+			return (B_TRUE);
+		}
+		length += ehdrlen;
+		whereptr += ehdrlen;
+		*hdr_length = length;
+		*next_hdr = *nexthdrp;
+	}
+	switch (*nexthdrp) {
+	case IPPROTO_HOPOPTS:
+	case IPPROTO_DSTOPTS:
+	case IPPROTO_ROUTING:
+	case IPPROTO_FRAGMENT:
+		/*
+		 * If any known extension headers are still to be processed,
+		 * the packet is malformed (or at least the IP headers are
+		 * not all in the same mblk, and that should never happen).
+		 */
+		return (B_FALSE);
+
+	default:
+		/*
+		 * If we get here, we know that all of the IP headers were in
+		 * the same mblk, even if the ULP header is in the next mblk.
+		 */
+		*hdr_length = length;
+		*next_hdr = *nexthdrp;
+		return (B_TRUE);
+	}
+}
+
+typedef struct mac_dladm_intr {
+	int	ino;
+	int	cpu_id;
+	char	driver_path[MAXPATHLEN];
+	char	nexus_path[MAXPATHLEN];
+} mac_dladm_intr_t;
+
+/* Bind the interrupt to cpu_num */
+static int
+mac_set_intr(ldi_handle_t lh, processorid_t cpu_num, int ino)
+{
+	pcitool_intr_set_t iset;
+	int err;
+
+	iset.ino = ino;
+	iset.cpu_id = cpu_num;
+	iset.user_version = PCITOOL_VERSION;
+	err = ldi_ioctl(lh, PCITOOL_DEVICE_SET_INTR, (intptr_t)&iset, FKIOCTL,
+	    kcred, NULL);
+
+	return (err);
+}
+
+/*
+ * Search interrupt information.
iget is filled in with the info to search + */ +static boolean_t +mac_search_intrinfo(pcitool_intr_get_t *iget_p, mac_dladm_intr_t *dln) +{ + int i; + char driver_path[2 * MAXPATHLEN]; + + for (i = 0; i < iget_p->num_devs; i++) { + (void) strlcpy(driver_path, iget_p->dev[i].path, MAXPATHLEN); + (void) snprintf(&driver_path[strlen(driver_path)], MAXPATHLEN, + ":%s%d", iget_p->dev[i].driver_name, + iget_p->dev[i].dev_inst); + /* Match the device path for the device path */ + if (strcmp(driver_path, dln->driver_path) == 0) { + dln->ino = iget_p->ino; + dln->cpu_id = iget_p->cpu_id; + return (B_TRUE); + } + } + return (B_FALSE); +} + +/* + * Get information about ino, i.e. if this is the interrupt for our + * device and where it is bound etc. + */ +static boolean_t +mac_get_single_intr(ldi_handle_t lh, int ino, mac_dladm_intr_t *dln) +{ + pcitool_intr_get_t *iget_p; + int ipsz; + int nipsz; + int err; + uint8_t inum; + + /* + * Check if SLEEP is OK, i.e if could come here in response to + * changing the fanout due to some callback from the driver, say + * link speed changes. + */ + ipsz = PCITOOL_IGET_SIZE(0); + iget_p = kmem_zalloc(ipsz, KM_SLEEP); + + iget_p->num_devs_ret = 0; + iget_p->user_version = PCITOOL_VERSION; + iget_p->ino = ino; + + err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, + FKIOCTL, kcred, NULL); + if (err != 0) { + kmem_free(iget_p, ipsz); + return (B_FALSE); + } + if (iget_p->num_devs == 0) { + kmem_free(iget_p, ipsz); + return (B_FALSE); + } + inum = iget_p->num_devs; + if (iget_p->num_devs_ret < iget_p->num_devs) { + /* Reallocate */ + nipsz = PCITOOL_IGET_SIZE(iget_p->num_devs); + + kmem_free(iget_p, ipsz); + ipsz = nipsz; + iget_p = kmem_zalloc(ipsz, KM_SLEEP); + + iget_p->num_devs_ret = inum; + iget_p->ino = ino; + iget_p->user_version = PCITOOL_VERSION; + err = ldi_ioctl(lh, PCITOOL_DEVICE_GET_INTR, (intptr_t)iget_p, + FKIOCTL, kcred, NULL); + if (err != 0) { + kmem_free(iget_p, ipsz); + return (B_FALSE); + } + /* defensive */ + if (iget_p->num_devs != iget_p->num_devs_ret) { + kmem_free(iget_p, ipsz); + return (B_FALSE); + } + } + + if (mac_search_intrinfo(iget_p, dln)) { + kmem_free(iget_p, ipsz); + return (B_TRUE); + } + kmem_free(iget_p, ipsz); + return (B_FALSE); +} + +/* + * Get the interrupts and check each one to see if it is for our device. + */ +static int +mac_validate_intr(ldi_handle_t lh, mac_dladm_intr_t *dln, processorid_t cpuid) +{ + pcitool_intr_info_t intr_info; + int err; + int ino; + + err = ldi_ioctl(lh, PCITOOL_SYSTEM_INTR_INFO, (intptr_t)&intr_info, + FKIOCTL, kcred, NULL); + if (err != 0) + return (-1); + + for (ino = 0; ino < intr_info.num_intr; ino++) { + if (mac_get_single_intr(lh, ino, dln)) { + if (dln->cpu_id == cpuid) + return (0); + return (1); + } + } + return (-1); +} + +/* + * Obtain the nexus parent node info. for mdip. 
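+ * The routine walks up the devinfo tree from mdip looking for a minor
+ * node of type DDI_NT_INTRCTL, and records the nexus ":intr" device
+ * path, e.g. (hypothetically) "/devices/pci@7c0:intr".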
+ */ +static dev_info_t * +mac_get_nexus_node(dev_info_t *mdip, mac_dladm_intr_t *dln) +{ + struct dev_info *tdip = (struct dev_info *)mdip; + struct ddi_minor_data *minordata; + int circ; + dev_info_t *pdip; + char pathname[MAXPATHLEN]; + + while (tdip != NULL) { + ndi_devi_enter((dev_info_t *)tdip, &circ); + for (minordata = tdip->devi_minor; minordata != NULL; + minordata = minordata->next) { + if (strncmp(minordata->ddm_node_type, DDI_NT_INTRCTL, + strlen(DDI_NT_INTRCTL)) == 0) { + pdip = minordata->dip; + (void) ddi_pathname(pdip, pathname); + (void) snprintf(dln->nexus_path, MAXPATHLEN, + "/devices%s:intr", pathname); + (void) ddi_pathname_minor(minordata, pathname); + ndi_devi_exit((dev_info_t *)tdip, circ); + return (pdip); + } + } + ndi_devi_exit((dev_info_t *)tdip, circ); + tdip = tdip->devi_parent; + } + return (NULL); +} + +/* + * For a primary MAC client, if the user has set a list or CPUs or + * we have obtained it implicitly, we try to retarget the interrupt + * for that device on one of the CPUs in the list. + * We assign the interrupt to the same CPU as the poll thread. + */ +static boolean_t +mac_check_interrupt_binding(dev_info_t *mdip, int32_t cpuid) +{ + ldi_handle_t lh = NULL; + ldi_ident_t li = NULL; + int err; + int ret; + mac_dladm_intr_t dln; + dev_info_t *dip; + struct ddi_minor_data *minordata; + + dln.nexus_path[0] = '\0'; + dln.driver_path[0] = '\0'; + + minordata = ((struct dev_info *)mdip)->devi_minor; + while (minordata != NULL) { + if (minordata->type == DDM_MINOR) + break; + minordata = minordata->next; + } + if (minordata == NULL) + return (B_FALSE); + + (void) ddi_pathname_minor(minordata, dln.driver_path); + + dip = mac_get_nexus_node(mdip, &dln); + /* defensive */ + if (dip == NULL) + return (B_FALSE); + + err = ldi_ident_from_major(ddi_driver_major(dip), &li); + if (err != 0) + return (B_FALSE); + + err = ldi_open_by_name(dln.nexus_path, FREAD|FWRITE, kcred, &lh, li); + if (err != 0) + return (B_FALSE); + + ret = mac_validate_intr(lh, &dln, cpuid); + if (ret < 0) { + (void) ldi_close(lh, FREAD|FWRITE, kcred); + return (B_FALSE); + } + /* cmn_note? */ + if (ret != 0) + if ((err = (mac_set_intr(lh, cpuid, dln.ino))) != 0) { + (void) ldi_close(lh, FREAD|FWRITE, kcred); + return (B_FALSE); + } + (void) ldi_close(lh, FREAD|FWRITE, kcred); + return (B_TRUE); +} + +void +mac_client_set_intr_cpu(void *arg, mac_client_handle_t mch, int32_t cpuid) +{ + dev_info_t *mdip = (dev_info_t *)arg; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_resource_props_t *mrp; + mac_perim_handle_t mph; + + if (cpuid == -1 || !mac_check_interrupt_binding(mdip, cpuid)) + return; + + mac_perim_enter_by_mh((mac_handle_t)mcip->mci_mip, &mph); + mrp = MCIP_RESOURCE_PROPS(mcip); + mrp->mrp_intr_cpu = cpuid; + mac_perim_exit(mph); +} + +int32_t +mac_client_intr_cpu(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_cpus_t *srs_cpu; + mac_soft_ring_set_t *rx_srs; + flow_entry_t *flent = mcip->mci_flent; + mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); + + /* + * Check if we need to retarget the interrupt. We do this only + * for the primary MAC client. We do this if we have the only + * exclusive ring in the group. 
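+ * When fe_rx_srs_cnt == 2, fe_rx_srs[1] is the SRS for the exclusive
+ * ring; the interrupt is retargeted to the CPU of that SRS's poll
+ * thread (mc_pollid), unless it is already bound there.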
+ */ + if (mac_is_primary_client(mcip) && flent->fe_rx_srs_cnt == 2) { + rx_srs = flent->fe_rx_srs[1]; + srs_cpu = &rx_srs->srs_cpu; + if (mrp->mrp_intr_cpu == srs_cpu->mc_pollid) + return (-1); + return (srs_cpu->mc_pollid); + } + return (-1); +} + +void * +mac_get_devinfo(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return ((void *)mip->mi_dip); +} diff --git a/usr/src/uts/common/io/mac/plugins/mac_ether.c b/usr/src/uts/common/io/mac/plugins/mac_ether.c index f4cf08eb66..abaab66add 100644 --- a/usr/src/uts/common/io/mac/plugins/mac_ether.c +++ b/usr/src/uts/common/io/mac/plugins/mac_ether.c @@ -30,9 +30,8 @@ #include <sys/types.h> #include <sys/modctl.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/dld_impl.h> #include <sys/mac_ether.h> -#include <sys/dls.h> #include <sys/ethernet.h> #include <sys/byteorder.h> #include <sys/strsun.h> diff --git a/usr/src/uts/common/io/mac/plugins/mac_wifi.c b/usr/src/uts/common/io/mac/plugins/mac_wifi.c index 668d7dbda1..fb45c8ef1c 100644 --- a/usr/src/uts/common/io/mac/plugins/mac_wifi.c +++ b/usr/src/uts/common/io/mac/plugins/mac_wifi.c @@ -32,9 +32,8 @@ #include <sys/types.h> #include <sys/modctl.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/dld_impl.h> #include <sys/mac_wifi.h> -#include <sys/dls.h> #include <sys/ethernet.h> #include <sys/byteorder.h> #include <sys/strsun.h> diff --git a/usr/src/uts/common/io/mxfe/mxfe.c b/usr/src/uts/common/io/mxfe/mxfe.c index 9470ac6b6b..044274acbf 100644 --- a/usr/src/uts/common/io/mxfe/mxfe.c +++ b/usr/src/uts/common/io/mxfe/mxfe.c @@ -177,7 +177,6 @@ static mac_callbacks_t mxfe_m_callbacks = { mxfe_m_multicst, mxfe_m_unicst, mxfe_m_tx, - NULL, /* mc_resources */ NULL, /* mc_ioctl */ NULL, /* mc_getcapab */ NULL, /* mc_open */ diff --git a/usr/src/uts/common/io/mxfe/mxfeimpl.h b/usr/src/uts/common/io/mxfe/mxfeimpl.h index c1bc8ab265..d5742eeceb 100644 --- a/usr/src/uts/common/io/mxfe/mxfeimpl.h +++ b/usr/src/uts/common/io/mxfe/mxfeimpl.h @@ -36,14 +36,14 @@ #ifndef _MXFEIMPL_H #define _MXFEIMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This entire file is private to the MXFE driver. */ #ifdef _KERNEL +#include <sys/mac_provider.h> + /* * Compile time tunables. */ diff --git a/usr/src/uts/common/io/net80211/net80211.c b/usr/src/uts/common/io/net80211/net80211.c index 4b74943c85..fd49066fcc 100644 --- a/usr/src/uts/common/io/net80211/net80211.c +++ b/usr/src/uts/common/io/net80211/net80211.c @@ -35,8 +35,6 @@ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * IEEE 802.11 generic handler */ @@ -47,6 +45,7 @@ #include <sys/modctl.h> #include <sys/stropts.h> #include <sys/door.h> +#include <sys/mac_provider.h> #include "net80211_impl.h" uint32_t ieee80211_debug = 0x0; /* debug msg flags */ diff --git a/usr/src/uts/common/io/net80211/net80211_input.c b/usr/src/uts/common/io/net80211/net80211_input.c index ca948788d0..eb95149ea6 100644 --- a/usr/src/uts/common/io/net80211/net80211_input.c +++ b/usr/src/uts/common/io/net80211/net80211_input.c @@ -39,6 +39,7 @@ * Process received frame */ +#include <sys/mac_provider.h> #include <sys/byteorder.h> #include <sys/strsun.h> #include "net80211_impl.h" diff --git a/usr/src/uts/common/io/net80211/net80211_ioctl.c b/usr/src/uts/common/io/net80211/net80211_ioctl.c index 8e905971ff..44935e0979 100644 --- a/usr/src/uts/common/io/net80211/net80211_ioctl.c +++ b/usr/src/uts/common/io/net80211/net80211_ioctl.c @@ -41,7 +41,7 @@ #include <inet/nd.h> #include <inet/mi.h> #include <sys/note.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <inet/wifi_ioctl.h> #include "net80211_impl.h" diff --git a/usr/src/uts/common/io/nge/nge.h b/usr/src/uts/common/io/nge/nge.h index 430df8b83b..2944c6b820 100644 --- a/usr/src/uts/common/io/nge/nge.h +++ b/usr/src/uts/common/io/nge/nge.h @@ -61,7 +61,7 @@ extern "C" { #include <sys/ddi.h> #include <sys/sunddi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> /* diff --git a/usr/src/uts/common/io/nge/nge_main.c b/usr/src/uts/common/io/nge/nge_main.c index 7ea4165779..f7b22f86e6 100644 --- a/usr/src/uts/common/io/nge/nge_main.c +++ b/usr/src/uts/common/io/nge/nge_main.c @@ -196,7 +196,6 @@ static mac_callbacks_t nge_m_callbacks = { nge_m_multicst, nge_m_unicst, nge_m_tx, - NULL, nge_m_ioctl, nge_m_getcapab, NULL, @@ -2137,12 +2136,6 @@ nge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); break; } - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE, stating that we support polling is sufficient. 
- */ - break; default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/ntxn/unm_nic.h b/usr/src/uts/common/io/ntxn/unm_nic.h index 6c8232757f..e23c385ce5 100644 --- a/usr/src/uts/common/io/ntxn/unm_nic.h +++ b/usr/src/uts/common/io/ntxn/unm_nic.h @@ -54,7 +54,7 @@ #include <inet/mi.h> #include <inet/nd.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/miiregs.h> /* by fjlite out of intel */ diff --git a/usr/src/uts/common/io/ntxn/unm_nic_main.c b/usr/src/uts/common/io/ntxn/unm_nic_main.c index b7e0c5832d..3db781fc8f 100644 --- a/usr/src/uts/common/io/ntxn/unm_nic_main.c +++ b/usr/src/uts/common/io/ntxn/unm_nic_main.c @@ -2513,9 +2513,6 @@ ntxn_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM); } break; - - case MAC_CAPAB_POLL: - case MAC_CAPAB_MULTIADDRESS: default: return (B_FALSE); } @@ -2534,7 +2531,6 @@ static mac_callbacks_t ntxn_m_callbacks = { ntxn_m_multicst, ntxn_m_unicst, ntxn_m_tx, - NULL, /* mc_resources */ ntxn_m_ioctl, ntxn_m_getcapab, NULL, /* mc_open */ diff --git a/usr/src/uts/common/io/nxge/nxge_fzc.c b/usr/src/uts/common/io/nxge/nxge_fzc.c index 91b5712895..3831d77eed 100644 --- a/usr/src/uts/common/io/nxge/nxge_fzc.c +++ b/usr/src/uts/common/io/nxge/nxge_fzc.c @@ -942,15 +942,18 @@ nxge_fzc_rdc_tbl_unbind(p_nxge_t nxge, int rdc_tbl) NXGE_DEBUG_MSG((nxge, DMA_CTL, "==> nxge_fzc_rdc_tbl_unbind(%d)", rdc_tbl)); + MUTEX_ENTER(&nhd->lock); table = &nhd->rdc_tbl[rdc_tbl]; if (table->nxge != (uintptr_t)nxge) { NXGE_ERROR_MSG((nxge, DMA_CTL, "nxge_fzc_rdc_tbl_unbind(%d): func%d not owner", nxge->function_num, rdc_tbl)); + MUTEX_EXIT(&nhd->lock); return (EINVAL); } else { bzero(table, sizeof (*table)); } + MUTEX_EXIT(&nhd->lock); NXGE_DEBUG_MSG((nxge, DMA_CTL, "<== nxge_fzc_rdc_tbl_unbind(%d)", rdc_tbl)); diff --git a/usr/src/uts/common/io/nxge/nxge_hcall.s b/usr/src/uts/common/io/nxge/nxge_hcall.s index c9f82b52df..56c85945b5 100644 --- a/usr/src/uts/common/io/nxge/nxge_hcall.s +++ b/usr/src/uts/common/io/nxge/nxge_hcall.s @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Hypervisor calls called by niu leaf driver. 
*/ @@ -34,6 +32,8 @@ #include <sys/hypervisor_api.h> #include <sys/nxge/nxge_impl.h> +#if defined(sun4v) + /* * NIU HV API v1.0 definitions */ @@ -518,3 +518,5 @@ hv_niu_vrrx_set_ino(uint32_t cookie, uint64_t vridx, uint32_t ino) SET_SIZE(hv_niu_vrtx_param_set) #endif /* lint || __lint */ + +#endif /*defined(sun4v)*/ diff --git a/usr/src/uts/common/io/nxge/nxge_hio.c b/usr/src/uts/common/io/nxge/nxge_hio.c index f4aa20706d..2b9a972fec 100644 --- a/usr/src/uts/common/io/nxge/nxge_hio.c +++ b/usr/src/uts/common/io/nxge/nxge_hio.c @@ -34,6 +34,7 @@ * */ +#include <sys/mac_provider.h> #include <sys/nxge/nxge_impl.h> #include <sys/nxge/nxge_fzc.h> #include <sys/nxge/nxge_rxdma.h> @@ -49,7 +50,9 @@ extern npi_status_t npi_rxdma_dump_rdc_table(npi_handle_t, uint8_t); /* The following function may be found in nxge_main.c */ -extern int nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot); +extern int nxge_m_mmac_remove(void *arg, int slot); +extern int nxge_m_mmac_add_g(void *arg, const uint8_t *maddr, int rdctbl, + boolean_t usetbl); /* The following function may be found in nxge_[t|r]xdma.c */ extern npi_status_t nxge_txdma_channel_disable(nxge_t *, int); @@ -129,6 +132,7 @@ int nxge_hio_init(nxge_t *nxge) { nxge_hio_data_t *nhd; + int i; nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; if (nhd == 0) { @@ -137,6 +141,31 @@ nxge_hio_init(nxge_t *nxge) nxge->nxge_hw_p->hio = (uintptr_t)nhd; } + /* + * Initialize share and ring group structures. + */ + for (i = 0; i < NXGE_MAX_TDCS; i++) + nxge->tdc_is_shared[i] = B_FALSE; + + for (i = 0; i < NXGE_MAX_TDC_GROUPS; i++) { + nxge->tx_hio_groups[i].ghandle = NULL; + nxge->tx_hio_groups[i].nxgep = nxge; + nxge->tx_hio_groups[i].type = MAC_RING_TYPE_TX; + nxge->tx_hio_groups[i].gindex = 0; + nxge->tx_hio_groups[i].sindex = 0; + } + + for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) { + nxge->rx_hio_groups[i].ghandle = NULL; + nxge->rx_hio_groups[i].nxgep = nxge; + nxge->rx_hio_groups[i].type = MAC_RING_TYPE_RX; + nxge->rx_hio_groups[i].gindex = 0; + nxge->rx_hio_groups[i].sindex = 0; + nxge->rx_hio_groups[i].started = B_FALSE; + nxge->rx_hio_groups[i].rdctbl = -1; + nxge->rx_hio_groups[i].n_mac_addrs = 0; + } + nhd->hio.ldoms = B_FALSE; return (NXGE_OK); @@ -400,7 +429,7 @@ nxge_grp_dc_add( NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_grp_dc_add")); - if (group == NULL) + if (group == 0) return (0); switch (type) { @@ -424,7 +453,6 @@ nxge_grp_dc_add( default: NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_grp_dc_add: unknown type channel(%d)", channel)); - return (NXGE_ERROR); } NXGE_DEBUG_MSG((nxge, HIO_CTL, @@ -540,9 +568,6 @@ nxge_grp_dc_remove( MUTEX_ENTER(&nhd->lock); set = dc->type == VP_BOUND_TX ? &nxge->tx_set : &nxge->rx_set; - if (isLDOMs(nxge) && ((1 << channel) && set->shared.map)) { - NXGE_DC_RESET(group->map, channel); - } /* Remove the DC from its group. 
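 * The channel bit in the group map is now cleared inside
 * nxge_grp_dc_unlink() itself, so an explicit NXGE_DC_RESET() is no
 * longer needed here.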
*/ if (nxge_grp_dc_unlink(nxge, group, channel) != dc) { @@ -663,7 +688,10 @@ nxge_grp_dc_append( * Any domain */ nxge_hio_dc_t * -nxge_grp_dc_unlink(nxge_t *nxge, nxge_grp_t *group, int channel) +nxge_grp_dc_unlink( + nxge_t *nxge, + nxge_grp_t *group, + int channel) { nxge_hio_dc_t *current, *previous; @@ -699,6 +727,7 @@ nxge_grp_dc_unlink(nxge_t *nxge, nxge_grp_t *group, int channel) current->next = 0; current->group = 0; + NXGE_DC_RESET(group->map, channel); group->count--; } @@ -914,15 +943,14 @@ nxge_ddi_perror( * Local prototypes */ static nxge_hio_vr_t *nxge_hio_vr_share(nxge_t *); - -static int nxge_hio_dc_share(nxge_t *, nxge_hio_vr_t *, mac_ring_type_t); static void nxge_hio_unshare(nxge_hio_vr_t *); -static int nxge_hio_addres(nxge_hio_vr_t *, mac_ring_type_t, int); +static int nxge_hio_addres(nxge_hio_vr_t *, mac_ring_type_t, uint64_t *); static void nxge_hio_remres(nxge_hio_vr_t *, mac_ring_type_t, res_map_t); -static void nxge_hio_tdc_unshare(nxge_t *nxge, int channel); -static void nxge_hio_rdc_unshare(nxge_t *nxge, int channel); +static void nxge_hio_tdc_unshare(nxge_t *nxge, int dev_grpid, int channel); +static void nxge_hio_rdc_unshare(nxge_t *nxge, int dev_grpid, int channel); +static int nxge_hio_dc_share(nxge_t *, nxge_hio_vr_t *, mac_ring_type_t, int); static void nxge_hio_dc_unshare(nxge_t *, nxge_hio_vr_t *, mac_ring_type_t, int); @@ -967,6 +995,28 @@ nxge_hio_init( } } + /* + * Initialize share and ring group structures. + */ + for (i = 0; i < NXGE_MAX_TDC_GROUPS; i++) { + nxge->tx_hio_groups[i].ghandle = NULL; + nxge->tx_hio_groups[i].nxgep = nxge; + nxge->tx_hio_groups[i].type = MAC_RING_TYPE_TX; + nxge->tx_hio_groups[i].gindex = 0; + nxge->tx_hio_groups[i].sindex = 0; + } + + for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) { + nxge->rx_hio_groups[i].ghandle = NULL; + nxge->rx_hio_groups[i].nxgep = nxge; + nxge->rx_hio_groups[i].type = MAC_RING_TYPE_RX; + nxge->rx_hio_groups[i].gindex = 0; + nxge->rx_hio_groups[i].sindex = 0; + nxge->rx_hio_groups[i].started = B_FALSE; + nxge->rx_hio_groups[i].rdctbl = -1; + nxge->rx_hio_groups[i].n_mac_addrs = 0; + } + if (!isLDOMs(nxge)) { nhd->hio.ldoms = B_FALSE; return (NXGE_OK); @@ -983,22 +1033,15 @@ nxge_hio_init( nhd->vrs = NXGE_VR_SR_MAX - 2; /* - * Initialize tdc share state, shares and ring group structures. + * Initialize the share stuctures. */ for (i = 0; i < NXGE_MAX_TDCS; i++) nxge->tdc_is_shared[i] = B_FALSE; - for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) { - nxge->rx_hio_groups[i].ghandle = NULL; - nxge->rx_hio_groups[i].nxgep = nxge; - nxge->rx_hio_groups[i].gindex = 0; - nxge->rx_hio_groups[i].sindex = 0; - } - for (i = 0; i < NXGE_VR_SR_MAX; i++) { nxge->shares[i].nxgep = nxge; nxge->shares[i].index = 0; - nxge->shares[i].vrp = (void *)NULL; + nxge->shares[i].vrp = NULL; nxge->shares[i].tmap = 0; nxge->shares[i].rmap = 0; nxge->shares[i].rxgroup = 0; @@ -1033,77 +1076,251 @@ nxge_hio_init( return (0); } +#endif /* defined(sun4v) */ + +static int +nxge_hio_group_mac_add(nxge_t *nxge, nxge_ring_group_t *g, + const uint8_t *macaddr) +{ + int rv; + nxge_rdc_grp_t *group; + + mutex_enter(nxge->genlock); + + /* + * Initialize the NXGE RDC table data structure. + */ + group = &nxge->pt_config.rdc_grps[g->rdctbl]; + if (!group->flag) { + group->port = NXGE_GET_PORT_NUM(nxge->function_num); + group->config_method = RDC_TABLE_ENTRY_METHOD_REP; + group->flag = B_TRUE; /* This group has been configured. */ + } + + mutex_exit(nxge->genlock); + + /* + * Add the MAC address. 
+ */ + if ((rv = nxge_m_mmac_add_g((void *)nxge, macaddr, + g->rdctbl, B_TRUE)) != 0) { + return (rv); + } + + mutex_enter(nxge->genlock); + g->n_mac_addrs++; + mutex_exit(nxge->genlock); + return (0); +} static int nxge_hio_add_mac(void *arg, const uint8_t *mac_addr) { - nxge_rx_ring_group_t *rxgroup = (nxge_rx_ring_group_t *)arg; - p_nxge_t nxge = rxgroup->nxgep; - int group = rxgroup->gindex; - int rv, sindex; + nxge_ring_group_t *group = (nxge_ring_group_t *)arg; + p_nxge_t nxge = group->nxgep; + int rv; nxge_hio_vr_t *vr; /* The Virtualization Region */ - sindex = nxge->rx_hio_groups[group].sindex; - vr = (nxge_hio_vr_t *)nxge->shares[sindex].vrp; + ASSERT(group->type == MAC_RING_TYPE_RX); + + mutex_enter(nxge->genlock); /* - * Program the mac address for the group/share. + * If the group is associated with a VR, then only one + * address may be assigned to the group. */ - if ((rv = nxge_hio_hostinfo_init(nxge, vr, - (ether_addr_t *)mac_addr)) != 0) { + vr = (nxge_hio_vr_t *)nxge->shares[group->sindex].vrp; + if ((vr != NULL) && (group->n_mac_addrs)) { + mutex_exit(nxge->genlock); + return (ENOSPC); + } + + mutex_exit(nxge->genlock); + + /* + * Program the mac address for the group. + */ + if ((rv = nxge_hio_group_mac_add(nxge, group, + mac_addr)) != 0) { return (rv); } return (0); } +static int +find_mac_slot(nxge_mmac_t *mmac_info, const uint8_t *mac_addr) +{ + int i; + for (i = 0; i <= mmac_info->num_mmac; i++) { + if (memcmp(mmac_info->mac_pool[i].addr, mac_addr, + ETHERADDRL) == 0) { + return (i); + } + } + return (-1); +} + /* ARGSUSED */ static int nxge_hio_rem_mac(void *arg, const uint8_t *mac_addr) { - nxge_rx_ring_group_t *rxgroup = (nxge_rx_ring_group_t *)arg; - p_nxge_t nxge = rxgroup->nxgep; - int group = rxgroup->gindex; - int sindex; - nxge_hio_vr_t *vr; /* The Virtualization Region */ + nxge_ring_group_t *group = (nxge_ring_group_t *)arg; + p_nxge_t nxge = group->nxgep; + nxge_mmac_t *mmac_info; + int rv, slot; + + ASSERT(group->type == MAC_RING_TYPE_RX); + + mutex_enter(nxge->genlock); + + mmac_info = &nxge->nxge_mmac_info; + slot = find_mac_slot(mmac_info, mac_addr); + if (slot < 0) { + mutex_exit(nxge->genlock); + return (EINVAL); + } + + mutex_exit(nxge->genlock); + + /* + * Remove the mac address for the group + */ + if ((rv = nxge_m_mmac_remove(nxge, slot)) != 0) { + return (rv); + } + + mutex_enter(nxge->genlock); + group->n_mac_addrs--; + mutex_exit(nxge->genlock); + + return (0); +} - sindex = nxge->rx_hio_groups[group].sindex; - vr = (nxge_hio_vr_t *)nxge->shares[sindex].vrp; +static int +nxge_hio_group_start(mac_group_driver_t gdriver) +{ + nxge_ring_group_t *group = (nxge_ring_group_t *)gdriver; + int rdctbl; + int dev_gindex; + + ASSERT(group->type == MAC_RING_TYPE_RX); + +#ifdef later + ASSERT(group->nxgep->nxge_mac_state == NXGE_MAC_STARTED); +#endif + if (group->nxgep->nxge_mac_state != NXGE_MAC_STARTED) + return (ENXIO); + + mutex_enter(group->nxgep->genlock); + dev_gindex = group->nxgep->pt_config.hw_config.def_mac_rxdma_grpid + + group->gindex; /* - * Remove the mac address for the group/share. + * Get an rdc table for this group. + * Group ID is given by the caller, and that's the group it needs + * to bind to. The default group is already bound when the driver + * was attached. + * + * For Group 0, it's RDC table was allocated at attach time + * no need to allocate a new table. 
*/ - nxge_hio_hostinfo_uninit(nxge, vr); + if (group->gindex != 0) { + rdctbl = nxge_fzc_rdc_tbl_bind(group->nxgep, + dev_gindex, B_TRUE); + if (rdctbl < 0) { + mutex_exit(group->nxgep->genlock); + return (rdctbl); + } + } else { + rdctbl = group->nxgep->pt_config.hw_config.def_mac_rxdma_grpid; + } + + group->rdctbl = rdctbl; + + (void) nxge_init_fzc_rdc_tbl(group->nxgep, rdctbl); + + group->started = B_TRUE; + mutex_exit(group->nxgep->genlock); return (0); } +static void +nxge_hio_group_stop(mac_group_driver_t gdriver) +{ + nxge_ring_group_t *group = (nxge_ring_group_t *)gdriver; + + ASSERT(group->type == MAC_RING_TYPE_RX); + + mutex_enter(group->nxgep->genlock); + group->started = B_FALSE; + + /* + * Unbind the RDC table previously bound for this group. + * + * Since RDC table for group 0 was allocated at attach + * time, no need to unbind the table here. + */ + if (group->gindex != 0) + (void) nxge_fzc_rdc_tbl_unbind(group->nxgep, group->rdctbl); + + mutex_exit(group->nxgep->genlock); +} + /* ARGSUSED */ void -nxge_hio_group_get(void *arg, mac_ring_type_t type, int group, +nxge_hio_group_get(void *arg, mac_ring_type_t type, int groupid, mac_group_info_t *infop, mac_group_handle_t ghdl) { - p_nxge_t nxgep = (p_nxge_t)arg; - nxge_rx_ring_group_t *rxgroup; + p_nxge_t nxgep = (p_nxge_t)arg; + nxge_ring_group_t *group; + int dev_gindex; switch (type) { case MAC_RING_TYPE_RX: - rxgroup = &nxgep->rx_hio_groups[group]; - rxgroup->gindex = group; - - infop->mrg_driver = (mac_group_driver_t)rxgroup; - infop->mrg_start = NULL; - infop->mrg_stop = NULL; - infop->mrg_addmac = nxge_hio_add_mac; - infop->mrg_remmac = nxge_hio_rem_mac; - infop->mrg_count = NXGE_HIO_SHARE_MAX_CHANNELS; + group = &nxgep->rx_hio_groups[groupid]; + group->nxgep = nxgep; + group->ghandle = ghdl; + group->gindex = groupid; + group->sindex = 0; /* not yet bound to a share */ + + dev_gindex = nxgep->pt_config.hw_config.def_mac_rxdma_grpid + + groupid; + + infop->mgi_driver = (mac_group_driver_t)group; + infop->mgi_start = nxge_hio_group_start; + infop->mgi_stop = nxge_hio_group_stop; + infop->mgi_addmac = nxge_hio_add_mac; + infop->mgi_remmac = nxge_hio_rem_mac; + infop->mgi_count = + nxgep->pt_config.rdc_grps[dev_gindex].max_rdcs; break; case MAC_RING_TYPE_TX: + /* + * 'groupid' for TX should be incremented by one since + * the default group (groupid 0) is not known by the MAC layer + */ + group = &nxgep->tx_hio_groups[groupid + 1]; + group->nxgep = nxgep; + group->ghandle = ghdl; + group->gindex = groupid + 1; + group->sindex = 0; /* not yet bound to a share */ + + infop->mgi_driver = (mac_group_driver_t)group; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = NULL; /* not needed */ + infop->mgi_remmac = NULL; /* not needed */ + /* no rings associated with group initially */ + infop->mgi_count = 0; break; } } +#if defined(sun4v) + int nxge_hio_share_assign( nxge_t *nxge, @@ -1126,7 +1343,6 @@ nxge_hio_share_assign( NXGE_ERROR_MSG((nxge, HIO_CTL, "nxge_hio_share_assign: " "vr->assign() returned %d", hv_rv)); - nxge_hio_unshare(vr); return (-EIO); } @@ -1189,7 +1405,7 @@ nxge_hio_share_assign( return (0); } -int +void nxge_hio_share_unassign( nxge_hio_vr_t *vr) { @@ -1237,23 +1453,15 @@ nxge_hio_share_unassign( vr->cookie, hv_rv)); } } - - return (0); } int -nxge_hio_share_alloc(void *arg, uint64_t cookie, uint64_t *rcookie, - mac_share_handle_t *shandle) +nxge_hio_share_alloc(void *arg, mac_share_handle_t *shandle) { - p_nxge_t nxge = (p_nxge_t)arg; - nxge_rx_ring_group_t *rxgroup; - 
nxge_share_handle_t *shp; - - nxge_hio_vr_t *vr; /* The Virtualization Region */ - uint64_t rmap, tmap; - int rdctbl, rv; - - nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; + p_nxge_t nxge = (p_nxge_t)arg; + nxge_share_handle_t *shp; + nxge_hio_vr_t *vr; /* The Virtualization Region */ + nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_share")); @@ -1269,65 +1477,257 @@ nxge_hio_share_alloc(void *arg, uint64_t cookie, uint64_t *rcookie, if ((vr = nxge_hio_vr_share(nxge)) == 0) return (EAGAIN); + shp = &nxge->shares[vr->region]; + shp->nxgep = nxge; + shp->index = vr->region; + shp->vrp = (void *)vr; + shp->tmap = shp->rmap = 0; /* to be assigned by ms_sbind */ + shp->rxgroup = 0; /* to be assigned by ms_sadd */ + shp->active = B_FALSE; /* not bound yet */ + + *shandle = (mac_share_handle_t)shp; + + NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_share")); + return (0); +} + + +void +nxge_hio_share_free(mac_share_handle_t shandle) +{ + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_hio_vr_t *vr; + /* - * Get an RDC group for us to use. + * Clear internal handle state. */ - if ((rdctbl = nxge_hio_hostinfo_get_rdc_table(nxge)) < 0) { - nxge_hio_unshare(vr); - return (EBUSY); + vr = shp->vrp; + shp->vrp = (void *)NULL; + shp->index = 0; + shp->tmap = 0; + shp->rmap = 0; + shp->rxgroup = 0; + shp->active = B_FALSE; + + /* + * Free VR resource. + */ + nxge_hio_unshare(vr); +} + + +void +nxge_hio_share_query(mac_share_handle_t shandle, mac_ring_type_t type, + mac_ring_handle_t *rings, uint_t *n_rings) +{ + nxge_t *nxge; + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_ring_handle_t *rh; + uint32_t offset; + + nxge = shp->nxgep; + + switch (type) { + case MAC_RING_TYPE_RX: + rh = nxge->rx_ring_handles; + offset = nxge->pt_config.hw_config.start_rdc; + break; + + case MAC_RING_TYPE_TX: + rh = nxge->tx_ring_handles; + offset = nxge->pt_config.hw_config.tdc.start; + break; } - vr->rdc_tbl = (uint8_t)rdctbl; + + /* + * In version 1.0, we may only give a VR 2 RDCs/TDCs. Not only that, + * but the HV has statically assigned the channels like so: + * VR0: RDC0 & RDC1 + * VR1: RDC2 & RDC3, etc. + * The TDCs are assigned in exactly the same way. + */ + if (rings != NULL) { + rings[0] = rh[(shp->index * 2) - offset].ring_handle; + rings[1] = rh[(shp->index * 2 + 1) - offset].ring_handle; + } + if (n_rings != NULL) { + *n_rings = 2; + } +} + +int +nxge_hio_share_add_group(mac_share_handle_t shandle, + mac_group_driver_t ghandle) +{ + nxge_t *nxge; + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_ring_group_t *rg = (nxge_ring_group_t *)ghandle; + nxge_hio_vr_t *vr; /* The Virtualization Region */ + nxge_grp_t *group; + int i; + + if (rg->sindex != 0) { + /* the group is already bound to a share */ + return (EALREADY); + } + + nxge = rg->nxgep; + vr = shp->vrp; + + switch (rg->type) { + case MAC_RING_TYPE_RX: + /* + * Make sure that the group has the right rings associated + * for the share. In version 1.0, we may only give a VR + * 2 RDCs. Not only that, but the HV has statically + * assigned the channels like so: + * VR0: RDC0 & RDC1 + * VR1: RDC2 & RDC3, etc. 
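+		 * In other words (illustratively), VR n may only own
+		 * channels 2n and 2n + 1, which is exactly what the
+		 * loop below validates against shp->index.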
+ */ + group = nxge->rx_set.group[rg->gindex]; + + if (group->count > 2) { + /* a share can have at most 2 rings */ + return (EINVAL); + } + + for (i = 0; i < NXGE_MAX_RDCS; i++) { + if (group->map & (1 << i)) { + if ((i != shp->index * 2) && + (i != (shp->index * 2 + 1))) { + /* + * A group with invalid rings was + * attempted to bind to this share + */ + return (EINVAL); + } + } + } + + rg->sindex = vr->region; + vr->rdc_tbl = rg->rdctbl; + shp->rxgroup = vr->rdc_tbl; + break; + + case MAC_RING_TYPE_TX: + /* + * Make sure that the group has the right rings associated + * for the share. In version 1.0, we may only give a VR + * 2 TDCs. Not only that, but the HV has statically + * assigned the channels like so: + * VR0: TDC0 & TDC1 + * VR1: TDC2 & TDC3, etc. + */ + group = nxge->tx_set.group[rg->gindex]; + + if (group->count > 2) { + /* a share can have at most 2 rings */ + return (EINVAL); + } + + for (i = 0; i < NXGE_MAX_TDCS; i++) { + if (group->map & (1 << i)) { + if ((i != shp->index * 2) && + (i != (shp->index * 2 + 1))) { + /* + * A group with invalid rings was + * attempted to bind to this share + */ + return (EINVAL); + } + } + } + + vr->tdc_tbl = nxge->pt_config.hw_config.def_mac_txdma_grpid + + rg->gindex; + rg->sindex = vr->region; + break; + } + return (0); +} + +int +nxge_hio_share_rem_group(mac_share_handle_t shandle, + mac_group_driver_t ghandle) +{ + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_ring_group_t *group = (nxge_ring_group_t *)ghandle; + nxge_hio_vr_t *vr; /* The Virtualization Region */ + int rv = 0; + + vr = shp->vrp; + + switch (group->type) { + case MAC_RING_TYPE_RX: + group->sindex = 0; + vr->rdc_tbl = 0; + shp->rxgroup = 0; + break; + + case MAC_RING_TYPE_TX: + group->sindex = 0; + vr->tdc_tbl = 0; + break; + } + + return (rv); +} + +int +nxge_hio_share_bind(mac_share_handle_t shandle, uint64_t cookie, + uint64_t *rcookie) +{ + nxge_t *nxge; + nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; + nxge_hio_vr_t *vr; + uint64_t rmap, tmap, hv_rmap, hv_tmap; + int rv; + + nxge = shp->nxgep; + vr = (nxge_hio_vr_t *)shp->vrp; /* * Add resources to the share. + * For each DMA channel associated with the VR, bind its resources + * to the VR. */ tmap = 0; - rv = nxge_hio_addres(vr, MAC_RING_TYPE_TX, - NXGE_HIO_SHARE_MAX_CHANNELS); + rv = nxge_hio_addres(vr, MAC_RING_TYPE_TX, &tmap); if (rv != 0) { - nxge_hio_unshare(vr); return (rv); } rmap = 0; - rv = nxge_hio_addres(vr, MAC_RING_TYPE_RX, - NXGE_HIO_SHARE_MAX_CHANNELS); + rv = nxge_hio_addres(vr, MAC_RING_TYPE_RX, &rmap); if (rv != 0) { nxge_hio_remres(vr, MAC_RING_TYPE_TX, tmap); - nxge_hio_unshare(vr); return (rv); } - if ((rv = nxge_hio_share_assign(nxge, cookie, &tmap, &rmap, vr))) { - nxge_hio_remres(vr, MAC_RING_TYPE_RX, tmap); + /* + * Ask the Hypervisor to set up the VR and allocate slots for + * each rings associated with the VR. 
+ */ + hv_tmap = hv_rmap = 0; + if ((rv = nxge_hio_share_assign(nxge, cookie, + &hv_tmap, &hv_rmap, vr))) { nxge_hio_remres(vr, MAC_RING_TYPE_TX, tmap); - nxge_hio_unshare(vr); + nxge_hio_remres(vr, MAC_RING_TYPE_RX, rmap); return (rv); } - rxgroup = &nxge->rx_hio_groups[vr->rdc_tbl]; - rxgroup->gindex = vr->rdc_tbl; - rxgroup->sindex = vr->region; - - shp = &nxge->shares[vr->region]; - shp->index = vr->region; - shp->vrp = (void *)vr; - shp->tmap = tmap; - shp->rmap = rmap; - shp->rxgroup = vr->rdc_tbl; shp->active = B_TRUE; + shp->tmap = hv_tmap; + shp->rmap = hv_rmap; /* high 32 bits are cfg_hdl and low 32 bits are HV cookie */ *rcookie = (((uint64_t)nxge->niu_cfg_hdl) << 32) | vr->cookie; - *shandle = (mac_share_handle_t)shp; - - NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_share")); return (0); } void -nxge_hio_share_free(mac_share_handle_t shandle) +nxge_hio_share_unbind(mac_share_handle_t shandle) { nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; @@ -1335,52 +1735,15 @@ nxge_hio_share_free(mac_share_handle_t shandle) * First, unassign the VR (take it back), * so we can enable interrupts again. */ - (void) nxge_hio_share_unassign(shp->vrp); + nxge_hio_share_unassign(shp->vrp); /* * Free Ring Resources for TX and RX */ nxge_hio_remres(shp->vrp, MAC_RING_TYPE_TX, shp->tmap); nxge_hio_remres(shp->vrp, MAC_RING_TYPE_RX, shp->rmap); - - /* - * Free VR resource. - */ - nxge_hio_unshare(shp->vrp); - - /* - * Clear internal handle state. - */ - shp->index = 0; - shp->vrp = (void *)NULL; - shp->tmap = 0; - shp->rmap = 0; - shp->rxgroup = 0; - shp->active = B_FALSE; } -void -nxge_hio_share_query(mac_share_handle_t shandle, mac_ring_type_t type, - uint32_t *rmin, uint32_t *rmax, uint64_t *rmap, uint64_t *gnum) -{ - nxge_share_handle_t *shp = (nxge_share_handle_t *)shandle; - - switch (type) { - case MAC_RING_TYPE_RX: - *rmin = NXGE_HIO_SHARE_MIN_CHANNELS; - *rmax = NXGE_HIO_SHARE_MAX_CHANNELS; - *rmap = shp->rmap; - *gnum = shp->rxgroup; - break; - - case MAC_RING_TYPE_TX: - *rmin = NXGE_HIO_SHARE_MIN_CHANNELS; - *rmax = NXGE_HIO_SHARE_MAX_CHANNELS; - *rmap = shp->tmap; - *gnum = 0; - break; - } -} /* * nxge_hio_vr_share @@ -1474,7 +1837,11 @@ nxge_hio_unshare( * * nxge_hio_hostinfo_uninit(nxge, vr); */ - (void) nxge_fzc_rdc_tbl_unbind(nxge, vr->rdc_tbl); + + /* + * XXX: This is done by ms_sremove? + * (void) nxge_fzc_rdc_tbl_unbind(nxge, vr->rdc_tbl); + */ nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; @@ -1495,23 +1862,53 @@ int nxge_hio_addres( nxge_hio_vr_t *vr, mac_ring_type_t type, - int count) + uint64_t *map) { - nxge_t *nxge = (nxge_t *)vr->nxge; - int i; + nxge_t *nxge = (nxge_t *)vr->nxge; + nxge_grp_t *group; + int groupid; + int i; + int max_dcs; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_addres")); if (!nxge) return (EINVAL); - for (i = 0; i < count; i++) { - int rv; - if ((rv = nxge_hio_dc_share(nxge, vr, type)) < 0) { - if (i == 0) /* Couldn't get even one DC. */ - return (-rv); - else - break; + /* + * For each ring associated with the group, add the resources + * to the group and bind. + */ + max_dcs = (type == MAC_RING_TYPE_TX) ? 
NXGE_MAX_TDCS : NXGE_MAX_RDCS; + if (type == MAC_RING_TYPE_TX) { + /* set->group is an array of groups indexed by port group ID */ + groupid = vr->tdc_tbl - + nxge->pt_config.hw_config.def_mac_txdma_grpid; + group = nxge->tx_set.group[groupid]; + } else { + /* set->group is an array of groups indexed by port group ID */ + groupid = vr->rdc_tbl - + nxge->pt_config.hw_config.def_mac_rxdma_grpid; + group = nxge->rx_set.group[groupid]; + } + + if (group->map == 0) { + NXGE_DEBUG_MSG((nxge, HIO_CTL, "There are no rings associated " + "with this VR")); + return (EINVAL); + } + + for (i = 0; i < max_dcs; i++) { + if (group->map & (1 << i)) { + int rv; + + if ((rv = nxge_hio_dc_share(nxge, vr, type, i)) < 0) { + if (*map == 0) /* Couldn't get even one DC. */ + return (-rv); + else + break; + } + *map |= (1 << i); } } @@ -1538,6 +1935,10 @@ nxge_hio_remres( NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_remres(%lx)", res_map)); + /* + * For each ring bound to the group, remove the DMA resources + * from the group and unbind. + */ group = (type == MAC_RING_TYPE_TX ? &vr->tx_group : &vr->rx_group); while (group->dc) { nxge_hio_dc_t *dc = group->dc; @@ -1628,12 +2029,11 @@ nxge_hio_tdc_share( nxge->tdc_is_shared[channel] = B_TRUE; MUTEX_EXIT(&nhd->lock); - if (nxge_intr_remove(nxge, VP_BOUND_TX, channel) != NXGE_OK) { NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_hio_tdc_share: " "Failed to remove interrupt for TxDMA channel %d", channel)); - return (NXGE_ERROR); + return (-EINVAL); } /* Disable TxDMA A.9.6.10 */ @@ -1698,13 +2098,9 @@ nxge_hio_rdc_share( nxge_hio_vr_t *vr, int channel) { - nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; - nxge_hw_pt_cfg_t *hardware = &nxge->pt_config.hw_config; nxge_grp_set_t *set = &nxge->rx_set; nxge_rdc_grp_t *rdc_grp; - int current, last; - NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_rdc_share")); /* Disable interrupts. */ @@ -1739,21 +2135,6 @@ nxge_hio_rdc_share( nxge_grp_dc_remove(nxge, VP_BOUND_RX, channel); /* - * We have to reconfigure the RDC table(s) - * to which this channel belongs. - */ - current = hardware->def_mac_rxdma_grpid; - last = current + hardware->max_rdc_grpids; - for (; current < last; current++) { - if (nhd->rdc_tbl[current].nxge == (uintptr_t)nxge) { - rdc_grp = &nxge->pt_config.rdc_grps[current]; - rdc_grp->map = set->owned.map; - rdc_grp->max_rdcs--; - (void) nxge_init_fzc_rdc_tbl(nxge, current); - } - } - - /* * The guest domain will reconfigure the RDC later. * * But in the meantime, we must re-enable the Rx MAC so @@ -1791,12 +2172,6 @@ nxge_hio_rdc_share( } NXGE_DC_SET(rdc_grp->map, channel); - if (nxge_init_fzc_rdc_tbl(nxge, vr->rdc_tbl) != NXGE_OK) { - NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, - "nxge_hio_rdc_share: nxge_init_fzc_rdc_tbl failed")); - return (-EIO); - } - NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_rdc_share")); return (0); @@ -1811,8 +2186,7 @@ nxge_hio_rdc_share( * nxge * vr The VR that <channel> will belong to. * type Tx or Rx. - * res_map The resource map used by the caller, which we will - * update if successful. + * channel Channel to share * * Notes: * @@ -1823,59 +2197,17 @@ int nxge_hio_dc_share( nxge_t *nxge, nxge_hio_vr_t *vr, - mac_ring_type_t type) + mac_ring_type_t type, + int channel) { nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; - nxge_hw_pt_cfg_t *hardware; nxge_hio_dc_t *dc; - int channel, limit; - - nxge_grp_set_t *set; nxge_grp_t *group; - int slot; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_dc_share(%cdc %d", type == MAC_RING_TYPE_TX ?
't' : 'r', channel)); - /* - * In version 1.0, we may only give a VR 2 RDCs or TDCs. - * Not only that, but the HV has statically assigned the - * channels like so: - * VR0: RDC0 & RDC1 - * VR1: RDC2 & RDC3, etc. - * The TDCs are assigned in exactly the same way. - * - * So, for example - * hardware->start_rdc + vr->region * 2; - * VR1: hardware->start_rdc + 1 * 2; - * VR3: hardware->start_rdc + 3 * 2; - * If start_rdc is 0, we end up with 2 or 6. - * If start_rdc is 8, we end up with 10 or 14. - */ - - set = (type == MAC_RING_TYPE_TX ? &nxge->tx_set : &nxge->rx_set); - hardware = &nxge->pt_config.hw_config; - - // This code is still NIU-specific (assuming only 2 ports) - channel = hardware->start_rdc + (vr->region % 4) * 2; - limit = channel + 2; - - MUTEX_ENTER(&nhd->lock); - for (; channel < limit; channel++) { - if ((1 << channel) & set->owned.map) { - break; - } - } - - if (channel == limit) { - MUTEX_EXIT(&nhd->lock); - NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, - "nxge_hio_dc_share: there are no channels to share")); - return (-EIO); - } - - MUTEX_EXIT(&nhd->lock); /* -------------------------------------------------- */ slot = (type == MAC_RING_TYPE_TX) ? @@ -1884,9 +2216,9 @@ nxge_hio_dc_share( if (slot < 0) { if (type == MAC_RING_TYPE_RX) { - nxge_hio_rdc_unshare(nxge, channel); + nxge_hio_rdc_unshare(nxge, vr->rdc_tbl, channel); } else { - nxge_hio_tdc_unshare(nxge, channel); + nxge_hio_tdc_unshare(nxge, vr->tdc_tbl, channel); } return (slot); } @@ -1912,7 +2244,6 @@ nxge_hio_dc_share( group = (type == MAC_RING_TYPE_TX ? &vr->tx_group : &vr->rx_group); dc->group = group; - /* Initialize <group>, if necessary */ if (group->count == 0) { group->nxge = nxge; @@ -1952,16 +2283,21 @@ nxge_hio_dc_share( void nxge_hio_tdc_unshare( nxge_t *nxge, + int dev_grpid, int channel) { nxge_grp_set_t *set = &nxge->tx_set; - nxge_grp_t *group = set->group[0]; + nxge_grp_t *group; + int grpid; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_tdc_unshare")); NXGE_DC_RESET(set->shared.map, channel); set->shared.count--; + grpid = dev_grpid - nxge->pt_config.hw_config.def_mac_txdma_grpid; + group = set->group[grpid]; + if ((nxge_grp_dc_add(nxge, group, VP_BOUND_TX, channel))) { NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_hio_tdc_unshare: " "Failed to initialize TxDMA channel %d", channel)); @@ -1994,14 +2330,12 @@ nxge_hio_tdc_unshare( void nxge_hio_rdc_unshare( nxge_t *nxge, + int dev_grpid, int channel) { - nxge_hio_data_t *nhd = (nxge_hio_data_t *)nxge->nxge_hw_p->hio; - nxge_hw_pt_cfg_t *hardware = &nxge->pt_config.hw_config; - - nxge_grp_set_t *set = &nxge->rx_set; - nxge_grp_t *group = set->group[0]; - int current, last; + nxge_grp_set_t *set = &nxge->rx_set; + nxge_grp_t *group; + int grpid; NXGE_DEBUG_MSG((nxge, HIO_CTL, "==> nxge_hio_rdc_unshare")); @@ -2024,6 +2358,9 @@ nxge_hio_rdc_unshare( NXGE_DC_RESET(set->shared.map, channel); set->shared.count--; + grpid = dev_grpid - nxge->pt_config.hw_config.def_mac_rxdma_grpid; + group = set->group[grpid]; + /* * Assert RST: RXDMA_CFIG1[30] = 1 * @@ -2035,7 +2372,7 @@ nxge_hio_rdc_unshare( /* Be sure to re-enable the RX MAC. */ if (nxge_rx_mac_enable(nxge) != NXGE_OK) { NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, - "nxge_hio_rdc_unshare: Rx MAC still disabled")); + "nxge_hio_rdc_share: Rx MAC still disabled")); } NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, "nxge_hio_rdc_unshare: " "Failed to initialize RxDMA channel %d", channel)); @@ -2043,27 +2380,11 @@ nxge_hio_rdc_unshare( } /* - * We have to reconfigure the RDC table(s) - * to which this channel once again belongs. 
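Both nxge_hio_tdc_unshare() and nxge_hio_rdc_unshare() above recover a per-port group index by subtracting the port's default group ID from the device-wide ID before indexing set->group[]. A sketch of that conversion, with a bounds check added for illustration (the names and the guard are assumptions; only the subtraction mirrors the driver):

    /* Map a device-relative group ID to an index into set->group[]. */
    static int
    port_group_index(int dev_grpid, int def_grpid, int max_grpids)
    {
            int grpid = dev_grpid - def_grpid;

            return ((grpid >= 0 && grpid < max_grpids) ? grpid : -1);
    }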
- */ - current = hardware->def_mac_rxdma_grpid; - last = current + hardware->max_rdc_grpids; - for (; current < last; current++) { - if (nhd->rdc_tbl[current].nxge == (uintptr_t)nxge) { - nxge_rdc_grp_t *group; - group = &nxge->pt_config.rdc_grps[current]; - group->map = set->owned.map; - group->max_rdcs++; - (void) nxge_init_fzc_rdc_tbl(nxge, current); - } - } - - /* * Enable RxMAC = A.9.2.10 */ if (nxge_rx_mac_enable(nxge) != NXGE_OK) { NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, - "nxge_hio_rdc_unshare: Rx MAC still disabled")); + "nxge_hio_rdc_share: Rx MAC still disabled")); return; } @@ -2120,9 +2441,9 @@ nxge_hio_dc_unshare( dc->cookie = 0; if (type == MAC_RING_TYPE_RX) { - nxge_hio_rdc_unshare(nxge, channel); + nxge_hio_rdc_unshare(nxge, vr->rdc_tbl, channel); } else { - nxge_hio_tdc_unshare(nxge, channel); + nxge_hio_tdc_unshare(nxge, vr->tdc_tbl, channel); } NXGE_DEBUG_MSG((nxge, HIO_CTL, "<== nxge_hio_dc_unshare")); diff --git a/usr/src/uts/common/io/nxge/nxge_hio_guest.c b/usr/src/uts/common/io/nxge/nxge_hio_guest.c index 5fbcbfdfe1..5517b9ceee 100644 --- a/usr/src/uts/common/io/nxge/nxge_hio_guest.c +++ b/usr/src/uts/common/io/nxge/nxge_hio_guest.c @@ -208,7 +208,6 @@ static void nxge_check_guest_state(nxge_hio_vr_t *); * Guest domain */ /* ARGSUSED */ - int nxge_hio_vr_add(nxge_t *nxge) { @@ -249,7 +248,7 @@ nxge_hio_vr_add(nxge_t *nxge) return (NXGE_ERROR); } - cookie = (uint32_t)reg_val[0]; + cookie = (uint32_t)(reg_val[0]); ddi_prop_free(reg_val); fp = &nhd->hio.vr; @@ -521,11 +520,17 @@ res_map_parse( */ if (type == NXGE_TRANSMIT_GROUP) { nxge_dma_pt_cfg_t *port = &nxge->pt_config; + nxge_tdc_grp_t *tdc_grp = &nxge->pt_config.tdc_grps[0]; hardware->tdc.start = first; hardware->tdc.count = count; hardware->tdc.owned = count; + tdc_grp->start_tdc = first; + tdc_grp->max_tdcs = (uint8_t)count; + tdc_grp->grp_index = group->index; + tdc_grp->map = slots; + group->map = slots; /* @@ -944,7 +949,6 @@ nxge_check_guest_state( NXGE_DEBUG_MSG((nxge, SYSERR_CTL, "==> nxge_check_guest_state")); MUTEX_ENTER(nxge->genlock); - nxge->nxge_timerid = 0; if (nxge->nxge_mac_state == NXGE_MAC_STARTED) { diff --git a/usr/src/uts/common/io/nxge/nxge_hv.c b/usr/src/uts/common/io/nxge/nxge_hv.c index a454b3ee72..1a42fcd9a7 100644 --- a/usr/src/uts/common/io/nxge/nxge_hv.c +++ b/usr/src/uts/common/io/nxge/nxge_hv.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * nxge_hv.c * @@ -37,6 +35,8 @@ #include <sys/nxge/nxge_impl.h> #include <sys/nxge/nxge_hio.h> +#if defined(sun4v) + void nxge_hio_hv_init(nxge_t *nxge) { @@ -79,3 +79,5 @@ nxge_hio_hv_init(nxge_t *nxge) rx->getinfo = &hv_niu_vrrx_getinfo; } + +#endif /* defined(sun4v) */ diff --git a/usr/src/uts/common/io/nxge/nxge_hw.c b/usr/src/uts/common/io/nxge/nxge_hw.c index 4a6cbbea6d..5513ce4f4e 100644 --- a/usr/src/uts/common/io/nxge/nxge_hw.c +++ b/usr/src/uts/common/io/nxge/nxge_hw.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/nxge/nxge_impl.h> /* @@ -221,7 +219,6 @@ nxge_intr(void *arg1, void *arg2) NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr(%d): #ldvs %d " " #intrs %d", i, nldvs, nintrs)); /* Get this group's flag bits. 
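res_map_parse() above records the guest's transmit channels twice: as a <first, count> range in hardware->tdc and as a bitmap in tdc_grp->map (the slots value). Assuming slots is the bitmap form of that same range, the two encodings relate as in this hypothetical helper (valid for 0 < count < 64):

    #include <stdint.h>

    /* Bitmap with <count> bits set starting at bit <first>;
     * e.g. first = 4, count = 2 yields 0x30. */
    static uint64_t
    range_to_map(int first, int count)
    {
            return (((1ULL << count) - 1) << first);
    }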
*/ - t_ldgp->interrupted = B_FALSE; rs = npi_ldsv_ldfs_get(handle, t_ldgp->ldg, &vector0, &vector1, &vector2); if (rs) { @@ -235,7 +232,6 @@ nxge_intr(void *arg1, void *arg2) NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr: " "vector0 0x%llx vector1 0x%llx vector2 0x%llx", vector0, vector1, vector2)); - t_ldgp->interrupted = B_TRUE; nldvs = t_ldgp->nldvs; for (j = 0; j < nldvs; j++, t_ldvp++) { /* @@ -261,12 +257,10 @@ nxge_intr(void *arg1, void *arg2) t_ldgp = ldgp; for (i = 0; i < nintrs; i++, t_ldgp++) { /* rearm group interrupts */ - if (t_ldgp->interrupted) { - NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr: arm " - "group %d", t_ldgp->ldg)); - (void) npi_intr_ldg_mgmt_set(handle, t_ldgp->ldg, - t_ldgp->arm, t_ldgp->ldg_timer); - } + NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_intr: arm " + "group %d", t_ldgp->ldg)); + (void) npi_intr_ldg_mgmt_set(handle, t_ldgp->ldg, + t_ldgp->arm, t_ldgp->ldg_timer); } NXGE_DEBUG_MSG((nxgep, INT_CTL, "<== nxge_intr: serviced 0x%x", diff --git a/usr/src/uts/common/io/nxge/nxge_mac.c b/usr/src/uts/common/io/nxge/nxge_mac.c index d009bdbd98..8ca60cf7a7 100644 --- a/usr/src/uts/common/io/nxge/nxge_mac.c +++ b/usr/src/uts/common/io/nxge/nxge_mac.c @@ -46,13 +46,6 @@ extern uint32_t nxge_lb_dbg; extern boolean_t nxge_jumbo_enable; extern uint32_t nxge_jumbo_mtu; - /* The following functions may be found in nxge_main.c */ -extern void nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, - boolean_t factory); -extern int nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr); -extern int nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot); -extern int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr, - mac_addr_slot_t slot, uint8_t rdctbl); typedef enum { CHECK_LINK_RESCHEDULE, @@ -3040,160 +3033,6 @@ fail: return (NXGE_ERROR | rs); } -int -nxge_hio_hostinfo_get_rdc_table(p_nxge_t nxgep) -{ - int rdc_tbl; - - /* - * Get an RDC table (version 0). - */ - if ((rdc_tbl = nxge_fzc_rdc_tbl_bind(nxgep, -1, B_FALSE)) < 0) { - NXGE_ERROR_MSG((nxgep, OBP_CTL, - "nxge_hio_hostinfo_get_rdc_table: " - "there are no free RDC tables!")); - return (EBUSY); - } - - return (rdc_tbl); -} - -/* - * nxge_hio_hostinfo_init - * - * Initialize an alternate MAC address, and bind a macrdctbln to it. - * - * Arguments: - * nxge - * vr The Virtualization Region - * macaddr The alternate MAC address - * - * Notes: - * 1. Find & bind an RDC table to <nxge>. - * 2. Program an alternate MAC address (<macaddr>). - * 3. Bind the RDC table to <macaddr>. - * - * Context: - * Service domain - * - * Side Effects: - * nxge->class_config.mac_host_info[slot].rdctbl - * vr->slot & vr->altmac - * - */ -int -nxge_hio_hostinfo_init(nxge_t *nxge, nxge_hio_vr_t *vr, ether_addr_t *macaddr) -{ - int slot, error; - uint8_t rdc_tbl; - nxge_mmac_t *mmac_info; - nxge_rdc_grp_t *group; - uint8_t *addr = (uint8_t *)macaddr; - - mutex_enter(nxge->genlock); - - rdc_tbl = (uint8_t)vr->rdc_tbl; - - /* Initialize the NXGE RDC table data structure. */ - group = &nxge->pt_config.rdc_grps[rdc_tbl]; - group->port = NXGE_GET_PORT_NUM(nxge->function_num); - group->config_method = RDC_TABLE_ENTRY_METHOD_REP; - group->flag = 1; /* This group has been configured. */ - - mmac_info = &nxge->nxge_mmac_info; - - /* - * Are there free slots. - */ - if (mmac_info->naddrfree == 0) { - mutex_exit(nxge->genlock); - return (ENOSPC); - } - - /* - * Find a slot for the VR to use for Hybrid I/O. 
- */ - if (mmac_info->num_factory_mmac < mmac_info->num_mmac) { - for (slot = mmac_info->num_factory_mmac + 1; - slot <= mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - if (slot > mmac_info->num_mmac) { - for (slot = 1; slot <= mmac_info->num_factory_mmac; - slot++) { - if (!(mmac_info->mac_pool[slot].flags - & MMAC_SLOT_USED)) - break; - } - } - } else { - for (slot = 1; slot <= mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - } - ASSERT(slot <= mmac_info->num_mmac); - vr->slot = slot; - - error = nxge_altmac_set(nxge, addr, slot, rdc_tbl); - if (error != 0) { - mutex_exit(nxge->genlock); - return (EIO); - } - - bcopy(macaddr, vr->altmac, sizeof (vr->altmac)); - - /* - * Update mmac - */ - bcopy(addr, mmac_info->mac_pool[vr->slot].addr, ETHERADDRL); - mmac_info->mac_pool[vr->slot].flags |= MMAC_SLOT_USED; - mmac_info->mac_pool[vr->slot].flags &= ~MMAC_VENDOR_ADDR; - mmac_info->naddrfree--; - nxge_mmac_kstat_update(nxge, vr->slot, B_FALSE); - - mutex_exit(nxge->genlock); - return (0); -} - -/* - * nxge_hio_hostinfo_uninit - * - * Uninitialize an alternate MAC address. - * - * Arguments: - * nxge - * vr The Virtualization Region - * - * Notes: - * Remove the VR's alternate MAC address. - * - * Context: - * Service domain - * - * Side Effects: - * nxge->class_config.mac_host_info[slot].rdctbl - * - */ -void -nxge_hio_hostinfo_uninit(nxge_t *nxge, nxge_hio_vr_t *vr) -{ - nxge_class_pt_cfg_t *class; - uint8_t addrn; - - addrn = vr->slot - 1; - (void) npi_mac_altaddr_disable(nxge->npi_handle, - nxge->mac.portnum, addrn); - - /* Set this variable to its default. */ - class = (p_nxge_class_pt_cfg_t)&nxge->class_config; - class->mac_host_info[addrn].rdctbl = - nxge->pt_config.hw_config.def_mac_rxdma_grpid; - - (void) nxge_m_mmac_remove(nxge, vr->slot); - vr->slot = -1; -} /* Initialize the RxMAC sub-block */ diff --git a/usr/src/uts/common/io/nxge/nxge_main.c b/usr/src/uts/common/io/nxge/nxge_main.c index ca2ca6b30b..9b20c438f4 100644 --- a/usr/src/uts/common/io/nxge/nxge_main.c +++ b/usr/src/uts/common/io/nxge/nxge_main.c @@ -117,14 +117,6 @@ nxge_tx_mode_t nxge_tx_scheme = NXGE_USE_SERIAL; #define NXGE_LSO_MAXLEN 65535 uint32_t nxge_lso_max = NXGE_LSO_MAXLEN; -/* - * Debugging flags: - * nxge_no_tx_lb : transmit load balancing - * nxge_tx_lb_policy: 0 - TCP port (default) - * 3 - DEST MAC - */ -uint32_t nxge_no_tx_lb = 0; -uint32_t nxge_tx_lb_policy = NXGE_TX_LB_TCPUDP; /* * Add tunable to reduce the amount of time spent in the @@ -208,8 +200,7 @@ static void nxge_remove_hard_properties(p_nxge_t); /* * These two functions are required by nxge_hio.c */ -extern int nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr); -extern int nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot); +extern int nxge_m_mmac_remove(void *arg, int slot); extern void nxge_grp_cleanup(p_nxge_t nxge); static nxge_status_t nxge_setup_system_dma_pages(p_nxge_t); @@ -224,9 +215,7 @@ static void nxge_test_map_regs(p_nxge_t nxgep); #endif static nxge_status_t nxge_add_intrs(p_nxge_t nxgep); -static nxge_status_t nxge_add_soft_intrs(p_nxge_t nxgep); static void nxge_remove_intrs(p_nxge_t nxgep); -static void nxge_remove_soft_intrs(p_nxge_t nxgep); static nxge_status_t nxge_add_intrs_adv(p_nxge_t nxgep); static nxge_status_t nxge_add_intrs_adv_type(p_nxge_t, uint32_t); @@ -284,20 +273,19 @@ extern int nxge_param_set_mac(p_nxge_t, queue_t *, mblk_t *, */ static int nxge_m_start(void *); static void nxge_m_stop(void *); 
-static int nxge_m_unicst(void *, const uint8_t *); static int nxge_m_multicst(void *, boolean_t, const uint8_t *); static int nxge_m_promisc(void *, boolean_t); static void nxge_m_ioctl(void *, queue_t *, mblk_t *); -static void nxge_m_resources(void *); -mblk_t *nxge_m_tx(void *arg, mblk_t *); static nxge_status_t nxge_mac_register(p_nxge_t); -int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr, - mac_addr_slot_t slot, uint8_t rdctbl); -void nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, +static int nxge_altmac_set(p_nxge_t nxgep, uint8_t *mac_addr, + int slot, int rdctbl, boolean_t usetbl); +void nxge_mmac_kstat_update(p_nxge_t nxgep, int slot, boolean_t factory); -static int nxge_m_mmac_reserve(void *arg, mac_multi_addr_t *maddr); -static int nxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr); -static int nxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr); +#if defined(sun4v) +extern mblk_t *nxge_m_tx(void *arg, mblk_t *mp); +#endif + +static void nxge_m_getfactaddr(void *, uint_t, uint8_t *); static boolean_t nxge_m_getcapab(void *, mac_capab_t, void *); static int nxge_m_setprop(void *, const char *, mac_prop_id_t, uint_t, const void *); @@ -308,6 +296,12 @@ static int nxge_set_priv_prop(nxge_t *, const char *, uint_t, static int nxge_get_priv_prop(nxge_t *, const char *, uint_t, uint_t, void *, uint_t *); static int nxge_get_def_val(nxge_t *, mac_prop_id_t, uint_t, void *); +static void nxge_fill_ring(void *, mac_ring_type_t, const int, const int, + mac_ring_info_t *, mac_ring_handle_t); +static void nxge_group_add_ring(mac_group_driver_t, mac_ring_driver_t, + mac_ring_type_t); +static void nxge_group_rem_ring(mac_group_driver_t, mac_ring_driver_t, + mac_ring_type_t); static void nxge_niu_peu_reset(p_nxge_t nxgep); static void nxge_set_pci_replay_timeout(nxge_t *); @@ -336,15 +330,11 @@ mac_priv_prop_t nxge_priv_props[] = { #define NXGE_MAX_PRIV_PROPS \ (sizeof (nxge_priv_props)/sizeof (mac_priv_prop_t)) -#define NXGE_M_CALLBACK_FLAGS\ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) - - #define NXGE_NEPTUNE_MAGIC 0x4E584745UL #define MAX_DUMP_SZ 256 #define NXGE_M_CALLBACK_FLAGS \ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) + (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP) mac_callbacks_t nxge_m_callbacks = { NXGE_M_CALLBACK_FLAGS, @@ -353,9 +343,8 @@ mac_callbacks_t nxge_m_callbacks = { nxge_m_stop, nxge_m_promisc, nxge_m_multicst, - nxge_m_unicst, - nxge_m_tx, - nxge_m_resources, + NULL, + NULL, nxge_m_ioctl, nxge_m_getcapab, NULL, @@ -631,6 +620,11 @@ nxge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) if (nxgep->niu_type != N2_NIU) { nxge_set_pci_replay_timeout(nxgep); } +#if defined(sun4v) + if (isLDOMguest(nxgep)) { + nxge_m_callbacks.mc_tx = nxge_m_tx; + } +#endif #if defined(sun4v) /* This is required by nxge_hio_init(), which follows. */ @@ -847,13 +841,6 @@ nxge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) goto nxge_attach_fail; } - status = nxge_add_soft_intrs(nxgep); - if (status != DDI_SUCCESS) { - NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, - "add_soft_intr failed")); - goto nxge_attach_fail; - } - /* If a guest, register with vio_net instead. */ if ((status = nxge_mac_register(nxgep)) != NXGE_OK) { NXGE_DEBUG_MSG((nxgep, DDI_CTL, @@ -1032,9 +1019,6 @@ nxge_unattach(p_nxge_t nxgep) */ nxge_remove_intrs(nxgep); - /* remove soft interrups */ - nxge_remove_soft_intrs(nxgep); - /* * Stop the device and free resources. 
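One consequence of the callback changes above: mc_unicst and mc_tx are now NULL by default, and nxge_attach() patches mc_tx back in only for LDOM guests, so service domains transmit through the per-ring entry points instead. Reduced to a standalone sketch (ops_t and the names here are stand-ins, not the real mac_callbacks_t):

    typedef struct ops {
            void *(*tx)(void *arg, void *pkt);      /* NULL: use per-ring tx */
    } ops_t;

    static ops_t driver_ops = { NULL };

    static void
    attach_fixup(int is_guest, void *(*guest_tx)(void *, void *))
    {
            if (is_guest)
                    driver_ops.tx = guest_tx;       /* guests keep a direct entry */
    }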
*/ @@ -3742,6 +3726,20 @@ nxge_m_start_exit: return (0); } + +static boolean_t +nxge_check_groups_stopped(p_nxge_t nxgep) +{ + int i; + + for (i = 0; i < NXGE_MAX_RDC_GROUPS; i++) { + if (nxgep->rx_hio_groups[i].started) + return (B_FALSE); + } + + return (B_TRUE); +} + /* * nxge_m_stop(): stop transmitting and receiving. */ @@ -3749,9 +3747,21 @@ static void nxge_m_stop(void *arg) { p_nxge_t nxgep = (p_nxge_t)arg; + boolean_t groups_stopped; NXGE_DEBUG_MSG((nxgep, NXGE_CTL, "==> nxge_m_stop")); + groups_stopped = nxge_check_groups_stopped(nxgep); +#ifdef later + ASSERT(groups_stopped == B_FALSE); +#endif + + if (!groups_stopped) { + cmn_err(CE_WARN, "nxge(%d): groups are not stopped!\n", + nxgep->instance); + return; + } + MUTEX_ENTER(nxgep->genlock); nxgep->nxge_mac_state = NXGE_MAC_STOPPING; @@ -3770,26 +3780,6 @@ nxge_m_stop(void *arg) } static int -nxge_m_unicst(void *arg, const uint8_t *macaddr) -{ - p_nxge_t nxgep = (p_nxge_t)arg; - struct ether_addr addrp; - - NXGE_DEBUG_MSG((nxgep, MAC_CTL, "==> nxge_m_unicst")); - - bcopy(macaddr, (uint8_t *)&addrp, ETHERADDRL); - if (nxge_set_mac_addr(nxgep, &addrp)) { - NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, - "<== nxge_m_unicst: set unitcast failed")); - return (EINVAL); - } - - NXGE_DEBUG_MSG((nxgep, MAC_CTL, "<== nxge_m_unicst")); - - return (0); -} - -static int nxge_m_multicst(void *arg, boolean_t add, const uint8_t *mca) { p_nxge_t nxgep = (p_nxge_t)arg; @@ -3942,77 +3932,8 @@ nxge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) extern void nxge_rx_hw_blank(void *arg, time_t ticks, uint_t count); -static void -nxge_m_resources(void *arg) -{ - p_nxge_t nxgep = arg; - mac_rx_fifo_t mrf; - - nxge_grp_set_t *set = &nxgep->rx_set; - uint8_t rdc; - - rx_rcr_ring_t *ring; - - NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_m_resources")); - - MUTEX_ENTER(nxgep->genlock); - - if (set->owned.map == 0) { - NXGE_ERROR_MSG((NULL, NXGE_ERR_CTL, - "nxge_m_resources: no receive resources")); - goto nxge_m_resources_exit; - } - - /* - * CR 6492541 Check to see if the drv_state has been initialized, - * if not * call nxge_init(). - */ - if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { - if (nxge_init(nxgep) != NXGE_OK) - goto nxge_m_resources_exit; - } - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = nxge_rx_hw_blank; - mrf.mrf_arg = (void *)nxgep; - - mrf.mrf_normal_blank_time = 128; - mrf.mrf_normal_pkt_count = 8; - - /* - * Export our receive resources to the MAC layer. - */ - for (rdc = 0; rdc < NXGE_MAX_RDCS; rdc++) { - if ((1 << rdc) & set->owned.map) { - ring = nxgep->rx_rcr_rings->rcr_rings[rdc]; - if (ring == 0) { - /* - * This is a big deal only if we are - * *not* in an LDOMs environment. 
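nxge_check_groups_stopped() above is a plain scan of the per-group started flags; nxge_m_stop() refuses to proceed while any RX group is still running. The shape of the guard, standalone (group_t and the field name are illustrative):

    #include <stdbool.h>

    typedef struct group { bool started; } group_t;

    static bool
    groups_stopped(const group_t *groups, int ngroups)
    {
            int i;

            for (i = 0; i < ngroups; i++) {
                    if (groups[i].started)
                            return (false);
            }
            return (true);
    }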
- */ - if (nxgep->environs == SOLARIS_DOMAIN) { - cmn_err(CE_NOTE, - "==> nxge_m_resources: " - "ring %d == 0", rdc); - } - continue; - } - ring->rcr_mac_handle = mac_resource_add - (nxgep->mach, (mac_resource_t *)&mrf); - - NXGE_DEBUG_MSG((nxgep, NXGE_CTL, - "==> nxge_m_resources: RDC %d RCR %p MAC handle %p", - rdc, ring, ring->rcr_mac_handle)); - } - } - -nxge_m_resources_exit: - MUTEX_EXIT(nxgep->genlock); - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_m_resources")); -} - void -nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, boolean_t factory) +nxge_mmac_kstat_update(p_nxge_t nxgep, int slot, boolean_t factory) { p_nxge_mmac_stats_t mmac_stats; int i; @@ -4040,9 +3961,9 @@ nxge_mmac_kstat_update(p_nxge_t nxgep, mac_addr_slot_t slot, boolean_t factory) /* * nxge_altmac_set() -- Set an alternate MAC address */ -int -nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, - uint8_t rdctbl) +static int +nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, int slot, + int rdctbl, boolean_t usetbl) { uint8_t addrn; uint8_t portn; @@ -4050,6 +3971,7 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, hostinfo_t mac_rdc; p_nxge_class_pt_cfg_t clscfgp; + altmac.w2 = ((uint16_t)maddr[0] << 8) | ((uint16_t)maddr[1] & 0x0ff); altmac.w1 = ((uint16_t)maddr[2] << 8) | ((uint16_t)maddr[3] & 0x0ff); altmac.w0 = ((uint16_t)maddr[4] << 8) | ((uint16_t)maddr[5] & 0x0ff); @@ -4057,8 +3979,8 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, portn = nxgep->mac.portnum; addrn = (uint8_t)slot - 1; - if (npi_mac_altaddr_entry(nxgep->npi_handle, OP_SET, portn, - addrn, &altmac) != NPI_SUCCESS) + if (npi_mac_altaddr_entry(nxgep->npi_handle, OP_SET, + nxgep->function_num, addrn, &altmac) != NPI_SUCCESS) return (EIO); /* @@ -4067,8 +3989,11 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, */ clscfgp = (p_nxge_class_pt_cfg_t)&nxgep->class_config; mac_rdc.value = 0; - clscfgp->mac_host_info[addrn].rdctbl = rdctbl; - mac_rdc.bits.w0.rdc_tbl_num = rdctbl; + if (usetbl) + mac_rdc.bits.w0.rdc_tbl_num = rdctbl; + else + mac_rdc.bits.w0.rdc_tbl_num = + clscfgp->mac_host_info[addrn].rdctbl; mac_rdc.bits.w0.mac_pref = clscfgp->mac_host_info[addrn].mpr_npr; if (npi_mac_hostinfo_entry(nxgep->npi_handle, OP_SET, @@ -4088,22 +4013,25 @@ nxge_altmac_set(p_nxge_t nxgep, uint8_t *maddr, mac_addr_slot_t slot, else addrn = (uint8_t)slot; - if (npi_mac_altaddr_enable(nxgep->npi_handle, portn, addrn) - != NPI_SUCCESS) + if (npi_mac_altaddr_enable(nxgep->npi_handle, + nxgep->function_num, addrn) != NPI_SUCCESS) { return (EIO); + } + return (0); } /* - * nxeg_m_mmac_add() - find an unused address slot, set the address + * nxeg_m_mmac_add_g() - find an unused address slot, set the address * value to the one specified, enable the port to start filtering on * the new MAC address. Returns 0 on success. */ int -nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr) +nxge_m_mmac_add_g(void *arg, const uint8_t *maddr, int rdctbl, + boolean_t usetbl) { p_nxge_t nxgep = arg; - mac_addr_slot_t slot; + int slot; nxge_mmac_t *mmac_info; int err; nxge_status_t status; @@ -4127,16 +4055,10 @@ nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr) mutex_exit(nxgep->genlock); return (ENOSPC); } - if (!mac_unicst_verify(nxgep->mach, maddr->mma_addr, - maddr->mma_addrlen)) { - mutex_exit(nxgep->genlock); - return (EINVAL); - } + /* * Search for the first available slot. Because naddrfree * is not zero, we are guaranteed to find one. - * Slot 0 is for unique (primary) MAC. 
The first alternate - * MAC slot is slot 1. * Each of the first two ports of Neptune has 16 alternate * MAC slots but only the first 7 (of 15) slots have assigned factory * MAC addresses. We first search among the slots without bundled @@ -4146,131 +4068,26 @@ nxge_m_mmac_add(void *arg, mac_multi_addr_t *maddr) * But the slot could be used by factory MAC again after calling * nxge_m_mmac_remove and nxge_m_mmac_reserve. */ - if (mmac_info->num_factory_mmac < mmac_info->num_mmac) { - for (slot = mmac_info->num_factory_mmac + 1; - slot <= mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - if (slot > mmac_info->num_mmac) { - for (slot = 1; slot <= mmac_info->num_factory_mmac; - slot++) { - if (!(mmac_info->mac_pool[slot].flags - & MMAC_SLOT_USED)) - break; - } - } - } else { - for (slot = 1; slot <= mmac_info->num_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } + for (slot = 0; slot <= mmac_info->num_mmac; slot++) { + if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) + break; } + ASSERT(slot <= mmac_info->num_mmac); - /* - * def_mac_rxdma_grpid is the default rdc table for the port. - */ - if ((err = nxge_altmac_set(nxgep, maddr->mma_addr, slot, - nxgep->pt_config.hw_config.def_mac_rxdma_grpid)) != 0) { + if ((err = nxge_altmac_set(nxgep, (uint8_t *)maddr, slot, rdctbl, + usetbl)) != 0) { mutex_exit(nxgep->genlock); return (err); } - bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, ETHERADDRL); + bcopy(maddr, mmac_info->mac_pool[slot].addr, ETHERADDRL); mmac_info->mac_pool[slot].flags |= MMAC_SLOT_USED; mmac_info->mac_pool[slot].flags &= ~MMAC_VENDOR_ADDR; mmac_info->naddrfree--; nxge_mmac_kstat_update(nxgep, slot, B_FALSE); - maddr->mma_slot = slot; - - mutex_exit(nxgep->genlock); - return (0); -} - -/* - * This function reserves an unused slot and programs the slot and the HW - * with a factory mac address. - */ -static int -nxge_m_mmac_reserve(void *arg, mac_multi_addr_t *maddr) -{ - p_nxge_t nxgep = arg; - mac_addr_slot_t slot; - nxge_mmac_t *mmac_info; - int err; - nxge_status_t status; - - mutex_enter(nxgep->genlock); - - /* - * Make sure that nxge is initialized, if _start() has - * not been called. 
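The old two-pass search (user slots first, then factory-capable slots) collapses above into a single first-free scan over the pool. Its core, standalone (the flag layout is a stand-in):

    #include <stdint.h>

    #define SLOT_USED       0x1

    /* First slot whose flags lack SLOT_USED, or -1 if none. */
    static int
    first_free_slot(const uint8_t *flags, int nslots)
    {
            int slot;

            for (slot = 0; slot < nslots; slot++) {
                    if (!(flags[slot] & SLOT_USED))
                            return (slot);
            }
            return (-1);
    }

The driver never sees the -1 case because naddrfree was checked beforehand, which is why an ASSERT suffices there.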
- */ - if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { - status = nxge_init(nxgep); - if (status != NXGE_OK) { - mutex_exit(nxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &nxgep->nxge_mmac_info; - if (mmac_info->naddrfree == 0) { - mutex_exit(nxgep->genlock); - return (ENOSPC); - } - - slot = maddr->mma_slot; - if (slot == -1) { /* -1: Take the first available slot */ - for (slot = 1; slot <= mmac_info->num_factory_mmac; slot++) { - if (!(mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED)) - break; - } - if (slot > mmac_info->num_factory_mmac) { - mutex_exit(nxgep->genlock); - return (ENOSPC); - } - } - if (slot < 1 || slot > mmac_info->num_factory_mmac) { - /* - * Do not support factory MAC at a slot greater than - * num_factory_mmac even when there are available factory - * MAC addresses because the alternate MACs are bundled with - * slot[1] through slot[num_factory_mmac] - */ - mutex_exit(nxgep->genlock); - return (EINVAL); - } - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - mutex_exit(nxgep->genlock); - return (EBUSY); - } - /* Verify the address to be reserved */ - if (!mac_unicst_verify(nxgep->mach, - mmac_info->factory_mac_pool[slot], ETHERADDRL)) { - mutex_exit(nxgep->genlock); - return (EINVAL); - } - if (err = nxge_altmac_set(nxgep, - mmac_info->factory_mac_pool[slot], slot, - nxgep->pt_config.hw_config.def_mac_rxdma_grpid)) { - mutex_exit(nxgep->genlock); - return (err); - } - bcopy(mmac_info->factory_mac_pool[slot], maddr->mma_addr, ETHERADDRL); - mmac_info->mac_pool[slot].flags |= MMAC_SLOT_USED | MMAC_VENDOR_ADDR; - mmac_info->naddrfree--; - - nxge_mmac_kstat_update(nxgep, slot, B_TRUE); mutex_exit(nxgep->genlock); - - /* Pass info back to the caller */ - maddr->mma_slot = slot; - maddr->mma_addrlen = ETHERADDRL; - maddr->mma_flags = MMAC_SLOT_USED | MMAC_VENDOR_ADDR; - return (0); } @@ -4279,7 +4096,7 @@ nxge_m_mmac_reserve(void *arg, mac_multi_addr_t *maddr) * the mac address anymore. */ int -nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot) +nxge_m_mmac_remove(void *arg, int slot) { p_nxge_t nxgep = arg; nxge_mmac_t *mmac_info; @@ -4350,141 +4167,37 @@ nxge_m_mmac_remove(void *arg, mac_addr_slot_t slot) } /* - * Modify a mac address added by nxge_m_mmac_add or nxge_m_mmac_reserve(). - */ -static int -nxge_m_mmac_modify(void *arg, mac_multi_addr_t *maddr) -{ - p_nxge_t nxgep = arg; - mac_addr_slot_t slot; - nxge_mmac_t *mmac_info; - int err = 0; - nxge_status_t status; - - if (!mac_unicst_verify(nxgep->mach, maddr->mma_addr, - maddr->mma_addrlen)) - return (EINVAL); - - slot = maddr->mma_slot; - - mutex_enter(nxgep->genlock); - - /* - * Make sure that nxge is initialized, if _start() has - * not been called. - */ - if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { - status = nxge_init(nxgep); - if (status != NXGE_OK) { - mutex_exit(nxgep->genlock); - return (ENXIO); - } - } - - mmac_info = &nxgep->nxge_mmac_info; - if (slot < 1 || slot > mmac_info->num_mmac) { - mutex_exit(nxgep->genlock); - return (EINVAL); - } - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) { - if ((err = nxge_altmac_set(nxgep, - maddr->mma_addr, slot, - nxgep->pt_config.hw_config.def_mac_rxdma_grpid)) != 0) { - bcopy(maddr->mma_addr, mmac_info->mac_pool[slot].addr, - ETHERADDRL); - /* - * Assume that the MAC passed down from the caller - * is not a factory MAC address (The user should - * call mmac_remove followed by mmac_reserve if - * he wants to use the factory MAC for this slot). 
- */ - mmac_info->mac_pool[slot].flags &= ~MMAC_VENDOR_ADDR; - nxge_mmac_kstat_update(nxgep, slot, B_FALSE); - } - } else { - err = EINVAL; - } - mutex_exit(nxgep->genlock); - return (err); -} - -/* - * nxge_m_mmac_get() - Get the MAC address and other information - * related to the slot. mma_flags should be set to 0 in the call. - * Note: although kstat shows MAC address as zero when a slot is - * not used, Crossbow expects nxge_m_mmac_get to copy factory MAC - * to the caller as long as the slot is not using a user MAC address. - * The following table shows the rules, - * - * USED VENDOR mma_addr - * ------------------------------------------------------------ - * (1) Slot uses a user MAC: yes no user MAC - * (2) Slot uses a factory MAC: yes yes factory MAC - * (3) Slot is not used but is - * factory MAC capable: no yes factory MAC - * (4) Slot is not used and is - * not factory MAC capable: no no 0 - * ------------------------------------------------------------ + * The callback to query all the factory addresses. naddr must be the same as + * the number of factory addresses (returned by MAC_CAPAB_MULTIFACTADDR), and + * mcm_addr is the space allocated for keep all the addresses, whose size is + * naddr * MAXMACADDRLEN. */ -static int -nxge_m_mmac_get(void *arg, mac_multi_addr_t *maddr) +static void +nxge_m_getfactaddr(void *arg, uint_t naddr, uint8_t *addr) { - nxge_t *nxgep = arg; - mac_addr_slot_t slot; - nxge_mmac_t *mmac_info; - nxge_status_t status; - - slot = maddr->mma_slot; + nxge_t *nxgep = arg; + nxge_mmac_t *mmac_info; + int i; mutex_enter(nxgep->genlock); - /* - * Make sure that nxge is initialized, if _start() has - * not been called. - */ - if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { - status = nxge_init(nxgep); - if (status != NXGE_OK) { - mutex_exit(nxgep->genlock); - return (ENXIO); - } - } - mmac_info = &nxgep->nxge_mmac_info; + ASSERT(naddr == mmac_info->num_factory_mmac); - if (slot < 1 || slot > mmac_info->num_mmac) { - mutex_exit(nxgep->genlock); - return (EINVAL); + for (i = 0; i < naddr; i++) { + bcopy(mmac_info->factory_mac_pool[i + 1], + addr + i * MAXMACADDRLEN, ETHERADDRL); } - maddr->mma_flags = 0; - if (mmac_info->mac_pool[slot].flags & MMAC_SLOT_USED) - maddr->mma_flags |= MMAC_SLOT_USED; - if (mmac_info->mac_pool[slot].flags & MMAC_VENDOR_ADDR) { - maddr->mma_flags |= MMAC_VENDOR_ADDR; - bcopy(mmac_info->factory_mac_pool[slot], - maddr->mma_addr, ETHERADDRL); - maddr->mma_addrlen = ETHERADDRL; - } else { - if (maddr->mma_flags & MMAC_SLOT_USED) { - bcopy(mmac_info->mac_pool[slot].addr, - maddr->mma_addr, ETHERADDRL); - maddr->mma_addrlen = ETHERADDRL; - } else { - bzero(maddr->mma_addr, ETHERADDRL); - maddr->mma_addrlen = 0; - } - } mutex_exit(nxgep->genlock); - return (0); } + static boolean_t nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { nxge_t *nxgep = arg; uint32_t *txflags = cap_data; - multiaddress_capab_t *mmacp = cap_data; switch (cap) { case MAC_CAPAB_HCKSUM: @@ -4495,33 +4208,15 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) } break; - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE stating that we support polling is sufficient. 
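nxge_m_getfactaddr() above copies each factory MAC into the caller's buffer at a stride of MAXMACADDRLEN, reading from pool slot i + 1 because slot 0 holds the primary address. The copy, standalone (constants assumed):

    #include <stdint.h>
    #include <string.h>

    #define ETHERADDRL      6

    static void
    copy_factory_addrs(uint8_t *dst, size_t stride,
        const uint8_t pool[][ETHERADDRL], unsigned naddr)
    {
            unsigned i;

            for (i = 0; i < naddr; i++)     /* factory slots begin at 1 */
                    memcpy(dst + i * stride, pool[i + 1], ETHERADDRL);
    }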
- */ - break; + case MAC_CAPAB_MULTIFACTADDR: { + mac_capab_multifactaddr_t *mfacp = cap_data; - case MAC_CAPAB_MULTIADDRESS: - mmacp = (multiaddress_capab_t *)cap_data; mutex_enter(nxgep->genlock); - - mmacp->maddr_naddr = nxgep->nxge_mmac_info.num_mmac; - mmacp->maddr_naddrfree = nxgep->nxge_mmac_info.naddrfree; - mmacp->maddr_flag = 0; /* 0 is required by PSARC2006/265 */ - /* - * maddr_handle is driver's private data, passed back to - * entry point functions as arg. - */ - mmacp->maddr_handle = nxgep; - mmacp->maddr_add = nxge_m_mmac_add; - mmacp->maddr_remove = nxge_m_mmac_remove; - mmacp->maddr_modify = nxge_m_mmac_modify; - mmacp->maddr_get = nxge_m_mmac_get; - mmacp->maddr_reserve = nxge_m_mmac_reserve; - + mfacp->mcm_naddr = nxgep->nxge_mmac_info.num_factory_mmac; + mfacp->mcm_getaddr = nxge_m_getfactaddr; mutex_exit(nxgep->genlock); break; + } case MAC_CAPAB_LSO: { mac_capab_lso_t *cap_lso = cap_data; @@ -4541,39 +4236,49 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) } } -#if defined(sun4v) case MAC_CAPAB_RINGS: { - mac_capab_rings_t *mrings = (mac_capab_rings_t *)cap_data; - - /* - * Only the service domain driver responds to - * this capability request. - */ - if (isLDOMservice(nxgep)) { - mrings->mr_handle = (void *)nxgep; + mac_capab_rings_t *cap_rings = cap_data; + p_nxge_hw_pt_cfg_t p_cfgp = &nxgep->pt_config.hw_config; - /* - * No dynamic allocation of groups and - * rings at this time. Shares dictate the - * configuration. - */ - mrings->mr_gadd_ring = NULL; - mrings->mr_grem_ring = NULL; - mrings->mr_rget = NULL; - mrings->mr_gget = nxge_hio_group_get; - - if (mrings->mr_type == MAC_RING_TYPE_RX) { - mrings->mr_rnum = 8; /* XXX */ - mrings->mr_gnum = 6; /* XXX */ + mutex_enter(nxgep->genlock); + if (cap_rings->mr_type == MAC_RING_TYPE_RX) { + cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC; + cap_rings->mr_rnum = p_cfgp->max_rdcs; + cap_rings->mr_rget = nxge_fill_ring; + cap_rings->mr_gnum = p_cfgp->max_rdc_grpids; + cap_rings->mr_gget = nxge_hio_group_get; + cap_rings->mr_gaddring = nxge_group_add_ring; + cap_rings->mr_gremring = nxge_group_rem_ring; + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_m_getcapab: rx nrings[%d] ngroups[%d]", + p_cfgp->max_rdcs, p_cfgp->max_rdc_grpids)); + } else { + cap_rings->mr_group_type = MAC_GROUP_TYPE_DYNAMIC; + cap_rings->mr_rnum = p_cfgp->tdc.count; + cap_rings->mr_rget = nxge_fill_ring; + if (isLDOMservice(nxgep)) { + /* share capable */ + /* Do not report the default ring: hence -1 */ + cap_rings->mr_gnum = + NXGE_MAX_TDC_GROUPS / nxgep->nports - 1; } else { - mrings->mr_rnum = 8; /* XXX */ - mrings->mr_gnum = 0; /* XXX */ + cap_rings->mr_gnum = 0; } - } else - return (B_FALSE); + + cap_rings->mr_gget = nxge_hio_group_get; + cap_rings->mr_gaddring = nxge_group_add_ring; + cap_rings->mr_gremring = nxge_group_rem_ring; + + NXGE_DEBUG_MSG((nxgep, TX_CTL, + "==> nxge_m_getcapab: tx rings # of rings %d", + p_cfgp->tdc.count)); + } + mutex_exit(nxgep->genlock); break; } +#if defined(sun4v) case MAC_CAPAB_SHARES: { mac_capab_share_t *mshares = (mac_capab_share_t *)cap_data; @@ -4581,16 +4286,22 @@ nxge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) * Only the service domain driver responds to * this capability request. 
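In the TX arm of MAC_CAPAB_RINGS above, a share-capable service domain advertises an equal slice of the device's TDC groups per port, minus one; the in-driver comment says "default ring", but what the -1 actually withholds is the default group. Isolated (the function name is illustrative):

    /* e.g. 8 TDC groups on a 2-port device -> 3 exportable groups/port */
    static int
    tx_groups_advertised(int max_tdc_groups, int nports, int share_capable)
    {
            return (share_capable ? max_tdc_groups / nports - 1 : 0);
    }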
*/ + mutex_enter(nxgep->genlock); if (isLDOMservice(nxgep)) { mshares->ms_snum = 3; mshares->ms_handle = (void *)nxgep; mshares->ms_salloc = nxge_hio_share_alloc; mshares->ms_sfree = nxge_hio_share_free; - mshares->ms_sadd = NULL; - mshares->ms_sremove = NULL; + mshares->ms_sadd = nxge_hio_share_add_group; + mshares->ms_sremove = nxge_hio_share_rem_group; mshares->ms_squery = nxge_hio_share_query; - } else + mshares->ms_sbind = nxge_hio_share_bind; + mshares->ms_sunbind = nxge_hio_share_unbind; + mutex_exit(nxgep->genlock); + } else { + mutex_exit(nxgep->genlock); return (B_FALSE); + } break; } #endif @@ -5160,12 +4871,6 @@ nxge_set_priv_prop(p_nxge_t nxgep, const char *pr_name, uint_t pr_valsize, } if (strcmp(pr_name, "_soft_lso_enable") == 0) { - if (nxgep->nxge_mac_state == NXGE_MAC_STARTED) { - NXGE_DEBUG_MSG((nxgep, NXGE_CTL, - "==> nxge_set_priv_prop: name %s (busy)", pr_name)); - err = EBUSY; - return (err); - } if (pr_val == NULL) { NXGE_DEBUG_MSG((nxgep, NXGE_CTL, "==> nxge_set_priv_prop: name %s (null)", pr_name)); @@ -5695,6 +5400,290 @@ _info(struct modinfo *modinfop) } /*ARGSUSED*/ +static int +nxge_tx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num) +{ + p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver; + p_nxge_t nxgep = rhp->nxgep; + uint32_t channel; + p_tx_ring_t ring; + + channel = nxgep->pt_config.hw_config.tdc.start + rhp->index; + ring = nxgep->tx_rings->rings[channel]; + + MUTEX_ENTER(&ring->lock); + ring->tx_ring_handle = rhp->ring_handle; + MUTEX_EXIT(&ring->lock); + + return (0); +} + +static void +nxge_tx_ring_stop(mac_ring_driver_t rdriver) +{ + p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver; + p_nxge_t nxgep = rhp->nxgep; + uint32_t channel; + p_tx_ring_t ring; + + channel = nxgep->pt_config.hw_config.tdc.start + rhp->index; + ring = nxgep->tx_rings->rings[channel]; + + MUTEX_ENTER(&ring->lock); + ring->tx_ring_handle = (mac_ring_handle_t)NULL; + MUTEX_EXIT(&ring->lock); +} + +static int +nxge_rx_ring_start(mac_ring_driver_t rdriver, uint64_t mr_gen_num) +{ + p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver; + p_nxge_t nxgep = rhp->nxgep; + uint32_t channel; + p_rx_rcr_ring_t ring; + int i; + + channel = nxgep->pt_config.hw_config.start_rdc + rhp->index; + ring = nxgep->rx_rcr_rings->rcr_rings[channel]; + + MUTEX_ENTER(&ring->lock); + + if (nxgep->rx_channel_started[channel] == B_TRUE) { + MUTEX_EXIT(&ring->lock); + return (0); + } + + /* set rcr_ring */ + for (i = 0; i < nxgep->ldgvp->maxldvs; i++) { + if ((nxgep->ldgvp->ldvp[i].is_rxdma == 1) && + (nxgep->ldgvp->ldvp[i].channel == channel)) { + ring->ldvp = &nxgep->ldgvp->ldvp[i]; + ring->ldgp = nxgep->ldgvp->ldvp[i].ldgp; + } + } + + nxgep->rx_channel_started[channel] = B_TRUE; + ring->rcr_mac_handle = rhp->ring_handle; + ring->rcr_gen_num = mr_gen_num; + MUTEX_EXIT(&ring->lock); + + return (0); +} + +static void +nxge_rx_ring_stop(mac_ring_driver_t rdriver) +{ + p_nxge_ring_handle_t rhp = (p_nxge_ring_handle_t)rdriver; + p_nxge_t nxgep = rhp->nxgep; + uint32_t channel; + p_rx_rcr_ring_t ring; + + channel = nxgep->pt_config.hw_config.start_rdc + rhp->index; + ring = nxgep->rx_rcr_rings->rcr_rings[channel]; + + MUTEX_ENTER(&ring->lock); + nxgep->rx_channel_started[channel] = B_FALSE; + ring->rcr_mac_handle = NULL; + MUTEX_EXIT(&ring->lock); +} + +/* + * Callback function for the MAC layer to register all rings.
+ */ +static void +nxge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + p_nxge_t nxgep = (p_nxge_t)arg; + p_nxge_hw_pt_cfg_t p_cfgp = &nxgep->pt_config.hw_config; + + NXGE_DEBUG_MSG((nxgep, TX_CTL, + "==> nxge_fill_ring 0x%x index %d", rtype, index)); + + switch (rtype) { + case MAC_RING_TYPE_TX: { + p_nxge_ring_handle_t rhandlep; + + NXGE_DEBUG_MSG((nxgep, TX_CTL, + "==> nxge_fill_ring (TX) 0x%x index %d ntdcs %d", + rtype, index, p_cfgp->tdc.count)); + + ASSERT((index >= 0) && (index < p_cfgp->tdc.count)); + rhandlep = &nxgep->tx_ring_handles[index]; + rhandlep->nxgep = nxgep; + rhandlep->index = index; + rhandlep->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)rhandlep; + infop->mri_start = nxge_tx_ring_start; + infop->mri_stop = nxge_tx_ring_stop; + infop->mri_tx = nxge_tx_ring_send; + + break; + } + case MAC_RING_TYPE_RX: { + p_nxge_ring_handle_t rhandlep; + int nxge_rindex; + mac_intr_t nxge_mac_intr; + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_fill_ring (RX) 0x%x index %d nrdcs %d", + rtype, index, p_cfgp->max_rdcs)); + + /* + * 'index' is the ring index within the group. + * Find the ring index in the nxge instance. + */ + nxge_rindex = nxge_get_rxring_index(nxgep, rg_index, index); + + ASSERT((nxge_rindex >= 0) && (nxge_rindex < p_cfgp->max_rdcs)); + rhandlep = &nxgep->rx_ring_handles[nxge_rindex]; + rhandlep->nxgep = nxgep; + rhandlep->index = nxge_rindex; + rhandlep->ring_handle = rh; + + /* + * Entry points to enable the interrupt (disable poll) and + * disable the interrupt (enable poll). + */ + nxge_mac_intr.mi_handle = (mac_intr_handle_t)rhandlep; + nxge_mac_intr.mi_enable = (mac_intr_enable_t)nxge_disable_poll; + nxge_mac_intr.mi_disable = (mac_intr_disable_t)nxge_enable_poll; + infop->mri_driver = (mac_ring_driver_t)rhandlep; + infop->mri_start = nxge_rx_ring_start; + infop->mri_stop = nxge_rx_ring_stop; + infop->mri_intr = nxge_mac_intr; /* ??? */ + infop->mri_poll = nxge_rx_poll; + + break; + } + default: + break; + } + + NXGE_DEBUG_MSG((nxgep, DDI_CTL, "<== nxge_fill_ring 0x%x", + rtype)); +} + +static void +nxge_group_add_ring(mac_group_driver_t gh, mac_ring_driver_t rh, + mac_ring_type_t type) +{ + nxge_ring_group_t *rgroup = (nxge_ring_group_t *)gh; + nxge_ring_handle_t *rhandle = (nxge_ring_handle_t *)rh; + nxge_t *nxge; + nxge_grp_t *grp; + nxge_rdc_grp_t *rdc_grp; + uint16_t channel; /* device-wise ring id */ + int dev_gindex; + int rv; + + nxge = rgroup->nxgep; + + switch (type) { + case MAC_RING_TYPE_TX: + /* + * nxge_grp_dc_add takes a channel number which is a + * "device" ring ID. + */ + channel = nxge->pt_config.hw_config.tdc.start + rhandle->index; + + /* + * Remove the ring from the default group + */ + if (rgroup->gindex != 0) { + (void) nxge_grp_dc_remove(nxge, VP_BOUND_TX, channel); + } + + /* + * nxge->tx_set.group[] is an array of groups indexed by + * a "port" group ID. + */ + grp = nxge->tx_set.group[rgroup->gindex]; + rv = nxge_grp_dc_add(nxge, grp, VP_BOUND_TX, channel); + if (rv != 0) { + NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, + "nxge_group_add_ring: nxge_grp_dc_add failed")); + } + break; + + case MAC_RING_TYPE_RX: + /* + * nxge->rx_set.group[] is an array of groups indexed by + * a "port" group ID.
+ */ + grp = nxge->rx_set.group[rgroup->gindex]; + + dev_gindex = nxge->pt_config.hw_config.def_mac_rxdma_grpid + + rgroup->gindex; + rdc_grp = &nxge->pt_config.rdc_grps[dev_gindex]; + + /* + * nxge_grp_dc_add takes a channel number which is a + * "device" ring ID. + */ + channel = nxge->pt_config.hw_config.start_rdc + rhandle->index; + rv = nxge_grp_dc_add(nxge, grp, VP_BOUND_RX, channel); + if (rv != 0) { + NXGE_ERROR_MSG((nxge, NXGE_ERR_CTL, + "nxge_group_add_ring: nxge_grp_dc_add failed")); + } + + rdc_grp->map |= (1 << channel); + rdc_grp->max_rdcs++; + + (void) nxge_init_fzc_rdc_tbl(nxge, rgroup->rdctbl); + break; + } +} + +static void +nxge_group_rem_ring(mac_group_driver_t gh, mac_ring_driver_t rh, + mac_ring_type_t type) +{ + nxge_ring_group_t *rgroup = (nxge_ring_group_t *)gh; + nxge_ring_handle_t *rhandle = (nxge_ring_handle_t *)rh; + nxge_t *nxge; + uint16_t channel; /* device-wise ring id */ + nxge_rdc_grp_t *rdc_grp; + int dev_gindex; + + nxge = rgroup->nxgep; + + switch (type) { + case MAC_RING_TYPE_TX: + dev_gindex = nxge->pt_config.hw_config.def_mac_txdma_grpid + + rgroup->gindex; + channel = nxge->pt_config.hw_config.tdc.start + rhandle->index; + nxge_grp_dc_remove(nxge, VP_BOUND_TX, channel); + + /* + * Add the ring back to the default group + */ + if (rgroup->gindex != 0) { + nxge_grp_t *grp; + grp = nxge->tx_set.group[0]; + (void) nxge_grp_dc_add(nxge, grp, VP_BOUND_TX, channel); + } + break; + + case MAC_RING_TYPE_RX: + dev_gindex = nxge->pt_config.hw_config.def_mac_rxdma_grpid + + rgroup->gindex; + rdc_grp = &nxge->pt_config.rdc_grps[dev_gindex]; + channel = rdc_grp->start_rdc + rhandle->index; + nxge_grp_dc_remove(nxge, VP_BOUND_RX, channel); + + rdc_grp->map &= ~(1 << channel); + rdc_grp->max_rdcs--; + + (void) nxge_init_fzc_rdc_tbl(nxge, rgroup->rdctbl); + break; + } +} + + +/*ARGSUSED*/ static nxge_status_t nxge_add_intrs(p_nxge_t nxgep) { @@ -5818,33 +5807,6 @@ nxge_add_intrs(p_nxge_t nxgep) return (status); } -/*ARGSUSED*/ -static nxge_status_t -nxge_add_soft_intrs(p_nxge_t nxgep) -{ - - int ddi_status = DDI_SUCCESS; - nxge_status_t status = NXGE_OK; - - NXGE_DEBUG_MSG((nxgep, DDI_CTL, "==> nxge_add_soft_intrs")); - - nxgep->resched_id = NULL; - nxgep->resched_running = B_FALSE; - ddi_status = ddi_add_softintr(nxgep->dip, DDI_SOFTINT_LOW, - &nxgep->resched_id, - NULL, NULL, nxge_reschedule, (caddr_t)nxgep); - if (ddi_status != DDI_SUCCESS) { - NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, "<== nxge_add_soft_intrs: " - "ddi_add_softintrs failed: status 0x%08x", - ddi_status)); - return (NXGE_ERROR | NXGE_DDI_FAILED); - } - - NXGE_DEBUG_MSG((nxgep, DDI_CTL, "<== nxge_ddi_add_soft_intrs")); - - return (status); -} - static nxge_status_t nxge_add_intrs_adv(p_nxge_t nxgep) { @@ -6277,21 +6239,6 @@ nxge_remove_intrs(p_nxge_t nxgep) /*ARGSUSED*/ static void -nxge_remove_soft_intrs(p_nxge_t nxgep) -{ - NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_remove_soft_intrs")); - if (nxgep->resched_id) { - ddi_remove_softintr(nxgep->resched_id); - NXGE_DEBUG_MSG((nxgep, INT_CTL, - "==> nxge_remove_soft_intrs: removed")); - nxgep->resched_id = NULL; - } - - NXGE_DEBUG_MSG((nxgep, INT_CTL, "<== nxge_remove_soft_intrs")); -} - -/*ARGSUSED*/ -static void nxge_intrs_enable(p_nxge_t nxgep) { p_nxge_intr_t intrp; @@ -6389,6 +6336,7 @@ nxge_mac_register(p_nxge_t nxgep) macp->m_margin = VLAN_TAGSZ; macp->m_priv_props = nxge_priv_props; macp->m_priv_prop_count = NXGE_MAX_PRIV_PROPS; + macp->m_v12n = MAC_VIRT_HIO | MAC_VIRT_LEVEL1 | MAC_VIRT_SERIALIZE; NXGE_DEBUG_MSG((nxgep, MAC_CTL, "==>
nxge_mac_register: instance %d " @@ -6941,7 +6889,7 @@ nxge_niu_peu_reset(p_nxge_t nxgep) static void nxge_set_pci_replay_timeout(p_nxge_t nxgep) { - p_dev_regs_t dev_regs; + p_dev_regs_t dev_regs; uint32_t value; NXGE_DEBUG_MSG((nxgep, DDI_CTL, "==> nxge_set_pci_replay_timeout")); diff --git a/usr/src/uts/common/io/nxge/nxge_ndd.c b/usr/src/uts/common/io/nxge/nxge_ndd.c index 90c8128428..38bf3d5969 100644 --- a/usr/src/uts/common/io/nxge/nxge_ndd.c +++ b/usr/src/uts/common/io/nxge/nxge_ndd.c @@ -980,15 +980,13 @@ nxge_param_get_txdma_info(p_nxge_t nxgep, queue_t *q, p_mblk_t mp, caddr_t cp) mp->b_cont = np; print_len = 0; - ((mblk_t *)np)->b_wptr += print_len; - buf_len -= print_len; print_len = snprintf((char *)((mblk_t *)np)->b_wptr, buf_len, "TDC\t HW TDC\t\n"); ((mblk_t *)np)->b_wptr += print_len; buf_len -= print_len; set = &nxgep->tx_set; - for (tdc = 0; tdc < NXGE_MAX_RDCS; tdc++) { + for (tdc = 0; tdc < NXGE_MAX_TDCS; tdc++) { if ((1 << tdc) & set->owned.map) { print_len = snprintf((char *)((mblk_t *)np)->b_wptr, buf_len, "%d\n", tdc); diff --git a/usr/src/uts/common/io/nxge/nxge_rxdma.c b/usr/src/uts/common/io/nxge/nxge_rxdma.c index e0e81491c6..8aeb88f7c5 100644 --- a/usr/src/uts/common/io/nxge/nxge_rxdma.c +++ b/usr/src/uts/common/io/nxge/nxge_rxdma.c @@ -39,6 +39,13 @@ (rdc + nxgep->pt_config.hw_config.start_rdc) /* + * XXX: This is a tunable to limit the number of packets each interrupt + * handles. 0 (default) means that each interrupt takes as many packets + * as it finds. + */ +extern int nxge_max_intr_pkts; + +/* * Globals: tunable parameters (/etc/system or adb) * */ @@ -115,7 +122,7 @@ nxge_status_t nxge_disable_rxdma_channel(p_nxge_t, uint16_t); static p_rx_msg_t nxge_allocb(size_t, uint32_t, p_nxge_dma_common_t); static void nxge_freeb(p_rx_msg_t); -static void nxge_rx_pkts_vring(p_nxge_t, uint_t, rx_dma_ctl_stat_t); +static mblk_t *nxge_rx_pkts_vring(p_nxge_t, uint_t, rx_dma_ctl_stat_t); static nxge_status_t nxge_rx_err_evnts(p_nxge_t, int, rx_dma_ctl_stat_t); static nxge_status_t nxge_rxdma_handle_port_errors(p_nxge_t, @@ -137,8 +144,10 @@ nxge_status_t nxge_init_rxdma_channels(p_nxge_t nxgep) { nxge_grp_set_t *set = &nxgep->rx_set; - int i, count, rdc, channel; + int i, count, channel; nxge_grp_t *group; + dc_map_t map; + int dev_gindex; NXGE_DEBUG_MSG((nxgep, MEM2_CTL, "==> nxge_init_rxdma_channels")); @@ -158,9 +167,11 @@ nxge_init_rxdma_channels(p_nxge_t nxgep) for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) { if ((1 << i) & set->lg.map) { group = set->group[i]; - + dev_gindex = + nxgep->pt_config.hw_config.def_mac_rxdma_grpid + i; + map = nxgep->pt_config.rdc_grps[dev_gindex].map; for (channel = 0; channel < NXGE_MAX_RDCS; channel++) { - if ((1 << channel) & group->map) { + if ((1 << channel) & map) { if ((nxge_grp_dc_add(nxgep, group, VP_BOUND_RX, channel))) goto init_rxdma_channels_exit; @@ -178,15 +189,16 @@ init_rxdma_channels_exit: for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) { if ((1 << i) & set->lg.map) { group = set->group[i]; - - for (rdc = 0; rdc < NXGE_MAX_RDCS; rdc++) { - if ((1 << rdc) & group->map) { + dev_gindex = + nxgep->pt_config.hw_config.def_mac_rxdma_grpid + i; + map = nxgep->pt_config.rdc_grps[dev_gindex].map; + for (channel = 0; channel < NXGE_MAX_RDCS; channel++) { + if ((1 << channel) & map) { nxge_grp_dc_remove(nxgep, - VP_BOUND_RX, rdc); + VP_BOUND_RX, channel); } } } - if (++count == set->lg.count) break; } @@ -1175,35 +1187,6 @@ nxge_rxdma_regs_dump(p_nxge_t nxgep, int rdc) "<== nxge_rxdma_regs_dump: rdc rdc %d",
rdc)); } -void -nxge_rxdma_stop(p_nxge_t nxgep) -{ - NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rxdma_stop")); - - (void) nxge_link_monitor(nxgep, LINK_MONITOR_STOP); - (void) nxge_rx_mac_disable(nxgep); - (void) nxge_rxdma_hw_mode(nxgep, NXGE_DMA_STOP); - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rxdma_stop")); -} - -void -nxge_rxdma_stop_reinit(p_nxge_t nxgep) -{ - NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rxdma_stop_reinit")); - - (void) nxge_rxdma_stop(nxgep); - (void) nxge_uninit_rxdma_channels(nxgep); - (void) nxge_init_rxdma_channels(nxgep); - -#ifndef AXIS_DEBUG_LB - (void) nxge_xcvr_init(nxgep); - (void) nxge_link_monitor(nxgep, LINK_MONITOR_START); -#endif - (void) nxge_rx_mac_enable(nxgep); - - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rxdma_stop_reinit")); -} - nxge_status_t nxge_rxdma_hw_mode(p_nxge_t nxgep, boolean_t enable) { @@ -1438,11 +1421,53 @@ nxge_rxdma_fixup_channel_fail: NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rxdma_fixup_channel")); } -/* ARGSUSED */ +/* + * Convert an absolute RDC number to a Receive Buffer Ring index. That is, + * map <channel> to an index into nxgep->rx_rbr_rings. + * (device ring index -> port ring index) + */ int nxge_rxdma_get_ring_index(p_nxge_t nxgep, uint16_t channel) { - return (channel); + int i, ndmas; + uint16_t rdc; + p_rx_rbr_rings_t rx_rbr_rings; + p_rx_rbr_ring_t *rbr_rings; + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_rxdma_get_ring_index: channel %d", channel)); + + rx_rbr_rings = nxgep->rx_rbr_rings; + if (rx_rbr_rings == NULL) { + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "<== nxge_rxdma_get_ring_index: NULL ring pointer")); + return (-1); + } + ndmas = rx_rbr_rings->ndmas; + if (!ndmas) { + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "<== nxge_rxdma_get_ring_index: no channel")); + return (-1); + } + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_rxdma_get_ring_index (ndmas %d)", ndmas)); + + rbr_rings = rx_rbr_rings->rbr_rings; + for (i = 0; i < ndmas; i++) { + rdc = rbr_rings[i]->rdc; + if (channel == rdc) { + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "==> nxge_rxdma_get_rbr_ring: channel %d " + "(index %d) ring %d", channel, i, rbr_rings[i])); + return (i); + } + } + + NXGE_DEBUG_MSG((nxgep, RX_CTL, + "<== nxge_rxdma_get_rbr_ring_index: not found")); + + return (-1); } p_rx_rbr_ring_t @@ -1792,11 +1817,12 @@ nxge_rx_intr(void *arg1, void *arg2) uint8_t channel; npi_handle_t handle; rx_dma_ctl_stat_t cs; + p_rx_rcr_ring_t rcr_ring; + mblk_t *mp; #ifdef NXGE_DEBUG rxdma_cfig1_t cfg; #endif - uint_t serviced = DDI_INTR_UNCLAIMED; if (ldvp == NULL) { NXGE_DEBUG_MSG((NULL, INT_CTL, @@ -1826,11 +1852,37 @@ nxge_rx_intr(void *arg1, void *arg2) * receive dma channel. */ handle = NXGE_DEV_NPI_HANDLE(nxgep); + + rcr_ring = nxgep->rx_rcr_rings->rcr_rings[ldvp->vdma_index]; + + /* + * The RCR ring lock must be held when packets + * are being processed and the hardware registers are + * being read or written to prevent race condition + * among the interrupt thread, the polling thread + * (will cause fatal errors such as rcrincon bit set) + * and the setting of the poll_flag. + */ + MUTEX_ENTER(&rcr_ring->lock); + /* * Get the control and status for this channel. 
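The rewritten nxge_rxdma_get_ring_index() above drops the old identity mapping and instead searches the per-port RBR table, so a device-wide channel translates correctly even when the port owns a sparse channel set. The lookup reduced to its core (the array layout is assumed):

    #include <stdint.h>

    /* rdc_of_ring[i] is the device channel backing port ring i. */
    static int
    ring_index(const uint16_t *rdc_of_ring, int ndmas, uint16_t channel)
    {
            int i;

            for (i = 0; i < ndmas; i++) {
                    if (rdc_of_ring[i] == channel)
                            return (i);
            }
            return (-1);
    }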
*/ channel = ldvp->channel; ldgp = ldvp->ldgp; + + if (!isLDOMguest(nxgep)) { + if (!nxgep->rx_channel_started[channel]) { + NXGE_DEBUG_MSG((nxgep, INT_CTL, + "<== nxge_rx_intr: channel is not started")); + MUTEX_EXIT(&rcr_ring->lock); + return (DDI_INTR_CLAIMED); + } + } + + ASSERT(rcr_ring->ldgp == ldgp); + ASSERT(rcr_ring->ldvp == ldvp); + RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, channel, &cs.value); NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_intr:channel %d " @@ -1840,15 +1892,13 @@ nxge_rx_intr(void *arg1, void *arg2) cs.bits.hdw.rcrto, cs.bits.hdw.rcrthres)); - nxge_rx_pkts_vring(nxgep, ldvp->vdma_index, cs); - serviced = DDI_INTR_CLAIMED; + mp = nxge_rx_pkts_vring(nxgep, ldvp->vdma_index, cs); /* error events. */ if (cs.value & RX_DMA_CTL_STAT_ERROR) { (void) nxge_rx_err_evnts(nxgep, channel, cs); } -nxge_intr_exit: /* * Enable the mailbox update interrupt if we want * to use mailbox. We probably don't need to use @@ -1856,40 +1906,82 @@ nxge_intr_exit: * Also write 1 to rcrthres and rcrto to clear * these two edge triggered bits. */ - cs.value &= RX_DMA_CTL_STAT_WR1C; - cs.bits.hdw.mex = 1; + cs.bits.hdw.mex = rcr_ring->poll_flag ? 0 : 1; RXDMA_REG_WRITE64(handle, RX_DMA_CTL_STAT_REG, channel, cs.value); /* - * Rearm this logical group if this is a single device - * group. + * If the polling mode is enabled, disable the interrupt. */ - if (ldgp->nldvs == 1) { - ldgimgm_t mgm; - mgm.value = 0; - mgm.bits.ldw.arm = 1; - mgm.bits.ldw.timer = ldgp->ldg_timer; - if (isLDOMguest(nxgep)) { - nxge_hio_ldgimgn(nxgep, ldgp); - } else { + if (rcr_ring->poll_flag) { + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_rx_intr: rdc %d ldgp $%p ldvp $%p " + "(disabling interrupts)", channel, ldgp, ldvp)); + /* + * Disarm this logical group if this is a single device + * group. + */ + if (ldgp->nldvs == 1) { + ldgimgm_t mgm; + mgm.value = 0; + mgm.bits.ldw.arm = 0; NXGE_REG_WR64(handle, - LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), - mgm.value); + LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value); + } + } else { + /* + * Rearm this logical group if this is a single device group. + */ + if (ldgp->nldvs == 1) { + if (isLDOMguest(nxgep)) { + nxge_hio_ldgimgn(nxgep, ldgp); + } else { + ldgimgm_t mgm; + + mgm.value = 0; + mgm.bits.ldw.arm = 1; + mgm.bits.ldw.timer = ldgp->ldg_timer; + + NXGE_REG_WR64(handle, + LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), + mgm.value); + } } + + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_rx_intr: rdc %d ldgp $%p " + "exiting ISR (and call mac_rx_ring)", channel, ldgp)); } + MUTEX_EXIT(&rcr_ring->lock); - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_intr: serviced %d", - serviced)); - return (serviced); + if (mp) { + if (!isLDOMguest(nxgep)) + mac_rx_ring(nxgep->mach, rcr_ring->rcr_mac_handle, mp, + rcr_ring->rcr_gen_num); +#if defined(sun4v) + else { /* isLDOMguest(nxgep) */ + nxge_hio_data_t *nhd = (nxge_hio_data_t *) + nxgep->nxge_hw_p->hio; + nx_vio_fp_t *vio = &nhd->hio.vio; + + if (vio->cb.vio_net_rx_cb) { + (*vio->cb.vio_net_rx_cb) + (nxgep->hio_vr->vhp, mp); + } + } +#endif + } + NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_intr: DDI_INTR_CLAIMED")); + return (DDI_INTR_CLAIMED); } /* * Process the packets received in the specified logical device * and pass up a chain of message blocks to the upper layer. + * The RCR ring lock must be held before calling this function. 
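The exit path of nxge_rx_intr() above makes two decisions from poll_flag, both under the RCR ring lock: whether to request another mailbox update (the mex bit) and what to do with a single-device logical group. As a decision table in code (a restatement, not driver code; multi-device groups are left untouched either way):

    #include <stdbool.h>

    enum ldg_action { LDG_LEAVE, LDG_DISARM, LDG_ARM };

    struct isr_exit {
            bool mex;               /* request another mailbox update */
            enum ldg_action action; /* disposition of the logical group */
    };

    static struct isr_exit
    isr_exit_policy(bool poll_flag, int nldvs)
    {
            struct isr_exit e;

            e.mex = !poll_flag;
            if (nldvs != 1)
                    e.action = LDG_LEAVE;
            else
                    e.action = poll_flag ? LDG_DISARM : LDG_ARM;
            return (e);
    }

While poll_flag is set the group stays disarmed, so packets move only through the mri_poll entry point until nxge_disable_poll() rearms it.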
*/ -static void +static mblk_t * nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs) { p_mblk_t mp; @@ -1897,15 +1989,14 @@ nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs) NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_pkts_vring")); rcrp = nxgep->rx_rcr_rings->rcr_rings[vindex]; - if (rcrp->poll_flag) { - /* It is in the poll mode */ - return; - } + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_rx_pkts_vring: (calling nxge_rx_pkts)rdc %d " + "rcr_mac_handle $%p ", rcrp->rdc, rcrp->rcr_mac_handle)); if ((mp = nxge_rx_pkts(nxgep, rcrp, cs, -1)) == NULL) { NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_pkts_vring: no mp")); - return; + return (NULL); } NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_pkts_vring: $%p", @@ -1947,21 +2038,11 @@ nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs) mp->b_next->b_wptr - mp->b_next->b_rptr))); } #endif + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "<== nxge_rx_pkts_vring: returning rdc %d rcr_mac_handle $%p ", + rcrp->rdc, rcrp->rcr_mac_handle)); - if (!isLDOMguest(nxgep)) - mac_rx(nxgep->mach, rcrp->rcr_mac_handle, mp); -#if defined(sun4v) - else { /* isLDOMguest(nxgep) */ - nxge_hio_data_t *nhd = (nxge_hio_data_t *) - nxgep->nxge_hw_p->hio; - nx_vio_fp_t *vio = &nhd->hio.vio; - - if (vio->cb.vio_net_rx_cb) { - (*vio->cb.vio_net_rx_cb) - (nxgep->hio_vr->vhp, mp); - } - } -#endif + return (mp); } @@ -1978,6 +2059,7 @@ nxge_rx_pkts_vring(p_nxge_t nxgep, uint_t vindex, rx_dma_ctl_stat_t cs) * a hardware control status register will be updated with the number of * packets were removed from the hardware queue. * + * The RCR ring lock is held when entering this function. */ static mblk_t * nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, @@ -1998,7 +2080,7 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, npi_status_t rs = NPI_SUCCESS; #endif - NXGE_DEBUG_MSG((nxgep, RX_CTL, "==> nxge_rx_pkts: " + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, "==> nxge_rx_pkts: " "channel %d", rcr_p->rdc)); if (!(nxgep->drv_state & STATE_HW_INITIALIZED)) { @@ -2032,7 +2114,7 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, if (!qlen) { - NXGE_DEBUG_MSG((nxgep, RX_CTL, + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, "==> nxge_rx_pkts:rcr channel %d " "qlen %d (no pkts)", channel, qlen)); @@ -2140,6 +2222,13 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, (totallen >= bytes_to_pickup)) { break; } + + /* limit the number of packets for interrupt */ + if (!(rcr_p->poll_flag)) { + if (npkt_read == nxge_max_intr_pkts) { + break; + } + } } rcr_p->rcr_desc_rd_head_pp = rcr_desc_rd_head_pp; @@ -2174,7 +2263,9 @@ nxge_rx_pkts(p_nxge_t nxgep, p_rx_rcr_ring_t rcr_p, rx_dma_ctl_stat_t cs, * read. */ - NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_rx_pkts")); + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, "<== nxge_rx_pkts: return" + "channel %d", rcr_p->rdc)); + return (head_mp); } @@ -2280,7 +2371,7 @@ nxge_receive_packet(p_nxge_t nxgep, } /* - * Sofware workaround for BMAC hardware limitation that allows + * Software workaround for BMAC hardware limitation that allows * maxframe size of 1526, instead of 1522 for non-jumbo and 0x2406 * instead of 0x2400 for jumbo. 
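 * For reference (standard Ethernet sizing is assumed here): 1522 is
 * ETHERMAX (1518) plus a 4-byte VLAN tag, and 0x2400 is the 9216-byte
 * jumbo limit, but the smallest limits the BMAC accepts are 1526 and
 * 0x2406. Frames that land between the true maximum and the BMAC
 * maximum must therefore be caught by this software check.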
*/ @@ -2318,7 +2409,6 @@ nxge_receive_packet(p_nxge_t nxgep, hdr_size)); } - MUTEX_ENTER(&rcr_p->lock); MUTEX_ENTER(&rx_rbr_p->lock); NXGE_DEBUG_MSG((nxgep, RX_CTL, @@ -2344,7 +2434,6 @@ nxge_receive_packet(p_nxge_t nxgep, if (status != NXGE_OK) { MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); NXGE_DEBUG_MSG((nxgep, RX_CTL, "<== nxge_receive_packet: found vaddr failed %d", status)); @@ -2392,7 +2481,6 @@ nxge_receive_packet(p_nxge_t nxgep, break; default: MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); return; } @@ -2558,7 +2646,6 @@ nxge_receive_packet(p_nxge_t nxgep, } MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); nxge_freeb(rx_msg_p); return; } @@ -2643,7 +2730,6 @@ nxge_receive_packet(p_nxge_t nxgep, rx_msg_p->free = B_TRUE; } MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); nxge_freeb(rx_msg_p); return; } @@ -2657,7 +2743,6 @@ nxge_receive_packet(p_nxge_t nxgep, rcr_p->rcvd_pkt_bytes = bytes_read; MUTEX_EXIT(&rx_rbr_p->lock); - MUTEX_EXIT(&rcr_p->lock); if (rx_msg_p->free && rx_msg_p->rx_use_bcopy) { atomic_inc_32(&rx_msg_p->ref_cnt); @@ -2682,8 +2767,6 @@ nxge_receive_packet(p_nxge_t nxgep, if (is_valid && !multi) { /* - * Update hardware checksuming. - * * If the checksum flag nxge_chksum_offload * is 1, TCP and UDP packets can be sent * up with good checksum. If the checksum flag @@ -2727,6 +2810,177 @@ nxge_receive_packet(p_nxge_t nxgep, *multi_p, nmp, *mp, *mp_cont)); } +/* + * Enable polling for a ring. The ring's interrupt is disabled when + * the next nxge interrupt arrives (see nxge_rx_intr()). + */ +int +nxge_enable_poll(void *arg) +{ + p_nxge_ring_handle_t ring_handle = (p_nxge_ring_handle_t)arg; + p_rx_rcr_ring_t ringp; + p_nxge_t nxgep; + p_nxge_ldg_t ldgp; + uint32_t channel; + + if (ring_handle == NULL) { + return (0); + } + + nxgep = ring_handle->nxgep; + channel = nxgep->pt_config.hw_config.start_rdc + ring_handle->index; + ringp = nxgep->rx_rcr_rings->rcr_rings[channel]; + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_enable_poll: rdc %d ", ringp->rdc)); + ldgp = ringp->ldgp; + if (ldgp == NULL) { + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_enable_poll: rdc %d NULL ldgp: no change", + ringp->rdc)); + return (0); + } + + MUTEX_ENTER(&ringp->lock); + /* enable polling */ + if (ringp->poll_flag == 0) { + ringp->poll_flag = 1; + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_enable_poll: rdc %d set poll flag to 1", + ringp->rdc)); + } + + MUTEX_EXIT(&ringp->lock); + return (0); +} +/* + * Disable polling for a ring and enable its interrupt. + */ +int +nxge_disable_poll(void *arg) +{ + p_nxge_ring_handle_t ring_handle = (p_nxge_ring_handle_t)arg; + p_rx_rcr_ring_t ringp; + p_nxge_t nxgep; + uint32_t channel; + + if (ring_handle == NULL) { + return (0); + } + + nxgep = ring_handle->nxgep; + channel = nxgep->pt_config.hw_config.start_rdc + ring_handle->index; + ringp = nxgep->rx_rcr_rings->rcr_rings[channel]; + + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_disable_poll: rdc %d poll_flag %d", ringp->rdc, + ringp->poll_flag)); + + MUTEX_ENTER(&ringp->lock); + + /* disable polling: enable interrupt */ + if (ringp->poll_flag) { + npi_handle_t handle; + rx_dma_ctl_stat_t cs; + uint8_t channel; + p_nxge_ldg_t ldgp; + + /* + * Get the control and status for this channel.
+ */ + handle = NXGE_DEV_NPI_HANDLE(nxgep); + channel = ringp->rdc; + RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, + channel, &cs.value); + + /* + * Enable mailbox update. + * Since packets were not read and the hardware uses + * bits pktread and ptrread to update the queue + * length, we need to set both bits to 0. + */ + cs.bits.ldw.pktread = 0; + cs.bits.ldw.ptrread = 0; + cs.bits.hdw.mex = 1; + RXDMA_REG_WRITE64(handle, RX_DMA_CTL_STAT_REG, channel, + cs.value); + + /* + * Rearm this logical group if this is a single device + * group. + */ + ldgp = ringp->ldgp; + if (ldgp == NULL) { + ringp->poll_flag = 0; + MUTEX_EXIT(&ringp->lock); + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_disable_poll: no ldgp rdc %d " + "(still set poll to 0)", ringp->rdc)); + return (0); + } + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_disable_poll: rdc %d ldgp $%p (enable intr)", + ringp->rdc, ldgp)); + if (ldgp->nldvs == 1) { + ldgimgm_t mgm; + mgm.value = 0; + mgm.bits.ldw.arm = 1; + mgm.bits.ldw.timer = ldgp->ldg_timer; + NXGE_REG_WR64(handle, + LDGIMGN_REG + LDSV_OFFSET(ldgp->ldg), mgm.value); + } + ringp->poll_flag = 0; + } + + MUTEX_EXIT(&ringp->lock); + return (0); +} + +/* + * Poll 'bytes_to_pickup' bytes of messages from the rx ring. + */ +mblk_t * +nxge_rx_poll(void *arg, int bytes_to_pickup) +{ + p_nxge_ring_handle_t ring_handle = (p_nxge_ring_handle_t)arg; + p_rx_rcr_ring_t rcr_p; + p_nxge_t nxgep; + npi_handle_t handle; + rx_dma_ctl_stat_t cs; + mblk_t *mblk; + p_nxge_ldv_t ldvp; + uint32_t channel; + + nxgep = ring_handle->nxgep; + + /* + * Get the control and status for this channel. + */ + handle = NXGE_DEV_NPI_HANDLE(nxgep); + channel = nxgep->pt_config.hw_config.start_rdc + ring_handle->index; + rcr_p = nxgep->rx_rcr_rings->rcr_rings[channel]; + MUTEX_ENTER(&rcr_p->lock); + ASSERT(rcr_p->poll_flag == 1); + + RXDMA_REG_READ64(handle, RX_DMA_CTL_STAT_REG, rcr_p->rdc, &cs.value); + + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "==> nxge_rx_poll: calling nxge_rx_pkts: rdc %d poll_flag %d", + rcr_p->rdc, rcr_p->poll_flag)); + mblk = nxge_rx_pkts(nxgep, rcr_p, cs, bytes_to_pickup); + + ldvp = rcr_p->ldvp; + /* error events. */ + if (ldvp && (cs.value & RX_DMA_CTL_STAT_ERROR)) { + (void) nxge_rx_err_evnts(nxgep, ldvp->vdma_index, cs); + } + + MUTEX_EXIT(&rcr_p->lock); + + NXGE_DEBUG_MSG((nxgep, NXGE_ERR_CTL, + "<== nxge_rx_poll: rdc %d mblk $%p", rcr_p->rdc, mblk)); + return (mblk); +} + + /*ARGSUSED*/ static nxge_status_t nxge_rx_err_evnts(p_nxge_t nxgep, int channel, rx_dma_ctl_stat_t cs) @@ -4231,6 +4485,7 @@ nxge_rxdma_stop_channel(p_nxge_t nxgep, uint16_t channel) * Make sure channel is disabled.
*/ status = nxge_disable_rxdma_channel(nxgep, channel); + if (status != NXGE_OK) { NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, " nxge_rxdma_stop_channel: " diff --git a/usr/src/uts/common/io/nxge/nxge_send.c b/usr/src/uts/common/io/nxge/nxge_send.c index 7e656c9072..2b21d22a1c 100644 --- a/usr/src/uts/common/io/nxge/nxge_send.c +++ b/usr/src/uts/common/io/nxge/nxge_send.c @@ -40,8 +40,6 @@ static void nxge_hcksum_retrieve(mblk_t *, uint32_t *, uint32_t *); static uint32_t nxge_csgen(uint16_t *, int); -extern void nxge_txdma_freemsg_task(p_tx_ring_t ringp); - extern uint32_t nxge_reclaim_pending; extern uint32_t nxge_bcopy_thresh; extern uint32_t nxge_dvma_thresh; @@ -51,18 +49,116 @@ extern uint32_t nxge_tx_intr_thres; extern uint32_t nxge_tx_max_gathers; extern uint32_t nxge_tx_tiny_pack; extern uint32_t nxge_tx_use_bcopy; -extern uint32_t nxge_tx_lb_policy; -extern uint32_t nxge_no_tx_lb; extern nxge_tx_mode_t nxge_tx_scheme; uint32_t nxge_lso_kick_cnt = 2; -typedef struct _mac_tx_hint { - uint16_t sap; - uint16_t vid; - void *hash; -} mac_tx_hint_t, *p_mac_tx_hint_t; -int nxge_tx_lb_ring_1(p_mblk_t, uint32_t, p_mac_tx_hint_t); +void +nxge_tx_ring_task(void *arg) +{ + p_tx_ring_t ring = (p_tx_ring_t)arg; + + MUTEX_ENTER(&ring->lock); + (void) nxge_txdma_reclaim(ring->nxgep, ring, 0); + MUTEX_EXIT(&ring->lock); + + if (!isLDOMguest(ring->nxgep) && !ring->tx_ring_offline) + mac_tx_ring_update(ring->nxgep->mach, ring->tx_ring_handle); +#if defined(sun4v) + else { + nxge_hio_data_t *nhd = + (nxge_hio_data_t *)ring->nxgep->nxge_hw_p->hio; + nx_vio_fp_t *vio = &nhd->hio.vio; + + /* Call back vnet. */ + if (vio->cb.vio_net_tx_update) { + (*vio->cb.vio_net_tx_update)(ring->nxgep->hio_vr->vhp); + } + } +#endif +} + +static void +nxge_tx_ring_dispatch(p_tx_ring_t ring) +{ + /* + * Kick the ring task to reclaim some buffers. + */ + (void) ddi_taskq_dispatch(ring->taskq, + nxge_tx_ring_task, (void *)ring, DDI_SLEEP); +} + +mblk_t * +nxge_tx_ring_send(void *arg, mblk_t *mp) +{ + p_nxge_ring_handle_t nrhp = (p_nxge_ring_handle_t)arg; + p_nxge_t nxgep; + p_tx_ring_t tx_ring_p; + int status, channel; + + ASSERT(nrhp != NULL); + nxgep = nrhp->nxgep; + channel = nxgep->pt_config.hw_config.tdc.start + nrhp->index; + tx_ring_p = nxgep->tx_rings->rings[channel]; + + ASSERT(nxgep == tx_ring_p->nxgep); + +#ifdef DEBUG + if (isLDOMservice(nxgep)) { + ASSERT(!tx_ring_p->tx_ring_offline); + } +#endif + + status = nxge_start(nxgep, tx_ring_p, mp); + if (status) { + nxge_tx_ring_dispatch(tx_ring_p); + return (mp); + } + + return ((mblk_t *)NULL); +} + +#if defined(sun4v) + +/* + * nxge_m_tx() is needed for Hybrid I/O operation of the vnet in + * the guest domain. See CR 6778758 for long term solution. + */ + +mblk_t * +nxge_m_tx(void *arg, mblk_t *mp) +{ + p_nxge_t nxgep = (p_nxge_t)arg; + mblk_t *next; + p_tx_ring_t tx_ring_p; + int status; + + NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_m_tx")); + + /* + * Get the default ring handle. 
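+ * A guest domain is assigned a single ring group, so every chain is
+ * sent on ring 0 here. Note the b_next handling below: on a send
+ * failure the unsent remainder of the chain is re-linked and returned
+ * to the caller, which signals the framework to hold the chain and
+ * retry later.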
+ */ + tx_ring_p = nxgep->tx_rings->rings[0]; + + while (mp != NULL) { + next = mp->b_next; + mp->b_next = NULL; + + status = nxge_start(nxgep, tx_ring_p, mp); + if (status != 0) { + mp->b_next = next; + nxge_tx_ring_dispatch(tx_ring_p); + return (mp); + } + + mp = next; + } + + NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_m_tx")); + return ((mblk_t *)NULL); +} + +#endif int nxge_start(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, p_mblk_t mp) @@ -305,8 +401,6 @@ start_again: tx_ring_p->tdc)); goto nxge_start_fail_lso; } else { - boolean_t skip_sched = B_FALSE; - cas32((uint32_t *)&tx_ring_p->queueing, 0, 1); tdc_stats->tx_no_desc++; @@ -316,16 +410,10 @@ start_again: (void) atomic_swap_32( &tx_ring_p->tx_ring_offline, NXGE_TX_RING_OFFLINED); - skip_sched = B_TRUE; } } MUTEX_EXIT(&tx_ring_p->lock); - if (nxgep->resched_needed && - !nxgep->resched_running && !skip_sched) { - nxgep->resched_running = B_TRUE; - ddi_trigger_softintr(nxgep->resched_id); - } status = 1; goto nxge_start_fail1; } @@ -1012,10 +1100,7 @@ nxge_start_control_header_only: MUTEX_EXIT(&tx_ring_p->lock); - nxge_txdma_freemsg_task(tx_ring_p); - NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_start")); - return (status); nxge_start_fail_lso: @@ -1105,8 +1190,6 @@ nxge_start_fail2: tx_ring_p->tx_wrap_mask); } - - nxgep->resched_needed = B_TRUE; } if (isLDOMservice(nxgep)) { @@ -1123,300 +1206,9 @@ nxge_start_fail1: /* Add FMA to check the access handle nxge_hregh */ NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_start")); - - return (status); -} - -int -nxge_serial_tx(mblk_t *mp, void *arg) -{ - p_tx_ring_t tx_ring_p = (p_tx_ring_t)arg; - p_nxge_t nxgep = tx_ring_p->nxgep; - int status = 0; - - if (isLDOMservice(nxgep)) { - if (tx_ring_p->tx_ring_offline) { - freemsg(mp); - return (status); - } - } - - status = nxge_start(nxgep, tx_ring_p, mp); return (status); } -boolean_t -nxge_send(p_nxge_t nxgep, mblk_t *mp, p_mac_tx_hint_t hp) -{ - p_tx_ring_t *tx_rings; - uint8_t ring_index; - p_tx_ring_t tx_ring_p; - nxge_grp_t *group; - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_send")); - - ASSERT(mp->b_next == NULL); - - group = nxgep->tx_set.group[0]; /* The default group */ - ring_index = nxge_tx_lb_ring_1(mp, group->count, hp); - - tx_rings = nxgep->tx_rings->rings; - tx_ring_p = tx_rings[group->legend[ring_index]]; - - if (isLDOMservice(nxgep)) { - if (tx_ring_p->tx_ring_offline) { - /* - * OFFLINE means that it is in the process of being - * shared - that is, it has been claimed by the HIO - * code, but hasn't been unlinked from <group> yet. - * So in this case use the first TDC, which always - * belongs to the service domain and can't be shared. 
- */ - ring_index = 0; - tx_ring_p = tx_rings[group->legend[ring_index]]; - } - } - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "count %d, tx_rings[%d] = %p", - (int)group->count, group->legend[ring_index], tx_ring_p)); - - switch (nxge_tx_scheme) { - case NXGE_USE_START: - if (nxge_start(nxgep, tx_ring_p, mp)) { - NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_send: failed " - "ring index %d", ring_index)); - return (B_FALSE); - } - break; - - case NXGE_USE_SERIAL: - default: - nxge_serialize_enter(tx_ring_p->serial, mp); - break; - } - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_send: ring index %d", - ring_index)); - - return (B_TRUE); -} - -/* - * nxge_m_tx() - send a chain of packets - */ -mblk_t * -nxge_m_tx(void *arg, mblk_t *mp) -{ - p_nxge_t nxgep = (p_nxge_t)arg; - mblk_t *next; - mac_tx_hint_t hint; - - NXGE_DEBUG_MSG((nxgep, DDI_CTL, "==> nxge_m_tx")); - - if ((!(nxgep->drv_state & STATE_HW_INITIALIZED)) || - (nxgep->nxge_mac_state != NXGE_MAC_STARTED)) { - NXGE_DEBUG_MSG((nxgep, DDI_CTL, - "==> nxge_m_tx: hardware not initialized")); - NXGE_DEBUG_MSG((nxgep, DDI_CTL, - "<== nxge_m_tx")); - freemsgchain(mp); - mp = NULL; - return (mp); - } - - hint.hash = NULL; - hint.vid = 0; - hint.sap = 0; - - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; - - /* - * Until Nemo tx resource works, the mac driver - * does the load balancing based on TCP port, - * or CPU. For debugging, we use a system - * configurable parameter. - */ - if (!nxge_send(nxgep, mp, &hint)) { - mp->b_next = next; - break; - } - - mp = next; - - NXGE_DEBUG_MSG((NULL, TX_CTL, - "==> nxge_m_tx: (go back to loop) mp $%p next $%p", - mp, next)); - } - - NXGE_DEBUG_MSG((nxgep, DDI_CTL, "<== nxge_m_tx")); - return (mp); -} - -int -nxge_tx_lb_ring_1(p_mblk_t mp, uint32_t maxtdcs, p_mac_tx_hint_t hp) -{ - uint8_t ring_index = 0; - uint8_t *tcp_port; - p_mblk_t nmp; - size_t mblk_len; - size_t iph_len; - size_t hdrs_size; - uint8_t hdrs_buf[sizeof (struct ether_vlan_header) + - IP_MAX_HDR_LENGTH + sizeof (uint32_t)]; - /* - * allocate space big enough to cover - * the max ip header length and the first - * 4 bytes of the TCP/IP header. 
- */ - - boolean_t qos = B_FALSE; - ushort_t eth_type; - size_t eth_hdr_size; - - NXGE_DEBUG_MSG((NULL, TX_CTL, "==> nxge_tx_lb_ring")); - - if (hp->vid) { - qos = B_TRUE; - } - switch (nxge_tx_lb_policy) { - case NXGE_TX_LB_TCPUDP: /* default IPv4 TCP/UDP */ - default: - tcp_port = mp->b_rptr; - eth_type = ntohs(((struct ether_header *)tcp_port)->ether_type); - if (eth_type == VLAN_ETHERTYPE) { - eth_type = ntohs(((struct ether_vlan_header *) - tcp_port)->ether_type); - eth_hdr_size = sizeof (struct ether_vlan_header); - } else { - eth_hdr_size = sizeof (struct ether_header); - } - - if (!nxge_no_tx_lb && !qos && eth_type == ETHERTYPE_IP) { - nmp = mp; - mblk_len = MBLKL(nmp); - tcp_port = NULL; - if (mblk_len > eth_hdr_size + sizeof (uint8_t)) { - tcp_port = nmp->b_rptr + eth_hdr_size; - mblk_len -= eth_hdr_size; - iph_len = ((*tcp_port) & 0x0f) << 2; - if (mblk_len > (iph_len + sizeof (uint32_t))) { - tcp_port = nmp->b_rptr; - } else { - tcp_port = NULL; - } - } - if (tcp_port == NULL) { - hdrs_size = 0; - while ((nmp) && (hdrs_size < - sizeof (hdrs_buf))) { - mblk_len = MBLKL(nmp); - if (mblk_len >= - (sizeof (hdrs_buf) - hdrs_size)) - mblk_len = sizeof (hdrs_buf) - - hdrs_size; - bcopy(nmp->b_rptr, - &hdrs_buf[hdrs_size], mblk_len); - hdrs_size += mblk_len; - nmp = nmp->b_cont; - } - tcp_port = hdrs_buf; - } - tcp_port += eth_hdr_size; - if (!(tcp_port[6] & 0x3f) && !(tcp_port[7] & 0xff)) { - switch (tcp_port[9]) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_ESP: - tcp_port += ((*tcp_port) & 0x0f) << 2; - ring_index = - ((tcp_port[0] ^ - tcp_port[1] ^ - tcp_port[2] ^ - tcp_port[3]) % maxtdcs); - break; - - case IPPROTO_AH: - /* SPI starts at the 4th byte */ - tcp_port += ((*tcp_port) & 0x0f) << 2; - ring_index = - ((tcp_port[4] ^ - tcp_port[5] ^ - tcp_port[6] ^ - tcp_port[7]) % maxtdcs); - break; - - default: - ring_index = tcp_port[19] % maxtdcs; - break; - } - } else { /* fragmented packet */ - ring_index = tcp_port[19] % maxtdcs; - } - } else { - ring_index = mp->b_band % maxtdcs; - } - break; - - case NXGE_TX_LB_HASH: - if (hp->hash) { -#if defined(__i386) - ring_index = ((uint32_t)(hp->hash) % maxtdcs); -#else - ring_index = ((uint64_t)(hp->hash) % maxtdcs); -#endif - } else { - ring_index = mp->b_band % maxtdcs; - } - break; - - case NXGE_TX_LB_DEST_MAC: /* Use destination MAC address */ - tcp_port = mp->b_rptr; - ring_index = tcp_port[5] % maxtdcs; - break; - } - - NXGE_DEBUG_MSG((NULL, TX_CTL, "<== nxge_tx_lb_ring")); - - return (ring_index); -} - -uint_t -nxge_reschedule(caddr_t arg) -{ - p_nxge_t nxgep; - - nxgep = (p_nxge_t)arg; - - NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_reschedule")); - - if (nxgep->nxge_mac_state == NXGE_MAC_STARTED && - nxgep->resched_needed) { - if (!isLDOMguest(nxgep)) - mac_tx_update(nxgep->mach); -#if defined(sun4v) - else { /* isLDOMguest(nxgep) */ - nxge_hio_data_t *nhd = (nxge_hio_data_t *) - nxgep->nxge_hw_p->hio; - nx_vio_fp_t *vio = &nhd->hio.vio; - - /* Call back vnet. 
*/ - if (vio->cb.vio_net_tx_update) { - (*vio->cb.vio_net_tx_update) - (nxgep->hio_vr->vhp); - } - } -#endif - nxgep->resched_needed = B_FALSE; - nxgep->resched_running = B_FALSE; - } - - NXGE_DEBUG_MSG((NULL, TX_CTL, "<== nxge_reschedule")); - return (DDI_INTR_CLAIMED); -} - - /* Software LSO starts here */ static void nxge_hcksum_retrieve(mblk_t *mp, diff --git a/usr/src/uts/common/io/nxge/nxge_txdma.c b/usr/src/uts/common/io/nxge/nxge_txdma.c index 892c7bb65a..766e900da7 100644 --- a/usr/src/uts/common/io/nxge/nxge_txdma.c +++ b/usr/src/uts/common/io/nxge/nxge_txdma.c @@ -31,7 +31,7 @@ #include <sys/llc1.h> uint32_t nxge_reclaim_pending = TXDMA_RECLAIM_PENDING_DEFAULT; -uint32_t nxge_tx_minfree = 32; +uint32_t nxge_tx_minfree = 64; uint32_t nxge_tx_intr_thres = 0; uint32_t nxge_tx_max_gathers = TX_MAX_GATHER_POINTERS; uint32_t nxge_tx_tiny_pack = 1; @@ -53,9 +53,7 @@ extern ddi_device_acc_attr_t nxge_dev_buf_dma_acc_attr; extern ddi_dma_attr_t nxge_desc_dma_attr; extern ddi_dma_attr_t nxge_tx_dma_attr; -extern int nxge_serial_tx(mblk_t *mp, void *arg); - -void nxge_txdma_freemsg_task(p_tx_ring_t tx_ring_p); +extern void nxge_tx_ring_task(void *arg); static nxge_status_t nxge_map_txdma(p_nxge_t, int); @@ -97,22 +95,25 @@ nxge_init_txdma_channels(p_nxge_t nxgep) nxge_grp_set_t *set = &nxgep->tx_set; int i, tdc, count; nxge_grp_t *group; + dc_map_t map; + int dev_gindex; NXGE_DEBUG_MSG((nxgep, MEM2_CTL, "==> nxge_init_txdma_channels")); for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) { if ((1 << i) & set->lg.map) { group = set->group[i]; - + dev_gindex = + nxgep->pt_config.hw_config.def_mac_txdma_grpid + i; + map = nxgep->pt_config.tdc_grps[dev_gindex].map; for (tdc = 0; tdc < NXGE_MAX_TDCS; tdc++) { - if ((1 << tdc) & group->map) { - if ((nxge_grp_dc_add(nxgep, group, - VP_BOUND_TX, tdc))) + if ((1 << tdc) & map) { + if ((nxge_grp_dc_add(nxgep, + group, VP_BOUND_TX, tdc))) goto init_txdma_channels_exit; } } } - if (++count == set->lg.count) break; } @@ -124,21 +125,22 @@ init_txdma_channels_exit: for (i = 0, count = 0; i < NXGE_LOGICAL_GROUP_MAX; i++) { if ((1 << i) & set->lg.map) { group = set->group[i]; - + dev_gindex = + nxgep->pt_config.hw_config.def_mac_txdma_grpid + i; + map = nxgep->pt_config.tdc_grps[dev_gindex].map; for (tdc = 0; tdc < NXGE_MAX_TDCS; tdc++) { - if ((1 << tdc) & group->map) { + if ((1 << tdc) & map) { nxge_grp_dc_remove(nxgep, VP_BOUND_TX, tdc); } } } - if (++count == set->lg.count) break; } - NXGE_DEBUG_MSG((nxgep, MEM2_CTL, "<== nxge_init_txdma_channels")); return (NXGE_ERROR); + } nxge_status_t @@ -890,44 +892,6 @@ nxge_tx_pkt_nmblocks(p_mblk_t mp, int *tot_xfer_len_p) return (nmblks); } -static void -nxge_txdma_freemsg_list_add(p_tx_ring_t tx_ring_p, p_tx_msg_t msgp) -{ - MUTEX_ENTER(&tx_ring_p->freelock); - if (tx_ring_p->tx_free_list_p != NULL) - msgp->nextp = tx_ring_p->tx_free_list_p; - tx_ring_p->tx_free_list_p = msgp; - MUTEX_EXIT(&tx_ring_p->freelock); -} - -/* - * void - * nxge_txdma_freemsg_task() -- walk the list of messages to be - * freed and free the messages. 
- */ -void -nxge_txdma_freemsg_task(p_tx_ring_t tx_ring_p) -{ - p_tx_msg_t msgp, nextp; - - if (tx_ring_p->tx_free_list_p != NULL) { - MUTEX_ENTER(&tx_ring_p->freelock); - msgp = tx_ring_p->tx_free_list_p; - tx_ring_p->tx_free_list_p = (p_tx_msg_t)NULL; - MUTEX_EXIT(&tx_ring_p->freelock); - - while (msgp != NULL) { - nextp = msgp->nextp; - if (msgp->tx_message != NULL) { - freemsg(msgp->tx_message); - msgp->tx_message = NULL; - } - msgp->nextp = NULL; - msgp = nextp; - } - } -} - boolean_t nxge_txdma_reclaim(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, int nmblks) { @@ -947,7 +911,7 @@ nxge_txdma_reclaim(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, int nmblks) uint16_t head_index, tail_index; uint8_t tdc; boolean_t head_wrap, tail_wrap; - p_nxge_tx_ring_stats_t tdc_stats; + p_nxge_tx_ring_stats_t tdc_stats; int rc; NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_txdma_reclaim")); @@ -1093,13 +1057,12 @@ nxge_txdma_reclaim(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, int nmblks) } NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_txdma_reclaim: count packets")); - /* * count a chained packet only once. */ if (tx_msg_p->tx_message != NULL) { - nxge_txdma_freemsg_list_add(tx_ring_p, - tx_msg_p); + freemsg(tx_msg_p->tx_message); + tx_msg_p->tx_message = NULL; } tx_msg_p->flags.dma_type = USE_NONE; @@ -1223,13 +1186,7 @@ nxge_tx_intr(void *arg1, void *arg2) "status 0x%08x (mk bit set, calling reclaim)", channel, vindex, rs)); - MUTEX_ENTER(&tx_ring_p->lock); - (void) nxge_txdma_reclaim(nxgep, tx_rings[vindex], 0); - MUTEX_EXIT(&tx_ring_p->lock); - - nxge_txdma_freemsg_task(tx_ring_p); - - mac_tx_update(nxgep->mach); + nxge_tx_ring_task((void *)tx_ring_p); } /* @@ -1596,7 +1553,6 @@ nxge_txdma_fixup_channel(p_nxge_t nxgep, p_tx_ring_t ring_p, uint16_t channel) ring_p->ring_kick_tail.value = 0; ring_p->descs_pending = 0; MUTEX_EXIT(&ring_p->lock); - nxge_txdma_freemsg_task(ring_p); NXGE_DEBUG_MSG((nxgep, TX_CTL, "<== nxge_txdma_fixup_channel")); } @@ -1831,7 +1787,6 @@ nxge_txdma_channel_hung(p_nxge_t nxgep, p_tx_ring_t tx_ring_p, uint16_t channel) tail_wrap = tx_ring_p->wr_index_wrap; tx_rd_index = tx_ring_p->rd_index; MUTEX_EXIT(&tx_ring_p->lock); - nxge_txdma_freemsg_task(tx_ring_p); NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_txdma_channel_hung: tdc %d tx_rd_index %d " @@ -2010,8 +1965,6 @@ nxge_txdma_fixup_hung_channel(p_nxge_t nxgep, p_tx_ring_t ring_p, (void) nxge_txdma_reclaim(nxgep, ring_p, 0); MUTEX_EXIT(&ring_p->lock); - nxge_txdma_freemsg_task(ring_p); - handle = NXGE_DEV_NPI_HANDLE(nxgep); /* * Stop the dma channel waits for the stop done. 
@@ -2072,10 +2025,8 @@ nxge_reclaim_rings(p_nxge_t nxgep) NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_reclaim_rings: TDC %d", tdc)); MUTEX_ENTER(&ring->lock); - (void) nxge_txdma_reclaim(nxgep, ring, tdc); + (void) nxge_txdma_reclaim(nxgep, ring, 0); MUTEX_EXIT(&ring->lock); - - nxge_txdma_freemsg_task(ring); } } } @@ -2580,6 +2531,7 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel, int i, j, index; uint32_t size, bsize; uint32_t nblocks, nmsgs; + char qname[TASKQ_NAMELEN]; NXGE_DEBUG_MSG((nxgep, MEM3_CTL, "==> nxge_map_txdma_channel_buf_ring")); @@ -2611,14 +2563,19 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel, KMEM_ZALLOC(sizeof (tx_ring_t), KM_SLEEP); MUTEX_INIT(&tx_ring_p->lock, NULL, MUTEX_DRIVER, (void *)nxgep->interrupt_cookie); - MUTEX_INIT(&tx_ring_p->freelock, NULL, MUTEX_DRIVER, - (void *)nxgep->interrupt_cookie); (void) atomic_swap_32(&tx_ring_p->tx_ring_offline, NXGE_TX_RING_ONLINE); tx_ring_p->tx_ring_busy = B_FALSE; tx_ring_p->nxgep = nxgep; - tx_ring_p->serial = nxge_serialize_create(nmsgs, - nxge_serial_tx, tx_ring_p); + tx_ring_p->tx_ring_handle = (mac_ring_handle_t)NULL; + (void) snprintf(qname, TASKQ_NAMELEN, "tx_%d_%d", + nxgep->instance, channel); + tx_ring_p->taskq = ddi_taskq_create(nxgep->dip, qname, 1, + TASKQ_DEFAULTPRI, 0); + if (tx_ring_p->taskq == NULL) { + goto nxge_map_txdma_channel_buf_ring_fail1; + } + /* * Allocate transmit message rings and handles for packets * not to be copied to premapped buffers. @@ -2683,7 +2640,6 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel, for (j = 0; j < nblocks; j++) { tx_msg_ring[index].buf_dma_handle = tx_buf_dma_handle; - tx_msg_ring[index].nextp = NULL; dmap = &tx_msg_ring[index++].buf_dma; #ifdef TX_MEM_DEBUG NXGE_DEBUG_MSG((nxgep, MEM3_CTL, @@ -2705,9 +2661,9 @@ nxge_map_txdma_channel_buf_ring(p_nxge_t nxgep, uint16_t channel, goto nxge_map_txdma_channel_buf_ring_exit; nxge_map_txdma_channel_buf_ring_fail1: - if (tx_ring_p->serial) { - nxge_serialize_destroy(tx_ring_p->serial); - tx_ring_p->serial = NULL; + if (tx_ring_p->taskq) { + ddi_taskq_destroy(tx_ring_p->taskq); + tx_ring_p->taskq = NULL; } index--; @@ -2716,8 +2672,6 @@ nxge_map_txdma_channel_buf_ring_fail1: ddi_dma_free_handle(&tx_msg_ring[index].dma_handle); } } - - MUTEX_DESTROY(&tx_ring_p->freelock); MUTEX_DESTROY(&tx_ring_p->lock); KMEM_FREE(tx_msg_ring, size); KMEM_FREE(tx_ring_p, sizeof (tx_ring_t)); @@ -2783,12 +2737,11 @@ nxge_unmap_txdma_channel_buf_ring(p_nxge_t nxgep, p_tx_ring_t tx_ring_p) MUTEX_EXIT(&tx_ring_p->lock); - if (tx_ring_p->serial) { - nxge_serialize_destroy(tx_ring_p->serial); - tx_ring_p->serial = NULL; + if (tx_ring_p->taskq) { + ddi_taskq_destroy(tx_ring_p->taskq); + tx_ring_p->taskq = NULL; } - MUTEX_DESTROY(&tx_ring_p->freelock); MUTEX_DESTROY(&tx_ring_p->lock); KMEM_FREE(tx_msg_ring, sizeof (tx_msg_t) * tx_ring_p->tx_ring_size); KMEM_FREE(tx_ring_p, sizeof (tx_ring_t)); @@ -3408,8 +3361,6 @@ nxge_txdma_fatal_err_recover( if (status != NXGE_OK) goto fail; - nxge_txdma_freemsg_task(tx_ring_p); - NXGE_ERROR_MSG((nxgep, NXGE_ERR_CTL, "Recovery Successful, TxDMAChannel#%d Restored", channel)); @@ -3420,8 +3371,6 @@ nxge_txdma_fatal_err_recover( fail: MUTEX_EXIT(&tx_ring_p->lock); - nxge_txdma_freemsg_task(tx_ring_p); - NXGE_DEBUG_MSG((nxgep, TX_CTL, "nxge_txdma_fatal_err_recover (channel %d): " "failed to recover this txdma channel", channel)); @@ -3519,7 +3468,6 @@ nxge_tx_port_fatal_err_recover(p_nxge_t nxgep) tx_ring_t *ring = nxgep->tx_rings->rings[tdc]; if (ring) { (void) 
nxge_txdma_reclaim(nxgep, ring, 0); - nxge_txdma_freemsg_task(ring); } } } diff --git a/usr/src/uts/common/io/nxge/nxge_virtual.c b/usr/src/uts/common/io/nxge/nxge_virtual.c index 818f8451c2..2498f77e90 100644 --- a/usr/src/uts/common/io/nxge/nxge_virtual.c +++ b/usr/src/uts/common/io/nxge/nxge_virtual.c @@ -77,6 +77,12 @@ extern uint32_t nxge_rbr_spare_size; extern npi_status_t npi_mac_altaddr_disable(npi_handle_t, uint8_t, uint8_t); +/* + * XXX: Used temporarily to specify the number of packets processed by each + * interrupt. By default, one packet is processed per interrupt. + */ +int nxge_max_intr_pkts; + static uint8_t p2_tx_fair[2] = {12, 12}; static uint8_t p2_tx_equal[2] = {12, 12}; static uint8_t p4_tx_fair[4] = {6, 6, 6, 6}; @@ -783,7 +789,7 @@ nxge_update_txdma_properties(p_nxge_t nxgep, config_token_t token, int ddi_status = DDI_SUCCESS; int num_ports = nxgep->nports; int port, bits, j; - uint8_t start_tdc = 0, num_tdc = 0; + uint8_t start_tdc, num_tdc = 0; p_nxge_param_t param_arr; uint32_t tdc_bitmap[MAX_SIBLINGS]; int custom_start_tdc[MAX_SIBLINGS]; @@ -1616,6 +1622,14 @@ nxge_get_config_properties(p_nxge_t nxgep) } /* + * XXX: read the config file to determine the number of packets + * to process per interrupt. + */ + nxge_max_intr_pkts = ddi_getprop(DDI_DEV_T_ANY, nxgep->dip, + DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "max_intr_pkts", 1); + + + /* * Get info on how many ports Neptune card has. */ nxgep->nports = nxge_get_nports(nxgep); @@ -1806,12 +1820,12 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep) return (NXGE_DDI_FAILED); } - p_cfgp->tdc.count = nxgep->max_tdcs = ndmas; + p_cfgp->tdc.count = ndmas; p_cfgp->tdc.owned = p_cfgp->tdc.count; NXGE_DEBUG_MSG((nxgep, OBP_CTL, "==> nxge_use_default_dma_config_n2: " - "p_cfgp 0x%llx max_tdcs %d nxgep->max_tdcs %d start %d", - p_cfgp, p_cfgp->tdc.count, nxgep->max_tdcs, p_cfgp->tdc.start)); + "p_cfgp 0x%llx max_tdcs %d start %d", + p_cfgp, p_cfgp->tdc.count, p_cfgp->tdc.start)); /* Receive DMA */ ndmas = NXGE_RDMA_PER_NIU_PORT; @@ -1834,12 +1848,11 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep) return (NXGE_DDI_FAILED); } - p_cfgp->max_rdcs = nxgep->max_rdcs = ndmas; + p_cfgp->max_rdcs = ndmas; nxgep->rdc_mask = (ndmas - 1); /* Hypervisor: rdc # and group # use the same # !! */ p_cfgp->max_grpids = p_cfgp->max_rdcs + p_cfgp->tdc.owned; - p_cfgp->start_grpid = 0; p_cfgp->mif_ldvid = p_cfgp->mac_ldvid = p_cfgp->ser_ldvid = 0; if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, nxgep->dip, 0, @@ -1909,13 +1922,12 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep) p_cfgp->max_ldgs = p_cfgp->max_grpids; NXGE_DEBUG_MSG((nxgep, OBP_CTL, - "==> nxge_use_default_dma_config_n2: " - "p_cfgp 0x%llx max_rdcs %d nxgep->max_rdcs %d max_grpids %d" - "start_grpid %d macid %d mifid %d serrid %d", - p_cfgp, p_cfgp->max_rdcs, nxgep->max_rdcs, p_cfgp->max_grpids, - p_cfgp->start_grpid, + "==> nxge_use_default_dma_config_n2: p_cfgp 0x%llx max_rdcs %d " + "max_grpids %d macid %d mifid %d serrid %d", + p_cfgp, p_cfgp->max_rdcs, p_cfgp->max_grpids, p_cfgp->mac_ldvid, p_cfgp->mif_ldvid, p_cfgp->ser_ldvid)); + NXGE_DEBUG_MSG((nxgep, OBP_CTL, "==> nxge_use_default_dma_config_n2: " "p_cfgp p%p start_ldg %d nxgep->max_ldgs %d", p_cfgp, p_cfgp->start_ldg, p_cfgp->max_ldgs)); @@ -1923,12 +1935,14 @@ nxge_use_default_dma_config_n2(p_nxge_t nxgep) /* * RDC groups and the beginning RDC group assigned to this function.
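 * As a worked example (illustrative numbers only): if NXGE_MAX_RDC_GROUPS
 * were 8 on a 4-port part, each function would own 8 / 4 = 2 RDC groups,
 * and function 2's first group id would be 2 * 8 / 4 = 4. The TDC group
 * ids are derived the same way from NXGE_MAX_TDC_GROUPS.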
*/ - p_cfgp->max_rdc_grpids = 1; - p_cfgp->def_mac_rxdma_grpid = (nxgep->function_num * 1); - - if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind - (nxgep, p_cfgp->def_mac_rxdma_grpid, B_TRUE)) - >= NXGE_MAX_RDC_GRPS) { + p_cfgp->max_rdc_grpids = NXGE_MAX_RDC_GROUPS / nxgep->nports; + p_cfgp->def_mac_rxdma_grpid = + nxgep->function_num * NXGE_MAX_RDC_GROUPS / nxgep->nports; + p_cfgp->def_mac_txdma_grpid = + nxgep->function_num * NXGE_MAX_TDC_GROUPS / nxgep->nports; + + if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind(nxgep, + p_cfgp->def_mac_rxdma_grpid, B_TRUE)) >= NXGE_MAX_RDC_GRPS) { NXGE_ERROR_MSG((nxgep, CFG_CTL, "nxge_use_default_dma_config_n2(): " "nxge_fzc_rdc_tbl_bind failed")); @@ -2060,11 +2074,10 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) prop, tx_ndmas); } - p_cfgp->tdc.count = nxgep->max_tdcs = tx_ndmas; + p_cfgp->tdc.count = tx_ndmas; p_cfgp->tdc.owned = p_cfgp->tdc.count; NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_use_cfg_dma_config: " - "p_cfgp 0x%llx max_tdcs %d nxgep->max_tdcs %d", - p_cfgp, p_cfgp->tdc.count, nxgep->max_tdcs)); + "p_cfgp 0x%llx max_tdcs %d", p_cfgp, p_cfgp->tdc.count)); prop = param_arr[param_rxdma_channels_begin].fcode_name; @@ -2149,44 +2162,23 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) prop, rx_ndmas); } - p_cfgp->max_rdcs = nxgep->max_rdcs = rx_ndmas; + p_cfgp->max_rdcs = rx_ndmas; - prop = param_arr[param_rdc_grps_start].fcode_name; - if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, 0, prop, - &prop_val, &prop_len) == DDI_PROP_SUCCESS) { - p_cfgp->def_mac_rxdma_grpid = *prop_val; - ddi_prop_free(prop_val); - if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind - (nxgep, p_cfgp->def_mac_rxdma_grpid, B_TRUE)) - >= NXGE_MAX_RDC_GRPS) { - NXGE_ERROR_MSG((nxgep, CFG_CTL, - "nxge_use_cfg_dma_config(): " - "nxge_fzc_rdc_tbl_bind failed")); - cmn_err(CE_CONT, "nxge%d: group not available!\n", - nxgep->instance); - goto nxge_use_cfg_dma_config_exit; - } + /* + * RDC groups and the beginning RDC group assigned to this function. + * XXX: this may be wrong if prop value is used. 
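+ * (I.e., if the group start were still taken from the
+ * param_rdc_grps_start driver property, as the code removed just above
+ * used to do, the default computed from function_num here could
+ * disagree with the property-derived value.)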
+ */ + p_cfgp->def_mac_rxdma_grpid = + nxgep->function_num * NXGE_MAX_RDC_GROUPS / nxgep->nports; + p_cfgp->def_mac_txdma_grpid = + nxgep->function_num * NXGE_MAX_TDC_GROUPS / nxgep->nports; - NXGE_DEBUG_MSG((nxgep, CFG_CTL, - "==> nxge_use_default_dma_config: " - "use property " "start_grpid %d ", - p_cfgp->start_grpid)); - } else { - p_cfgp->def_mac_rxdma_grpid = nxgep->function_num; - if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind( - nxgep, p_cfgp->def_mac_rxdma_grpid, B_TRUE)) >= - NXGE_MAX_RDC_GRPS) { - cmn_err(CE_CONT, "nxge%d: group not available!\n", - nxgep->instance); - goto nxge_use_cfg_dma_config_exit; - } - (void) ddi_prop_update_int(DDI_DEV_T_NONE, nxgep->dip, - prop, p_cfgp->def_mac_rxdma_grpid); - NXGE_DEBUG_MSG((nxgep, CFG_CTL, - "==> nxge_use_default_dma_config: " - "use default " - "start_grpid %d (same as function #)", - p_cfgp->start_grpid)); + if ((p_cfgp->def_mac_rxdma_grpid = nxge_fzc_rdc_tbl_bind(nxgep, + p_cfgp->def_mac_rxdma_grpid, B_TRUE)) >= NXGE_MAX_RDC_GRPS) { + NXGE_ERROR_MSG((nxgep, CFG_CTL, + "nxge_use_cfg_dma_config(): " + "nxge_fzc_rdc_tbl_bind failed")); + goto nxge_use_cfg_dma_config_exit; } prop = param_arr[param_rx_rdc_grps].fcode_name; @@ -2195,7 +2187,7 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) nrxgp = *prop_val; ddi_prop_free(prop_val); } else { - nrxgp = 1; + nrxgp = NXGE_MAX_RDC_GRPS / nxgep->nports; (void) ddi_prop_update_int(DDI_DEV_T_NONE, nxgep->dip, prop, nrxgp); NXGE_DEBUG_MSG((nxgep, CFG_CTL, @@ -2203,7 +2195,6 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) "num_rdc_grpid not found: use def:# of " "rdc groups %d\n", nrxgp)); } - p_cfgp->max_rdc_grpids = nrxgp; /* @@ -2213,10 +2204,9 @@ nxge_use_cfg_dma_config(p_nxge_t nxgep) p_cfgp->max_ldgs = NXGE_LDGRP_PER_4PORTS; NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_use_default_dma_config: " - "p_cfgp 0x%llx max_rdcs %d nxgep->max_rdcs %d max_grpids %d" - "start_grpid %d", - p_cfgp, p_cfgp->max_rdcs, nxgep->max_rdcs, p_cfgp->max_grpids, - p_cfgp->start_grpid)); + "p_cfgp 0x%llx max_rdcs %d max_grpids %d default_grpid %d", + p_cfgp, p_cfgp->max_rdcs, p_cfgp->max_grpids, + p_cfgp->def_mac_rxdma_grpid)); NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_use_cfg_dma_config: " "p_cfgp 0x%016llx start_ldg %d nxgep->max_ldgs %d " @@ -2264,7 +2254,7 @@ nxge_get_logical_props(p_nxge_t nxgep) (void) memset(port, 0, sizeof (*port)); - port->mac_port = 0; /* := function number */ + port->mac_port = nxgep->function_num; /* := function number */ /* * alloc_buf_size: @@ -2300,8 +2290,9 @@ nxge_get_logical_props(p_nxge_t nxgep) group = &port->rdc_grps[0]; - group->flag = 1; /* configured */ + group->flag = B_TRUE; /* configured */ group->config_method = RDC_TABLE_ENTRY_METHOD_REP; + group->port = NXGE_GET_PORT_NUM(nxgep->function_num); /* HIO futures: this is still an open question.
*/ hardware->max_macs = 1; @@ -2407,129 +2398,138 @@ nxge_set_rdc_intr_property(p_nxge_t nxgep) static void nxge_set_hw_dma_config(p_nxge_t nxgep) { - int i, ndmas, ngrps, bitmap, end, st_rdc; - int32_t status; - uint8_t rdcs_per_grp; - p_nxge_dma_pt_cfg_t p_dma_cfgp; - p_nxge_hw_pt_cfg_t p_cfgp; - p_nxge_rdc_grp_t rdc_grp_p; - int rdcgrp_cfg = CFG_NOT_SPECIFIED, rx_quick_cfg; - char *prop, *prop_val; - p_nxge_param_t param_arr; - config_token_t token; - nxge_grp_t *group; + int i, j, ngrps, bitmap, end, st_rdc; + p_nxge_dma_pt_cfg_t p_dma_cfgp; + p_nxge_hw_pt_cfg_t p_cfgp; + p_nxge_rdc_grp_t rdc_grp_p; + p_nxge_tdc_grp_t tdc_grp_p; + nxge_grp_t *group; + uint8_t nrdcs; + dc_map_t map = 0; NXGE_DEBUG_MSG((nxgep, CFG_CTL, "==> nxge_set_hw_dma_config")); p_dma_cfgp = (p_nxge_dma_pt_cfg_t)&nxgep->pt_config; p_cfgp = (p_nxge_hw_pt_cfg_t)&p_dma_cfgp->hw_config; - rdc_grp_p = p_dma_cfgp->rdc_grps; + switch (nxgep->niu_type) { + case NEPTUNE_4_1GC: + case NEPTUNE_2_10GF_2_1GC: + case NEPTUNE_1_10GF_3_1GC: + case NEPTUNE_1_1GC_1_10GF_2_1GC: + case NEPTUNE_2_10GF_2_1GRF: + default: + ngrps = 2; + break; + case NEPTUNE_2_10GF: + case NEPTUNE_2_1GRF: + case N2_NIU: + ngrps = 4; + break; + } + + /* + * Setup TDC groups + */ bitmap = 0; end = p_cfgp->tdc.start + p_cfgp->tdc.owned; - p_dma_cfgp->tx_dma_map = 0; for (i = p_cfgp->tdc.start; i < end; i++) { bitmap |= (1 << i); } nxgep->tx_set.owned.map |= bitmap; /* Owned, & not shared. */ + nxgep->tx_set.owned.count = p_cfgp->tdc.owned; + p_dma_cfgp->tx_dma_map = bitmap; - group = (nxge_grp_t *)nxge_grp_add(nxgep, NXGE_TRANSMIT_GROUP); - group->map = bitmap; + for (i = 0; i < ngrps; i++) { + group = (nxge_grp_t *)nxge_grp_add(nxgep, + NXGE_TRANSMIT_GROUP); + tdc_grp_p = &p_dma_cfgp->tdc_grps[ + p_cfgp->def_mac_txdma_grpid + i]; + if (i == 0) + tdc_grp_p->map = bitmap; + else + tdc_grp_p->map = 0; + /* no ring is associated with a group initially */ + tdc_grp_p->start_tdc = 0; + tdc_grp_p->max_tdcs = 0; + tdc_grp_p->grp_index = group->index; + } - p_dma_cfgp->tx_dma_map = bitmap; - param_arr = nxgep->param_arr; + for (i = 0; i < NXGE_MAX_RDCS; i++) { + nxgep->rx_channel_started[i] = B_FALSE; + } - /* Assume RDCs are evenly distributed */ - rx_quick_cfg = param_arr[param_rx_quick_cfg].value; - switch (rx_quick_cfg) { - case CFG_NOT_SPECIFIED: - prop = "rxdma-grp-cfg"; - status = ddi_prop_lookup_string(DDI_DEV_T_NONE, - nxgep->dip, 0, prop, (char **)&prop_val); - if (status != DDI_PROP_SUCCESS) { - NXGE_DEBUG_MSG((nxgep, CFG_CTL, - " property %s not found", prop)); - rdcgrp_cfg = CFG_L3_DISTRIBUTE; - } else { - token = nxge_get_config_token(prop_val); - switch (token) { - case L2_CLASSIFY: + /* + * Setup RDC groups + */ + st_rdc = p_cfgp->start_rdc; + for (i = 0; i < ngrps; i++) { + /* + * All rings are associated with the default group initially + */ + if (i == 0) { + /* default group */ + switch (nxgep->niu_type) { + case NEPTUNE_4_1GC: + nrdcs = rx_4_1G[nxgep->function_num]; + break; + case N2_NIU: + case NEPTUNE_2_10GF: + nrdcs = rx_2_10G[nxgep->function_num]; + break; + case NEPTUNE_2_10GF_2_1GC: + nrdcs = rx_2_10G_2_1G[nxgep->function_num]; break; - case CLASSIFY: - case L3_CLASSIFY: - case L3_DISTRIBUTE: - case L3_TCAM: - rdcgrp_cfg = CFG_L3_DISTRIBUTE; + case NEPTUNE_1_10GF_3_1GC: + nrdcs = rx_1_10G_3_1G[nxgep->function_num]; + break; + case NEPTUNE_1_1GC_1_10GF_2_1GC: + nrdcs = rx_1_1G_1_10G_2_1G[nxgep->function_num]; break; default: - rdcgrp_cfg = CFG_L3_DISTRIBUTE; + switch (nxgep->platform_type) { + case P_NEPTUNE_ALONSO: + nrdcs = + 
rx_2_10G_2_1G[nxgep->function_num]; + break; + default: + nrdcs = rx_4_1G[nxgep->function_num]; + break; + } break; } - ddi_prop_free(prop_val); + } else { + nrdcs = 0; } - break; - case CFG_L3_WEB: - case CFG_L3_DISTRIBUTE: - case CFG_L2_CLASSIFY: - case CFG_L3_TCAM: - rdcgrp_cfg = rx_quick_cfg; - break; - default: - rdcgrp_cfg = CFG_L3_DISTRIBUTE; - break; - } - - st_rdc = p_cfgp->start_rdc; - - switch (rdcgrp_cfg) { - case CFG_L3_DISTRIBUTE: - case CFG_L3_WEB: - case CFG_L3_TCAM: - ndmas = p_cfgp->max_rdcs; - ngrps = 1; - rdcs_per_grp = ndmas / ngrps; - break; - case CFG_L2_CLASSIFY: - ndmas = p_cfgp->max_rdcs / 2; - if (p_cfgp->max_rdcs < 2) - ndmas = 1; - ngrps = 1; - rdcs_per_grp = ndmas / ngrps; - break; - default: - ngrps = p_cfgp->max_rdc_grpids; - ndmas = p_cfgp->max_rdcs; - rdcs_per_grp = ndmas / ngrps; - break; - } - - for (i = 0; i < ngrps; i++) { - uint8_t count = rdcs_per_grp; - dc_map_t map = 0; rdc_grp_p = &p_dma_cfgp->rdc_grps[ p_cfgp->def_mac_rxdma_grpid + i]; - rdc_grp_p->start_rdc = st_rdc + i * rdcs_per_grp; - rdc_grp_p->max_rdcs = rdcs_per_grp; + rdc_grp_p->start_rdc = st_rdc; + rdc_grp_p->max_rdcs = nrdcs; rdc_grp_p->def_rdc = rdc_grp_p->start_rdc; /* default to: 0, 1, 2, 3, ...., 0, 1, 2, 3.... */ - while (count) { - map |= (1 << count); - count--; - } - map >>= 1; /* In case <start_rdc> is zero (0) */ - map <<= rdc_grp_p->start_rdc; + if (nrdcs != 0) { + for (j = 0; j < nrdcs; j++) { + map |= (1 << j); + } + map <<= rdc_grp_p->start_rdc; + } else + map = 0; rdc_grp_p->map = map; nxgep->rx_set.owned.map |= map; /* Owned, & not shared. */ + nxgep->rx_set.owned.count = nrdcs; group = (nxge_grp_t *)nxge_grp_add(nxgep, NXGE_RECEIVE_GROUP); - group->map = rdc_grp_p->map; rdc_grp_p->config_method = RDC_TABLE_ENTRY_METHOD_SEQ; - rdc_grp_p->flag = 1; /* This group has been configured. */ + rdc_grp_p->flag = B_TRUE; /* This group has been configured. */ + rdc_grp_p->grp_index = group->index; + rdc_grp_p->port = NXGE_GET_PORT_NUM(nxgep->function_num); + + map = 0; } @@ -2742,7 +2742,7 @@ nxge_set_hw_mac_class_config(p_nxge_t nxgep) " id %d grp %d", mac_map->param_id, mac_map->map_to)); mac_host_info[mac_map->param_id].mpr_npr = - mac_map->pref; + p_cfgp->mac_pref; mac_host_info[mac_map->param_id].rdctbl = mac_map->map_to + p_cfgp->def_mac_rxdma_grpid; @@ -2967,16 +2967,12 @@ nxge_ldgv_init_n2(p_nxge_t nxgep, int *navail_p, int *nrequired_p) } /* - * Port0 uses the HW based syserr interrupt, and port1 uses the - * SW based syserr interrupt. There is only one syserr and the - * function zero device gets it. + * HW based syserr interrupt for port0, and SW based syserr interrupt + * for port1 */ if (own_sys_err && p_cfgp->ser_ldvid) { ldv = p_cfgp->ser_ldvid; /* - * Port0 - HW based: use an intr vector - */ - /* * Unmask the system interrupt states. 
*/ (void) nxge_fzc_sys_err_mask_set(nxgep, SYS_ERR_SMX_MASK | @@ -2999,8 +2995,8 @@ nxge_ldgv_init_n2(p_nxge_t nxgep, int *navail_p, int *nrequired_p) nldvs++; } else { /* - * Port1 - SW based: allocate the ldv for the syserr since - * the vector should not be consumed for port1 + * SW based: allocate the ldv for the syserr since the vector + * should not be consumed for port1 */ sysldvp = KMEM_ZALLOC(sizeof (nxge_ldv_t), KM_SLEEP); sysldvp->use_timer = B_TRUE; @@ -3010,9 +3006,10 @@ nxge_ldgv_init_n2(p_nxge_t nxgep, int *navail_p, int *nrequired_p) sysldvp->ldv_ldf_masks = 0; sysldvp->nxgep = nxgep; ldgvp->ldvp_syserr = sysldvp; - ldgvp->ldvp_syserr_allocated = B_TRUE; + ldgvp->ldvp_syserr_alloced = B_TRUE; } + NXGE_DEBUG_MSG((nxgep, INT_CTL, "==> nxge_ldgv_init_n2: " "(before rx) func %d nldvs %d navail %d nrequired %d", func, nldvs, *navail_p, *nrequired_p)); @@ -3326,7 +3323,7 @@ nxge_ldgv_uninit(p_nxge_t nxgep) "no logical group configured.")); return (NXGE_OK); } - if (ldgvp->ldvp_syserr_allocated == B_TRUE) { + if (ldgvp->ldvp_syserr_alloced == B_TRUE) { KMEM_FREE(ldgvp->ldvp_syserr, sizeof (nxge_ldv_t)); } if (ldgvp->ldgp) { @@ -3925,3 +3922,29 @@ nxge_init_mmac(p_nxge_t nxgep, boolean_t compute_addrs) nxgep->statsp->mmac_stats.mmac_max_cnt = mmac_info->num_mmac; nxgep->statsp->mmac_stats.mmac_avail_cnt = mmac_info->num_mmac; } + +/* + * Convert an RDC group index into a port ring index. That is, map + * <groupid> to an index into nxgep->rx_ring_handles. + * (group ring index -> port ring index) + */ +int +nxge_get_rxring_index(p_nxge_t nxgep, int groupid, int ringidx) +{ + int i; + int index = 0; + p_nxge_rdc_grp_t rdc_grp_p; + p_nxge_dma_pt_cfg_t p_dma_cfgp; + p_nxge_hw_pt_cfg_t p_cfgp; + + p_dma_cfgp = &nxgep->pt_config; + p_cfgp = &p_dma_cfgp->hw_config; + + for (i = 0; i < groupid; i++) { + rdc_grp_p = + &p_dma_cfgp->rdc_grps[p_cfgp->def_mac_rxdma_grpid + i]; + index += rdc_grp_p->max_rdcs; + } + + return (index + ringidx); +} diff --git a/usr/src/uts/common/io/pcan/pcan.c b/usr/src/uts/common/io/pcan/pcan.c index b5b0604831..498a9eea60 100644 --- a/usr/src/uts/common/io/pcan/pcan.c +++ b/usr/src/uts/common/io/pcan/pcan.c @@ -46,7 +46,7 @@ #include <sys/pccard.h> #include <sys/pci.h> #include <sys/policy.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/stream.h> #include <inet/common.h> #include <inet/nd.h> @@ -104,7 +104,6 @@ mac_callbacks_t pcan_m_callbacks = { pcan_sdmulti, pcan_saddr, pcan_tx, - NULL, pcan_ioctl }; diff --git a/usr/src/uts/common/io/pcwl/pcwl.c b/usr/src/uts/common/io/pcwl/pcwl.c index f8d0cd2c4b..a2bad90c68 100644 --- a/usr/src/uts/common/io/pcwl/pcwl.c +++ b/usr/src/uts/common/io/pcwl/pcwl.c @@ -46,7 +46,7 @@ #include <sys/pccard.h> #include <sys/pci.h> #include <sys/policy.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/stream.h> #include <inet/common.h> #include <inet/nd.h> @@ -89,7 +89,6 @@ mac_callbacks_t pcwl_m_callbacks = { pcwl_sdmulti, pcwl_saddr, pcwl_tx, - NULL, pcwl_ioctl }; diff --git a/usr/src/uts/common/io/ral/rt2560.c b/usr/src/uts/common/io/ral/rt2560.c index d1473e1972..e6feee3ff4 100644 --- a/usr/src/uts/common/io/ral/rt2560.c +++ b/usr/src/uts/common/io/ral/rt2560.c @@ -43,7 +43,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -196,7 +196,6 @@ static mac_callbacks_t rt2560_m_callbacks = { rt2560_m_multicst, rt2560_m_unicst, 
rt2560_m_tx, - NULL, /* mc_resources; */ rt2560_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/rge/rge.h b/usr/src/uts/common/io/rge/rge.h index 4cab63b289..4a58da1c92 100755..100644 --- a/usr/src/uts/common/io/rge/rge.h +++ b/usr/src/uts/common/io/rge/rge.h @@ -26,8 +26,6 @@ #ifndef _RGE_H #define _RGE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -59,7 +57,7 @@ extern "C" { #include <sys/ddi.h> #include <sys/sunddi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> /* @@ -430,7 +428,6 @@ typedef struct rge { uint32_t rf_next; /* current free buf index */ uint32_t rc_next; /* current recycle buf index */ uint32_t rx_free; /* number of rx free buf */ - mac_resource_handle_t handle; /* used for send */ rge_bd_t *tx_ring; @@ -705,7 +702,7 @@ void rge_chip_init(rge_t *rgep); void rge_chip_start(rge_t *rgep); void rge_chip_stop(rge_t *rgep, boolean_t fault); void rge_chip_sync(rge_t *rgep, enum rge_sync_op todo); -void rge_chip_blank(void *arg, time_t ticks, uint_t count); +void rge_chip_blank(void *arg, time_t ticks, uint_t count, int flag); void rge_tx_trigger(rge_t *rgep); void rge_hw_stats_dump(rge_t *rgep); uint_t rge_intr(caddr_t arg1, caddr_t arg2); diff --git a/usr/src/uts/common/io/rge/rge_chip.c b/usr/src/uts/common/io/rge/rge_chip.c index 6210fc25fc..c509e01ebb 100644 --- a/usr/src/uts/common/io/rge/rge_chip.c +++ b/usr/src/uts/common/io/rge/rge_chip.c @@ -1258,11 +1258,12 @@ rge_chip_sync(rge_t *rgep, enum rge_sync_op todo) } } -void rge_chip_blank(void *arg, time_t ticks, uint_t count); +void rge_chip_blank(void *arg, time_t ticks, uint_t count, int flag); #pragma no_inline(rge_chip_blank) +/* ARGSUSED */ void -rge_chip_blank(void *arg, time_t ticks, uint_t count) +rge_chip_blank(void *arg, time_t ticks, uint_t count, int flag) { _NOTE(ARGUNUSED(arg, ticks, count)); } diff --git a/usr/src/uts/common/io/rge/rge_main.c b/usr/src/uts/common/io/rge/rge_main.c index c473a86b7f..ab9ed63203 100644 --- a/usr/src/uts/common/io/rge/rge_main.c +++ b/usr/src/uts/common/io/rge/rge_main.c @@ -109,11 +109,10 @@ static void rge_m_stop(void *); static int rge_m_promisc(void *, boolean_t); static int rge_m_multicst(void *, boolean_t, const uint8_t *); static int rge_m_unicst(void *, const uint8_t *); -static void rge_m_resources(void *); static void rge_m_ioctl(void *, queue_t *, mblk_t *); static boolean_t rge_m_getcapab(void *, mac_capab_t, void *); -#define RGE_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB) +#define RGE_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) static mac_callbacks_t rge_m_callbacks = { RGE_M_CALLBACK_FLAGS, @@ -124,7 +123,6 @@ static mac_callbacks_t rge_m_callbacks = { rge_m_multicst, rge_m_unicst, rge_m_tx, - rge_m_resources, rge_m_ioctl, rge_m_getcapab }; @@ -1249,28 +1247,6 @@ rge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) } } -static void -rge_m_resources(void *arg) -{ - rge_t *rgep = arg; - mac_rx_fifo_t mrf; - - mutex_enter(rgep->genlock); - - /* - * Register Rx rings as resources and save mac - * resource id for future reference - */ - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = rge_chip_blank; - mrf.mrf_arg = (void *)rgep; - mrf.mrf_normal_blank_time = RGE_RX_INT_TIME; - mrf.mrf_normal_pkt_count = RGE_RX_INT_PKTS; - rgep->handle = mac_resource_add(rgep->mh, (mac_resource_t *)&mrf); - - mutex_exit(rgep->genlock); -} - /* ARGSUSED */ static boolean_t rge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) @@ -1302,12 +1278,6 @@ rge_m_getcapab(void *arg, 
mac_capab_t cap, void *cap_data) } break; } - case MAC_CAPAB_POLL: - /* - * There's nothing for us to fill in, simply returning - * B_TRUE stating that we support polling is sufficient. - */ - break; default: return (B_FALSE); } diff --git a/usr/src/uts/common/io/rge/rge_rxtx.c b/usr/src/uts/common/io/rge/rge_rxtx.c index 301b023e5a..09d23825d3 100755..100644 --- a/usr/src/uts/common/io/rge/rge_rxtx.c +++ b/usr/src/uts/common/io/rge/rge_rxtx.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "rge.h" #define U32TOPTR(x) ((void *)(uintptr_t)(uint32_t)(x)) @@ -369,7 +367,7 @@ rge_receive(rge_t *rgep) mutex_exit(rgep->rx_lock); if (mp != NULL) - mac_rx(rgep->mh, rgep->handle, mp); + mac_rx(rgep->mh, NULL, mp); } diff --git a/usr/src/uts/common/io/rtw/rtw.c b/usr/src/uts/common/io/rtw/rtw.c index 1b99f01099..fa471c83a8 100644 --- a/usr/src/uts/common/io/rtw/rtw.c +++ b/usr/src/uts/common/io/rtw/rtw.c @@ -54,7 +54,7 @@ #include <sys/sunddi.h> #include <sys/pci.h> #include <sys/errno.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/dlpi.h> #include <sys/ethernet.h> #include <sys/list.h> @@ -178,7 +178,6 @@ static mac_callbacks_t rtw_m_callbacks = { rtw_m_multicst, rtw_m_unicst, rtw_m_tx, - NULL, rtw_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/rum/rum.c b/usr/src/uts/common/io/rum/rum.c index 8b09c53171..6c61cbbebd 100644 --- a/usr/src/uts/common/io/rum/rum.c +++ b/usr/src/uts/common/io/rum/rum.c @@ -43,7 +43,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -291,7 +291,6 @@ static mac_callbacks_t rum_m_callbacks = { rum_m_multicst, rum_m_unicst, rum_m_tx, - NULL, /* mc_resources; */ rum_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/sfe/sfe_util.c b/usr/src/uts/common/io/sfe/sfe_util.c index 0d8f736d15..fdee7b6d2f 100644 --- a/usr/src/uts/common/io/sfe/sfe_util.c +++ b/usr/src/uts/common/io/sfe/sfe_util.c @@ -32,6 +32,11 @@ */ /* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* * System Header files. 
*/ #include <sys/types.h> @@ -1958,7 +1963,7 @@ next: * send up received packets */ mutex_exit(&dp->intrlock); - mac_rx(dp->mh, dp->mac_rx_ring_ha, rx_head); + mac_rx(dp->mh, NULL, rx_head); mutex_enter(&dp->intrlock); } @@ -4050,11 +4055,10 @@ static int gem_m_setpromisc(void *, boolean_t); static int gem_m_multicst(void *, boolean_t, const uint8_t *); static int gem_m_unicst(void *, const uint8_t *); static mblk_t *gem_m_tx(void *, mblk_t *); -static void gem_m_resources(void *); static void gem_m_ioctl(void *, queue_t *, mblk_t *); static boolean_t gem_m_getcapab(void *, mac_capab_t, void *); -#define GEM_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB) +#define GEM_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) static mac_callbacks_t gem_m_callbacks = { GEM_M_CALLBACK_FLAGS, @@ -4065,7 +4069,6 @@ static mac_callbacks_t gem_m_callbacks = { gem_m_multicst, gem_m_unicst, gem_m_tx, - gem_m_resources, gem_m_ioctl, gem_m_getcapab, }; @@ -4590,45 +4593,6 @@ gem_m_tx(void *arg, mblk_t *mp) } static void -gem_set_coalease(void *arg, time_t ticks, uint_t count) -{ - struct gem_dev *dp = arg; - DPRINTF(1, (CE_CONT, "%s: %s: ticks:%d count:%d", - dp->name, __func__, ticks, count)); - - mutex_enter(&dp->intrlock); - dp->poll_pkt_delay = min(count, dp->gc.gc_rx_ring_size/2); - mutex_exit(&dp->intrlock); -} - -static void -gem_m_resources(void *arg) -{ - struct gem_dev *dp = arg; - mac_rx_fifo_t mrf; - - DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); - - mutex_enter(&dp->intrlock); - mutex_enter(&dp->xmitlock); - - /* - * Register Rx rings as resources and save mac - * resource id for future reference - */ - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = gem_set_coalease; - mrf.mrf_arg = (void *)dp; - mrf.mrf_normal_blank_time = 1; /* in uS */ - mrf.mrf_normal_pkt_count = dp->poll_pkt_delay; - - dp->mac_rx_ring_ha = mac_resource_add(dp->mh, (mac_resource_t *)&mrf); - - mutex_exit(&dp->xmitlock); - mutex_exit(&dp->intrlock); -} - -static void gem_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) { DPRINTF(0, (CE_CONT, "!%s: %s: called", @@ -4637,18 +4601,11 @@ gem_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) gem_mac_ioctl((struct gem_dev *)arg, wq, mp); } +/* ARGSUSED */ static boolean_t gem_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { - boolean_t ret; - - ret = B_FALSE; - switch (cap) { - case MAC_CAPAB_POLL: - ret = B_TRUE; - break; - } - return (ret); + return (B_FALSE); } static void diff --git a/usr/src/uts/common/io/sfe/sfe_util.h b/usr/src/uts/common/io/sfe/sfe_util.h index 576a3d5d08..6c8ca8fea4 100644 --- a/usr/src/uts/common/io/sfe/sfe_util.h +++ b/usr/src/uts/common/io/sfe/sfe_util.h @@ -31,9 +31,14 @@ * DAMAGE. */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + #ifndef _SFE_UTIL_H_ #define _SFE_UTIL_H_ -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> /* diff --git a/usr/src/uts/common/io/softmac/softmac_ctl.c b/usr/src/uts/common/io/softmac/softmac_ctl.c index b1b8cd4f42..99c665aae6 100644 --- a/usr/src/uts/common/io/softmac/softmac_ctl.c +++ b/usr/src/uts/common/io/softmac/softmac_ctl.c @@ -23,9 +23,9 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/stropts.h> +#include <sys/strsubr.h> +#include <sys/callb.h> #include <sys/softmac_impl.h> int @@ -192,11 +192,9 @@ softmac_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) } static void -softmac_process_notify_ind(queue_t *rq, mblk_t *mp) +softmac_process_notify_ind(softmac_t *softmac, mblk_t *mp) { - softmac_lower_t *slp = rq->q_ptr; dl_notify_ind_t *dlnip = (dl_notify_ind_t *)mp->b_rptr; - softmac_t *softmac = slp->sl_softmac; uint_t addroff, addrlen; ASSERT(dlnip->dl_primitive == DL_NOTIFY_IND); @@ -231,6 +229,73 @@ softmac_process_notify_ind(queue_t *rq, mblk_t *mp) freemsg(mp); } +void +softmac_notify_thread(void *arg) +{ + softmac_t *softmac = arg; + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, &softmac->smac_mutex, callb_generic_cpr, + "softmac_notify_thread"); + + mutex_enter(&softmac->smac_mutex); + + /* + * Quit the thread if smac_mh is unregistered. + */ + while (softmac->smac_mh != NULL && + !(softmac->smac_flags & SOFTMAC_NOTIFY_QUIT)) { + mblk_t *mp, *nextmp; + + if ((mp = softmac->smac_notify_head) == NULL) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + CALLB_CPR_SAFE_END(&cprinfo, &softmac->smac_mutex); + continue; + } + + softmac->smac_notify_head = softmac->smac_notify_tail = NULL; + mutex_exit(&softmac->smac_mutex); + + while (mp != NULL) { + nextmp = mp->b_next; + mp->b_next = NULL; + softmac_process_notify_ind(softmac, mp); + mp = nextmp; + } + mutex_enter(&softmac->smac_mutex); + } + + /* + * The softmac is being destroyed; simply free all of the DL_NOTIFY_IND + * messages left in the queue that did not have a chance to be + * processed. + */ + freemsgchain(softmac->smac_notify_head); + softmac->smac_notify_head = softmac->smac_notify_tail = NULL; + softmac->smac_notify_thread = NULL; + cv_broadcast(&softmac->smac_cv); + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); +} + +static void +softmac_enqueue_notify_ind(queue_t *rq, mblk_t *mp) +{ + softmac_lower_t *slp = rq->q_ptr; + softmac_t *softmac = slp->sl_softmac; + + mutex_enter(&softmac->smac_mutex); + if (softmac->smac_notify_tail == NULL) { + softmac->smac_notify_head = softmac->smac_notify_tail = mp; + } else { + softmac->smac_notify_tail->b_next = mp; + softmac->smac_notify_tail = mp; + } + cv_broadcast(&softmac->smac_cv); + mutex_exit(&softmac->smac_mutex); +} + static void softmac_process_dlpi(softmac_lower_t *slp, mblk_t *mp, uint_t minlen, t_uscalar_t reqprim) @@ -295,7 +360,29 @@ softmac_rput_process_proto(queue_t *rq, mblk_t *mp) if (len < DL_NOTIFY_IND_SIZE) goto runt; - softmac_process_notify_ind(rq, mp); + /* + * Enqueue all the DL_NOTIFY_IND messages and process them + * in a separate thread to avoid deadlock. Here is an + * example of the deadlock scenario: + * + * Thread A: mac_promisc_set()->softmac_m_promisc() + * + * The softmac driver waits for the ACK of the + * DL_PROMISC_PHYS request while holding the MAC perimeter; + * + * Thread B: + * + * The driver handles the DL_PROMISC_PHYS request. Before + * it sends back the ACK, it could first send a + * DL_NOTE_PROMISC_ON_PHYS notification. + * + * Since DL_NOTIFY_IND could eventually cause softmac to call + * mac_xxx_update(), which requires the MAC perimeter, this + * would deadlock the two threads. Enqueuing the + * DL_NOTIFY_IND message and deferring its processing + * avoids the potential deadlock.
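+ * The deferral itself is a plain producer/consumer handoff (see
+ * softmac_enqueue_notify_ind() and softmac_notify_thread() above);
+ * in sketch, the enqueue side is:
+ *
+ *	mutex_enter(&softmac->smac_mutex);
+ *	(append mp to smac_notify_head/smac_notify_tail)
+ *	cv_broadcast(&softmac->smac_cv);
+ *	mutex_exit(&softmac->smac_mutex);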
+ */ + softmac_enqueue_notify_ind(rq, mp); return; case DL_NOTIFY_ACK: diff --git a/usr/src/uts/common/io/softmac/softmac_dev.c b/usr/src/uts/common/io/softmac/softmac_dev.c index 3d2164e782..f548df055d 100644 --- a/usr/src/uts/common/io/softmac/softmac_dev.c +++ b/usr/src/uts/common/io/softmac/softmac_dev.c @@ -222,11 +222,6 @@ softmac_close(queue_t *rq) slp->sl_softmac = NULL; slp->sl_lh = NULL; - /* - * slp->sl_handle could be non-NULL if it is in the aggregation. - */ - slp->sl_handle = (mac_resource_handle_t)NULL; - ASSERT(slp->sl_ack_mp == NULL); ASSERT(slp->sl_ctl_inprogress == B_FALSE); ASSERT(slp->sl_pending_prim == DL_PRIM_INVAL); @@ -266,6 +261,16 @@ softmac_rput(queue_t *rq, mblk_t *mp) } /* + * If this message is looped back from the legacy devices, + * drop it as the Nemo framework will be responsible for + * looping it back by the mac_txloop() function. + */ + if (mp->b_flag & MSGNOLOOP) { + freemsg(mp); + return; + } + + /* * This is the most common case. */ if (DB_REF(mp) == 1) { @@ -276,7 +281,7 @@ softmac_rput(queue_t *rq, mblk_t *mp) * is reset to NULL when DL_CAPAB_POLL is * disabled. */ - mac_rx(slp->sl_softmac->smac_mh, slp->sl_handle, mp); + mac_rx(slp->sl_softmac->smac_mh, NULL, mp); return; } else { softmac_rput_process_data(slp, mp); diff --git a/usr/src/uts/common/io/softmac/softmac_main.c b/usr/src/uts/common/io/softmac/softmac_main.c index d325e3b4c6..0187cf8a28 100644 --- a/usr/src/uts/common/io/softmac/softmac_main.c +++ b/usr/src/uts/common/io/softmac/softmac_main.c @@ -44,6 +44,8 @@ #include <sys/file.h> #include <sys/cred.h> #include <sys/dlpi.h> +#include <sys/mac_provider.h> +#include <sys/disp.h> #include <sys/sunndi.h> #include <sys/modhash.h> #include <sys/stropts.h> @@ -53,11 +55,19 @@ #include <sys/softmac.h> #include <sys/dls.h> +/* Used as a parameter to the mod hash walk of softmac structures */ +typedef struct { + softmac_t *smw_softmac; + boolean_t smw_retry; +} softmac_walk_t; + /* * Softmac hash table including softmacs for both style-2 and style-1 devices. */ static krwlock_t softmac_hash_lock; static mod_hash_t *softmac_hash; +static kmutex_t smac_global_lock; +static kcondvar_t smac_global_cv; #define SOFTMAC_HASHSZ 64 @@ -71,7 +81,7 @@ static void softmac_m_close(void *); static boolean_t softmac_m_getcapab(void *, mac_capab_t, void *); #define SOFTMAC_M_CALLBACK_FLAGS \ - (MC_RESOURCES | MC_IOCTL | MC_GETCAPAB | MC_OPEN | MC_CLOSE) + (MC_IOCTL | MC_GETCAPAB | MC_OPEN | MC_CLOSE) static mac_callbacks_t softmac_m_callbacks = { SOFTMAC_M_CALLBACK_FLAGS, @@ -82,7 +92,6 @@ static mac_callbacks_t softmac_m_callbacks = { softmac_m_multicst, softmac_m_unicst, softmac_m_tx, - softmac_m_resources, softmac_m_ioctl, softmac_m_getcapab, softmac_m_open, @@ -97,6 +106,8 @@ softmac_init() mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); rw_init(&softmac_hash_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&smac_global_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&smac_global_cv, NULL, CV_DRIVER, NULL); } void @@ -104,6 +115,8 @@ softmac_fini() { rw_destroy(&softmac_hash_lock); mod_hash_destroy_hash(softmac_hash); + mutex_destroy(&smac_global_lock); + cv_destroy(&smac_global_cv); } /* ARGSUSED */ @@ -128,7 +141,8 @@ softmac_busy() } /* - * This function is called for each minor node during the post-attach of + * + * softmac_create() is called for each minor node during the post-attach of * each DDI_NT_NET device instance. 
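The enqueue-and-signal handoff above is a standard producer/consumer pattern. As a rough userland model of the same shape, with POSIX threads standing in for the kernel's mutex, condition-variable, and thread primitives (all names here are illustrative, not taken from this commit):

#include <pthread.h>
#include <stdlib.h>

typedef struct msg {
	struct msg	*m_next;
	int		m_note;		/* stands in for a DL_NOTIFY_IND */
} msg_t;

static pthread_mutex_t	q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	q_cv = PTHREAD_COND_INITIALIZER;
static msg_t		*q_head, *q_tail;
static int		q_quit;

/* Receive path: link the message on and signal; never process in place. */
static void
enqueue_notify(msg_t *mp)
{
	(void) pthread_mutex_lock(&q_lock);
	if (q_tail == NULL) {
		q_head = q_tail = mp;
	} else {
		q_tail->m_next = mp;
		q_tail = mp;
	}
	(void) pthread_cond_broadcast(&q_cv);
	(void) pthread_mutex_unlock(&q_lock);
}

/* Worker: grab the whole chain, drop the lock, then process. */
static void *
notify_thread(void *arg)
{
	(void) pthread_mutex_lock(&q_lock);
	while (!q_quit) {
		msg_t	*mp = q_head;

		if (mp == NULL) {
			(void) pthread_cond_wait(&q_cv, &q_lock);
			continue;
		}
		q_head = q_tail = NULL;
		(void) pthread_mutex_unlock(&q_lock);
		while (mp != NULL) {
			msg_t	*next = mp->m_next;
			/* process_notify(mp) would run lock-free here */
			free(mp);
			mp = next;
		}
		(void) pthread_mutex_lock(&q_lock);
	}
	(void) pthread_mutex_unlock(&q_lock);
	return (arg);
}

The property mirrored from the driver is that message processing runs with no queue lock held, so it can safely re-enter paths that take other locks.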
Note that it is possible that a device
 * instance has two minor nodes (DLPI style-1 and style-2), so that for that
 * specific device, softmac_create() could be called twice.
@@ -139,7 +153,99 @@ softmac_busy()
  * For each minor node of a legacy device, a taskq is started to finish
  * softmac_mac_register(), which will finish the rest of work (see comments
  * above softmac_mac_register()).
+ *
+ * softmac state machine
+ * --------------------------------------------------------------------------
+ * OLD STATE		EVENT					NEW STATE
+ * --------------------------------------------------------------------------
+ * UNINIT		attach of 1st minor node		ATTACH_INPROG
+ * okcnt = 0		net_postattach -> softmac_create	okcnt = 1
+ *
+ * ATTACH_INPROG	attach of 2nd minor node (GLDv3)	ATTACH_DONE
+ * okcnt = 1		net_postattach -> softmac_create	okcnt = 2
+ *
+ * ATTACH_INPROG	attach of 2nd minor node (legacy)	ATTACH_INPROG
+ * okcnt = 1		net_postattach -> softmac_create	okcnt = 2
+ *			schedule softmac_mac_register
+ *
+ * ATTACH_INPROG	legacy device node			ATTACH_DONE
+ * okcnt = 2		softmac_mac_register			okcnt = 2
+ *
+ * ATTACH_DONE		detach of 1st minor node		DETACH_INPROG
+ * okcnt = 2		(success)				okcnt = 1
+ *
+ * DETACH_INPROG	detach of 2nd minor node		UNINIT (or free)
+ * okcnt = 1		(success)				okcnt = 0
+ *
+ * ATTACH_DONE		detach failure				state unchanged
+ * DETACH_INPROG						left = okcnt
+ *
+ * DETACH_INPROG	reattach				ATTACH_INPROG
+ * okcnt = 0,1		net_postattach -> softmac_create
+ *
+ * ATTACH_DONE		reattach				ATTACH_DONE
+ * left != 0		net_postattach -> softmac_create	left = 0
+ *
+ * Abbreviation notes:
+ * states have SOFTMAC_ prefix,
+ * okcnt - softmac_attach_okcnt,
+ * left - softmac_attached_left
+ */
+
+#ifdef DEBUG
+void
+softmac_state_verify(softmac_t *softmac)
+{
+	ASSERT(MUTEX_HELD(&softmac->smac_mutex));
+
+	/*
+	 * There are at most 2 minor nodes, one per DLPI style.
+	 */
+	ASSERT(softmac->smac_cnt <= 2 && softmac->smac_attachok_cnt <= 2);
+
+	/*
+	 * The smac_attachok_cnt represents the number of attaches, i.e., the
+	 * number of times net_postattach -> softmac_create() has been called
+	 * for a device instance.
+	 */
+	ASSERT(softmac->smac_attachok_cnt == SMAC_NONZERO_NODECNT(softmac));
+
+	/*
+	 * softmac_create (or softmac_mac_register) -> softmac_create_datalink
+	 * happens only after all minor nodes have been attached.
+	 */
+	ASSERT(softmac->smac_state != SOFTMAC_ATTACH_DONE ||
+	    softmac->smac_attachok_cnt == softmac->smac_cnt);
+
+	if (softmac->smac_attachok_cnt == 0) {
+		ASSERT(softmac->smac_state == SOFTMAC_UNINIT);
+		ASSERT(softmac->smac_mh == NULL);
+	} else if (softmac->smac_attachok_cnt < softmac->smac_cnt) {
+		ASSERT(softmac->smac_state == SOFTMAC_ATTACH_INPROG ||
+		    softmac->smac_state == SOFTMAC_DETACH_INPROG);
+		ASSERT(softmac->smac_mh == NULL);
+	} else {
+		/*
+		 * In the stable condition the state would be
But there is a small transient window + * in softmac_destroy where we change the state to + * SOFTMAC_DETACH_INPROG and drop the lock before doing + * the link destroy + */ + ASSERT(softmac->smac_attachok_cnt == softmac->smac_cnt); + ASSERT(softmac->smac_state != SOFTMAC_UNINIT); + } + if (softmac->smac_mh != NULL) + ASSERT(softmac->smac_attachok_cnt == softmac->smac_cnt); +} +#endif + +#ifdef DEBUG +#define SOFTMAC_STATE_VERIFY(softmac) softmac_state_verify(softmac) +#else +#define SOFTMAC_STATE_VERIFY(softmac) +#endif + int softmac_create(dev_info_t *dip, dev_t dev) { @@ -181,9 +287,7 @@ softmac_create(dev_info_t *dip, dev_t dev) softmac = kmem_zalloc(sizeof (softmac_t), KM_SLEEP); mutex_init(&softmac->smac_mutex, NULL, MUTEX_DRIVER, NULL); cv_init(&softmac->smac_cv, NULL, CV_DRIVER, NULL); - rw_init(&softmac->smac_lock, NULL, RW_DRIVER, NULL); (void) strlcpy(softmac->smac_devname, devname, MAXNAMELEN); - /* * Insert the softmac into the hash table. */ @@ -191,9 +295,15 @@ softmac_create(dev_info_t *dip, dev_t dev) (mod_hash_key_t)softmac->smac_devname, (mod_hash_val_t)softmac); ASSERT(err == 0); + mutex_enter(&smac_global_lock); + cv_broadcast(&smac_global_cv); + mutex_exit(&smac_global_lock); } mutex_enter(&softmac->smac_mutex); + SOFTMAC_STATE_VERIFY(softmac); + if (softmac->smac_state != SOFTMAC_ATTACH_DONE) + softmac->smac_state = SOFTMAC_ATTACH_INPROG; if (softmac->smac_attachok_cnt == 0) { /* * Initialize the softmac if this is the post-attach of the @@ -231,45 +341,26 @@ softmac_create(dev_info_t *dip, dev_t dev) index = (getmajor(dev) == ddi_name_to_major("clone")); if (softmac->smac_softmac[index] != NULL) { /* - * This is possible if the post_attach() is called: - * - * a. after pre_detach() fails. - * - * b. for a new round of reattachment. Note that DACF will not - * call pre_detach() for successfully post_attached minor - * nodes even when the post-attach failed after all. - * - * Both seem to be defects in the DACF framework. To work - * around it and only clear the SOFTMAC_ATTACH_DONE flag for - * the b case, a smac_attached_left field is used to tell - * the two cases apart. + * This is possible if the post_attach() is called after + * pre_detach() fails. This seems to be a defect of the DACF + * framework. We work around it by using a smac_attached_left + * field that tracks this */ - ASSERT(softmac->smac_attachok_cnt != 0); - - if (softmac->smac_attached_left != 0) - /* case a */ - softmac->smac_attached_left--; - else if (softmac->smac_attachok_cnt != softmac->smac_cnt) { - /* case b */ - softmac->smac_flags &= ~SOFTMAC_ATTACH_DONE; - } + ASSERT(softmac->smac_attached_left != 0); + softmac->smac_attached_left--; mutex_exit(&softmac->smac_mutex); rw_exit(&softmac_hash_lock); return (0); + } mutex_exit(&softmac->smac_mutex); rw_exit(&softmac_hash_lock); - /* - * No lock is needed for access this softmac pointer, as pre-detach and - * post-attach won't happen at the same time. - */ - mutex_enter(&softmac->smac_mutex); - softmac_dev = kmem_zalloc(sizeof (softmac_dev_t), KM_SLEEP); softmac_dev->sd_dev = dev; - softmac->smac_softmac[index] = softmac_dev; + mutex_enter(&softmac->smac_mutex); + softmac->smac_softmac[index] = softmac_dev; /* * Continue to register the mac and create the datalink only when all * the minor nodes are attached. @@ -281,18 +372,22 @@ softmac_create(dev_info_t *dip, dev_t dev) /* * All of the minor nodes have been attached; start a taskq - * to do the rest of the work. We use a taskq instead of of + * to do the rest of the work. 
We use a taskq instead of
	 * doing the work here because:
	 *
-	 * - We could be called as a result of an open() system call
-	 *   where spec_open() already SLOCKED the snode. Using a taskq
-	 *   sidesteps the risk that our ldi_open_by_dev() call would
-	 *   deadlock trying to set SLOCKED on the snode again.
+	 * We could be called as a result of an open() system call
+	 * where spec_open() already SLOCKED the snode. Using a taskq
+	 * sidesteps the risk that our ldi_open_by_dev() call would
+	 * deadlock trying to set SLOCKED on the snode again.
-	 * - The devfs design requires no interruptible function calls
-	 *   in the device post-attach routine, but we need to make an
-	 *   (interruptible) upcall. Using a taskq to make the upcall
-	 *   sidesteps this.
+	 * The devfs design requires that the downcalls don't use any
+	 * interruptible cv_wait, which happens when we do door upcalls.
+	 * Otherwise the downcalls, which may be holding devfs resources,
+	 * may cause a deadlock if the thread is stopped. Also we need to
+	 * make sure these downcalls into softmac_create or softmac_destroy
+	 * don't cv_wait on any devfs-related condition. Thus softmac_destroy
+	 * returns EBUSY if the asynchronous threads started in softmac_create
+	 * haven't finished.
	 */
	ASSERT(softmac->smac_taskq == NULL);
	softmac->smac_taskq = taskq_dispatch(system_taskq,
@@ -331,7 +426,6 @@ softmac_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
	 * simply return B_TRUE if we support it.
	 */
	case MAC_CAPAB_NO_ZCOPY:
-	case MAC_CAPAB_POLL:
	case MAC_CAPAB_NO_NATIVEVLAN:
	default:
		break;
@@ -396,8 +490,6 @@ softmac_create_datalink(softmac_t *softmac)
	datalink_id_t	linkid = DATALINK_INVALID_LINKID;
	int		err;

-	ASSERT(MUTEX_HELD(&softmac->smac_mutex));
-
	/*
	 * Inform dlmgmtd of this link so that softmac_hold_device() is able
	 * to know the existence of this link. If this failed with EBADF,
@@ -429,8 +521,11 @@
		return (err);
	}

-	if (linkid == DATALINK_INVALID_LINKID)
+	if (linkid == DATALINK_INVALID_LINKID) {
+		mutex_enter(&softmac->smac_mutex);
		softmac->smac_flags |= SOFTMAC_NEED_RECREATE;
+		mutex_exit(&softmac->smac_mutex);
+	}

	return (0);
}
@@ -453,6 +548,8 @@ softmac_create_task(void *arg)
	mutex_enter(&softmac->smac_mutex);
	softmac->smac_media = (mac_info(mh))->mi_nativemedia;
	softmac->smac_mh = mh;
+	softmac->smac_taskq = NULL;
+	mutex_exit(&softmac->smac_mutex);

	/*
	 * We can safely release the reference on the mac because
@@ -467,10 +564,13 @@
	 */
	err = softmac_create_datalink(softmac);

+	mutex_enter(&softmac->smac_mutex);
done:
-	ASSERT(!(softmac->smac_flags & SOFTMAC_ATTACH_DONE));
-	softmac->smac_flags |= SOFTMAC_ATTACH_DONE;
-	softmac->smac_attacherr = err;
+	if (err != 0) {
+		softmac->smac_mh = NULL;
+		softmac->smac_attacherr = err;
+	}
+	softmac->smac_state = SOFTMAC_ATTACH_DONE;
	softmac->smac_taskq = NULL;
	cv_broadcast(&softmac->smac_cv);
	mutex_exit(&softmac->smac_mutex);
@@ -498,6 +598,8 @@ softmac_mac_register(softmac_t *softmac)
	 * as softmac_destroy() will wait until this function is called.
	 */
	ASSERT(softmac != NULL);
+	ASSERT(softmac->smac_state == SOFTMAC_ATTACH_INPROG &&
+	    softmac->smac_attachok_cnt == softmac->smac_cnt);

	if ((err = ldi_ident_from_dip(softmac_dip, &li)) != 0) {
		mutex_enter(&softmac->smac_mutex);
@@ -617,11 +719,9 @@ softmac_mac_register(softmac_t *softmac)
	 * dl_bind() because some drivers return DL_ERROR_ACK if the
	 * stream is not bound. It is also before mac_register(), so
	 * we don't need any lock protection here.
- * - * Softmac always supports POLL. */ softmac->smac_capab_flags = - (MAC_CAPAB_POLL | MAC_CAPAB_NO_ZCOPY | MAC_CAPAB_LEGACY); + (MAC_CAPAB_NO_ZCOPY | MAC_CAPAB_LEGACY); softmac->smac_no_capability_req = B_FALSE; if (softmac_fill_capab(lh, softmac) != 0) @@ -714,6 +814,7 @@ softmac_mac_register(softmac_t *softmac) goto done; } } + mutex_exit(&softmac->smac_mutex); /* * Try to create the datalink for this softmac. @@ -724,10 +825,21 @@ softmac_mac_register(softmac_t *softmac) softmac->smac_mh = NULL; } } + /* + * If succeed, create the thread which handles the DL_NOTIFY_IND from + * the lower stream. + */ + if (softmac->smac_mh != NULL) { + softmac->smac_notify_thread = thread_create(NULL, 0, + softmac_notify_thread, softmac, 0, &p0, + TS_RUN, minclsyspri); + } + mutex_enter(&softmac->smac_mutex); done: - ASSERT(!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)); - softmac->smac_flags |= SOFTMAC_ATTACH_DONE; + ASSERT(softmac->smac_state == SOFTMAC_ATTACH_INPROG && + softmac->smac_attachok_cnt == softmac->smac_cnt); + softmac->smac_state = SOFTMAC_ATTACH_DONE; softmac->smac_attacherr = err; softmac->smac_taskq = NULL; cv_broadcast(&softmac->smac_cv); @@ -743,24 +855,37 @@ softmac_destroy(dev_info_t *dip, dev_t dev) int index; int ppa, err; datalink_id_t linkid; + mac_handle_t smac_mh; + uint32_t smac_flags; ppa = ddi_get_instance(dip); (void) snprintf(devname, MAXNAMELEN, "%s%d", ddi_driver_name(dip), ppa); - rw_enter(&softmac_hash_lock, RW_WRITER); + /* + * We are called only from the predetach entry point. The DACF + * framework ensures there can't be a concurrent postattach call + * for the same softmac. The softmac found out from the modhash + * below can't vanish beneath us since this is the only place where + * it is deleted. + */ err = mod_hash_find(softmac_hash, (mod_hash_key_t)devname, (mod_hash_val_t *)&softmac); ASSERT(err == 0); mutex_enter(&softmac->smac_mutex); + SOFTMAC_STATE_VERIFY(softmac); /* * Fail the predetach routine if this softmac is in-use. + * Make sure these downcalls into softmac_create or softmac_destroy + * don't cv_wait on any devfs related condition. Thus softmac_destroy + * returns EBUSY if the asynchronous thread started in softmac_create + * hasn't finished */ - if (softmac->smac_hold_cnt != 0) { + if ((softmac->smac_hold_cnt != 0) || + (softmac->smac_state == SOFTMAC_ATTACH_INPROG)) { softmac->smac_attached_left = softmac->smac_attachok_cnt; mutex_exit(&softmac->smac_mutex); - rw_exit(&softmac_hash_lock); return (EBUSY); } @@ -772,78 +897,106 @@ softmac_destroy(dev_info_t *dip, dev_t dev) */ if (softmac->smac_attached_left != 0) { mutex_exit(&softmac->smac_mutex); - rw_exit(&softmac_hash_lock); return (EBUSY); } - if (softmac->smac_attachok_cnt != softmac->smac_cnt) - goto done; - - /* - * This is the detach for the first minor node. Wait until all the - * minor nodes are attached. - */ - while (!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)) - cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + smac_mh = softmac->smac_mh; + smac_flags = softmac->smac_flags; + softmac->smac_state = SOFTMAC_DETACH_INPROG; + mutex_exit(&softmac->smac_mutex); - if (softmac->smac_mh != NULL) { - if (!(softmac->smac_flags & SOFTMAC_NOSUPP)) { - if ((err = dls_devnet_destroy(softmac->smac_mh, - &linkid)) != 0) { - goto done; + if (smac_mh != NULL) { + /* + * This is the first minor node that is being detached for this + * softmac. 
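As an aside on the attach/detach bookkeeping used here, the state and count invariants from the state-machine table further up can be restated as a small standalone check. The names below are hypothetical simplifications of the softmac fields, not code from this commit:

#include <assert.h>

typedef enum {
	ST_UNINIT,
	ST_ATTACH_INPROG,
	ST_ATTACH_DONE,
	ST_DETACH_INPROG
} st_t;

typedef struct {
	st_t	d_state;
	int	d_cnt;		/* minor nodes for the device, at most 2 */
	int	d_okcnt;	/* minor nodes attached so far */
} dev_state_t;

/* cf. softmac_state_verify(): the counts pin down the legal states */
static void
dev_state_verify(const dev_state_t *d)
{
	assert(d->d_cnt <= 2 && d->d_okcnt <= d->d_cnt);
	if (d->d_okcnt == 0)
		assert(d->d_state == ST_UNINIT);
	else if (d->d_okcnt < d->d_cnt)
		assert(d->d_state == ST_ATTACH_INPROG ||
		    d->d_state == ST_DETACH_INPROG);
	else
		assert(d->d_state != ST_UNINIT);
}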
+ */ + ASSERT(softmac->smac_attachok_cnt == softmac->smac_cnt); + if (!(smac_flags & SOFTMAC_NOSUPP)) { + if ((err = dls_devnet_destroy(smac_mh, &linkid, + B_FALSE)) != 0) { + goto error; } } /* * If softmac_mac_register() succeeds in registering the mac * of the legacy device, unregister it. */ - if (!(softmac->smac_flags & (SOFTMAC_GLDV3 | SOFTMAC_NOSUPP))) { - if ((err = mac_unregister(softmac->smac_mh)) != 0) { - (void) dls_devnet_create(softmac->smac_mh, - linkid); - goto done; + if (!(smac_flags & (SOFTMAC_GLDV3 | SOFTMAC_NOSUPP))) { + if ((err = mac_disable_nowait(smac_mh)) != 0) { + (void) dls_devnet_create(smac_mh, linkid); + goto error; } + /* + * Ask softmac_notify_thread to quit, and wait for + * that to be done. + */ + mutex_enter(&softmac->smac_mutex); + softmac->smac_flags |= SOFTMAC_NOTIFY_QUIT; + cv_broadcast(&softmac->smac_cv); + while (softmac->smac_notify_thread != NULL) { + cv_wait(&softmac->smac_cv, + &softmac->smac_mutex); + } + mutex_exit(&softmac->smac_mutex); + VERIFY(mac_unregister(smac_mh) == 0); } softmac->smac_mh = NULL; } - softmac->smac_flags &= ~SOFTMAC_ATTACH_DONE; -done: - if (err == 0) { - /* - * Free softmac_dev - */ - index = (getmajor(dev) == ddi_name_to_major("clone")); - softmac_dev = softmac->smac_softmac[index]; - ASSERT(softmac_dev != NULL); - softmac->smac_softmac[index] = NULL; - kmem_free(softmac_dev, sizeof (softmac_dev_t)); - - if (--softmac->smac_attachok_cnt == 0) { - mod_hash_val_t hashval; - - err = mod_hash_remove(softmac_hash, - (mod_hash_key_t)devname, - (mod_hash_val_t *)&hashval); - ASSERT(err == 0); + /* + * Free softmac_dev + */ + rw_enter(&softmac_hash_lock, RW_WRITER); + mutex_enter(&softmac->smac_mutex); + ASSERT(softmac->smac_state == SOFTMAC_DETACH_INPROG && + softmac->smac_attachok_cnt != 0); + softmac->smac_mh = NULL; + index = (getmajor(dev) == ddi_name_to_major("clone")); + softmac_dev = softmac->smac_softmac[index]; + ASSERT(softmac_dev != NULL); + softmac->smac_softmac[index] = NULL; + kmem_free(softmac_dev, sizeof (softmac_dev_t)); + + if (--softmac->smac_attachok_cnt == 0) { + mod_hash_val_t hashval; + + softmac->smac_state = SOFTMAC_UNINIT; + if (softmac->smac_hold_cnt != 0) { + /* + * Someone did a softmac_hold_device while we dropped + * the locks. 
Leave the softmac itself intact which + * will be reused by the reattach + */ mutex_exit(&softmac->smac_mutex); rw_exit(&softmac_hash_lock); - - ASSERT(softmac->smac_taskq == NULL); - ASSERT(!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)); - mutex_destroy(&softmac->smac_mutex); - cv_destroy(&softmac->smac_cv); - rw_destroy(&softmac->smac_lock); - kmem_free(softmac, sizeof (softmac_t)); return (0); } - } else { - softmac->smac_attached_left = softmac->smac_attachok_cnt; - } + ASSERT(softmac->smac_taskq == NULL); + err = mod_hash_remove(softmac_hash, + (mod_hash_key_t)devname, + (mod_hash_val_t *)&hashval); + ASSERT(err == 0); + + mutex_exit(&softmac->smac_mutex); + rw_exit(&softmac_hash_lock); + + mutex_destroy(&softmac->smac_mutex); + cv_destroy(&softmac->smac_cv); + kmem_free(softmac, sizeof (softmac_t)); + return (0); + } mutex_exit(&softmac->smac_mutex); rw_exit(&softmac_hash_lock); + return (0); + +error: + mutex_enter(&softmac->smac_mutex); + softmac->smac_attached_left = softmac->smac_attachok_cnt; + softmac->smac_state = SOFTMAC_ATTACH_DONE; + cv_broadcast(&softmac->smac_cv); + mutex_exit(&softmac->smac_mutex); return (err); } @@ -863,17 +1016,33 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg) softmac_t *softmac = (softmac_t *)val; datalink_id_t linkid; int err; - - ASSERT(RW_READ_HELD(&softmac_hash_lock)); + softmac_walk_t *smwp = arg; /* - * Wait for softmac_create() and softmac_mac_register() to exit. + * The framework itself must not hold any locks across calls to the + * mac perimeter. Thus this function does not call any framework + * function that needs to grab the mac perimeter. */ + ASSERT(RW_READ_HELD(&softmac_hash_lock)); + + smwp->smw_retry = B_FALSE; mutex_enter(&softmac->smac_mutex); - while (!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)) - cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + SOFTMAC_STATE_VERIFY(softmac); + if (softmac->smac_state == SOFTMAC_ATTACH_INPROG) { + /* + * Wait till softmac_create or softmac_mac_register finishes + * Hold the softmac to ensure it stays around. The wait itself + * is done in the caller, since we need to drop all locks + * including the mod hash's internal lock before calling + * cv_wait. + */ + smwp->smw_retry = B_TRUE; + smwp->smw_softmac = softmac; + softmac->smac_hold_cnt++; + return (MH_WALK_TERMINATE); + } - if ((softmac->smac_attacherr != 0) || + if ((softmac->smac_state != SOFTMAC_ATTACH_DONE) || !(softmac->smac_flags & SOFTMAC_NEED_RECREATE)) { mutex_exit(&softmac->smac_mutex); return (MH_WALK_CONTINUE); @@ -918,13 +1087,30 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg) void softmac_recreate() { + softmac_walk_t smw; + softmac_t *softmac; + /* * Walk through the softmac_hash table. Request to create the * [link name, linkid] mapping if we failed to do so. */ - rw_enter(&softmac_hash_lock, RW_READER); - mod_hash_walk(softmac_hash, softmac_mac_recreate, NULL); - rw_exit(&softmac_hash_lock); + do { + smw.smw_retry = B_FALSE; + rw_enter(&softmac_hash_lock, RW_READER); + mod_hash_walk(softmac_hash, softmac_mac_recreate, &smw); + rw_exit(&softmac_hash_lock); + if (smw.smw_retry) { + /* + * softmac_create or softmac_mac_register hasn't yet + * finished and the softmac is not yet in the + * SOFTMAC_ATTACH_DONE state. 
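The hold-and-retry protocol this comment describes (pin the busy entry, terminate the walk, wait with only the entry's own lock, then rescan the table) can be modeled in userland roughly as follows. The types and names are illustrative only, with pthread locks standing in for krwlock/kmutex:

#include <pthread.h>

typedef struct entry {
	pthread_mutex_t	e_lock;
	pthread_cond_t	e_cv;
	int		e_busy;		/* attach still in progress */
	int		e_holds;	/* keeps the entry from being freed */
	struct entry	*e_next;
} entry_t;

static pthread_rwlock_t	tab_lock = PTHREAD_RWLOCK_INITIALIZER;
static entry_t		*tab_head;

static void
recreate_all(void)
{
	entry_t	*busy;

	do {
		busy = NULL;
		(void) pthread_rwlock_rdlock(&tab_lock);
		for (entry_t *e = tab_head; e != NULL; e = e->e_next) {
			(void) pthread_mutex_lock(&e->e_lock);
			if (e->e_busy) {
				e->e_holds++;	/* pin it, then bail out */
				busy = e;
				(void) pthread_mutex_unlock(&e->e_lock);
				break;
			}
			/* ... recreate work for a quiescent entry ... */
			(void) pthread_mutex_unlock(&e->e_lock);
		}
		(void) pthread_rwlock_unlock(&tab_lock);

		if (busy != NULL) {
			/* wait with no table lock held, then rescan */
			(void) pthread_mutex_lock(&busy->e_lock);
			while (busy->e_busy)
				(void) pthread_cond_wait(&busy->e_cv,
				    &busy->e_lock);
			busy->e_holds--;
			(void) pthread_mutex_unlock(&busy->e_lock);
		}
	} while (busy != NULL);
}

The point of the hold count is that the entry cannot vanish between dropping the table lock and the cv_wait, which is exactly what smac_hold_cnt guarantees above.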
+ */ + softmac = smw.smw_softmac; + cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + softmac->smac_hold_cnt--; + mutex_exit(&softmac->smac_mutex); + } + } while (smw.smw_retry); } /* ARGSUSED */ @@ -1064,20 +1250,14 @@ softmac_m_open(void *arg) softmac_lower_t *slp; int err; - rw_enter(&softmac->smac_lock, RW_READER); - if (softmac->smac_state == SOFTMAC_READY) - goto done; - rw_exit(&softmac->smac_lock); + ASSERT(MAC_PERIM_HELD(softmac->smac_mh)); + ASSERT(softmac->smac_lower_state == SOFTMAC_INITIALIZED); if ((err = softmac_lower_setup(softmac, &slp)) != 0) return (err); - rw_enter(&softmac->smac_lock, RW_WRITER); - ASSERT(softmac->smac_state == SOFTMAC_INITIALIZED); softmac->smac_lower = slp; - softmac->smac_state = SOFTMAC_READY; -done: - rw_exit(&softmac->smac_lock); + softmac->smac_lower_state = SOFTMAC_READY; return (0); } @@ -1087,7 +1267,8 @@ softmac_m_close(void *arg) softmac_t *softmac = arg; softmac_lower_t *slp; - rw_enter(&softmac->smac_lock, RW_WRITER); + ASSERT(MAC_PERIM_HELD(softmac->smac_mh)); + ASSERT(softmac->smac_lower_state == SOFTMAC_READY); slp = softmac->smac_lower; ASSERT(slp != NULL); @@ -1095,9 +1276,8 @@ softmac_m_close(void *arg) * Note that slp is destroyed when lh is closed. */ (void) ldi_close(slp->sl_lh, FREAD|FWRITE, kcred); - softmac->smac_state = SOFTMAC_INITIALIZED; + softmac->smac_lower_state = SOFTMAC_INITIALIZED; softmac->smac_lower = NULL; - rw_exit(&softmac->smac_lock); } int @@ -1146,7 +1326,10 @@ again: * be recreated when device fails to detach (as this device * is held). */ + mutex_enter(&smac_global_lock); rw_exit(&softmac_hash_lock); + cv_wait(&smac_global_cv, &smac_global_lock); + mutex_exit(&smac_global_lock); goto again; } @@ -1155,17 +1338,16 @@ again: */ mutex_enter(&softmac->smac_mutex); softmac->smac_hold_cnt++; - mutex_exit(&softmac->smac_mutex); - rw_exit(&softmac_hash_lock); /* * Wait till the device is fully attached. */ - mutex_enter(&softmac->smac_mutex); - while (!(softmac->smac_flags & SOFTMAC_ATTACH_DONE)) + while (softmac->smac_state != SOFTMAC_ATTACH_DONE) cv_wait(&softmac->smac_cv, &softmac->smac_mutex); + SOFTMAC_STATE_VERIFY(softmac); + if ((err = softmac->smac_attacherr) != 0) softmac->smac_hold_cnt--; else diff --git a/usr/src/uts/common/io/softmac/softmac_pkt.c b/usr/src/uts/common/io/softmac/softmac_pkt.c index 3587fa515a..4b8d7e3049 100644 --- a/usr/src/uts/common/io/softmac/softmac_pkt.c +++ b/usr/src/uts/common/io/softmac/softmac_pkt.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/strsubr.h> #include <inet/led.h> #include <sys/softmac_impl.h> @@ -69,40 +67,6 @@ softmac_m_tx(void *arg, mblk_t *mp) return (mp); } -/*ARGSUSED*/ -static void -softmac_blank(void *arg, time_t ticks, uint_t count) -{ -} - -void -softmac_m_resources(void *arg) -{ - softmac_t *softmac = arg; - softmac_lower_t *slp = softmac->smac_lower; - mac_rx_fifo_t mrf; - - ASSERT((softmac->smac_state == SOFTMAC_READY) && (slp != NULL)); - - /* - * Register rx resources and save resource handle for future reference. - * Note that the mac_resources() function must be called when the lower - * stream is plumbed. 
- */ - - mutex_enter(&slp->sl_mutex); - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = softmac_blank; - mrf.mrf_arg = slp; - mrf.mrf_normal_blank_time = SOFTMAC_BLANK_TICKS; - mrf.mrf_normal_pkt_count = SOFTMAC_BLANK_PKT_COUNT; - - slp->sl_handle = - mac_resource_add(softmac->smac_mh, (mac_resource_t *)&mrf); - - mutex_exit(&slp->sl_mutex); -} void softmac_rput_process_data(softmac_lower_t *slp, mblk_t *mp) @@ -125,7 +89,7 @@ softmac_rput_process_data(softmac_lower_t *slp, mblk_t *mp) mp = tmp; } - mac_rx(slp->sl_softmac->smac_mh, slp->sl_handle, mp); + mac_rx(slp->sl_softmac->smac_mh, NULL, mp); return; failed: diff --git a/usr/src/uts/common/io/strplumb.c b/usr/src/uts/common/io/strplumb.c index ffb7753e09..27b9cc8843 100644 --- a/usr/src/uts/common/io/strplumb.c +++ b/usr/src/uts/common/io/strplumb.c @@ -69,7 +69,7 @@ #include <sys/ddi_implfuncs.h> #include <sys/dld.h> -#include <sys/mac.h> +#include <sys/mac_client.h> /* * Debug Macros diff --git a/usr/src/uts/common/io/ural/ural.c b/usr/src/uts/common/io/ural/ural.c index 5b54d54935..b474dd8c2c 100644 --- a/usr/src/uts/common/io/ural/ural.c +++ b/usr/src/uts/common/io/ural/ural.c @@ -43,7 +43,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -295,7 +295,6 @@ static mac_callbacks_t ural_m_callbacks = { ural_m_multicst, ural_m_unicst, ural_m_tx, - NULL, /* mc_resources; */ ural_m_ioctl, NULL, /* mc_getcapab */ NULL, diff --git a/usr/src/uts/common/io/vnic/vnic_bcast.c b/usr/src/uts/common/io/vnic/vnic_bcast.c deleted file mode 100644 index 28ba800fd5..0000000000 --- a/usr/src/uts/common/io/vnic/vnic_bcast.c +++ /dev/null @@ -1,468 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/types.h> -#include <sys/sysmacros.h> -#include <sys/conf.h> -#include <sys/cmn_err.h> -#include <sys/list.h> -#include <sys/kmem.h> -#include <sys/stream.h> -#include <sys/modctl.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/atomic.h> -#include <sys/stat.h> -#include <sys/modhash.h> -#include <sys/strsubr.h> -#include <sys/strsun.h> -#include <sys/mac.h> -#include <sys/vnic.h> -#include <sys/vnic_impl.h> - -/* - * Broadcast and multicast traffic must be distributed to the VNICs - * that are defined on top of the same underlying NIC. The set of - * destinations to which a multicast packet must be sent is a subset - * of all VNICs defined on top of the same NIC. A VNIC can be member - * of more than one such subset. 
- * - * To accomodate these requirements, we introduce broadcast groups. - * A broadcast group is associated with a broadcast or multicast - * address. The members of a broadcast group consist of the VNICs - * that should received copies of packets sent to the address - * associated with the group, and are defined on top of the - * same underlying NIC. The underlying NIC is always implicetely - * part of the group. - * - * The broadcast groups defined on top of a underlying NIC are chained, - * hanging off vnic_mac_t structures. - */ - -typedef struct vnic_bcast_grp_s { - struct vnic_bcast_grp_s *vbg_next; - uint_t vbg_refs; - void *vbg_addr; - vnic_mac_t *vbg_vnic_mac; - mac_addrtype_t vbg_addrtype; - vnic_flow_t *vbg_flow_ent; - vnic_t **vbg_vnics; - uint_t vbg_nvnics; - uint_t vbg_nvnics_alloc; - uint64_t vbg_vnics_gen; -} vnic_bcast_grp_t; - -#define VNIC_BCAST_GRP_REFHOLD(grp) { \ - atomic_add_32(&(grp)->vbg_refs, 1); \ - ASSERT((grp)->vbg_refs != 0); \ -} - -#define VNIC_BCAST_GRP_REFRELE(grp) { \ - ASSERT((grp)->vbg_refs != 0); \ - membar_exit(); \ - if (atomic_add_32_nv(&(grp)->vbg_refs, -1) == 0) \ - vnic_bcast_grp_free(grp); \ -} - -static kmem_cache_t *vnic_bcast_grp_cache; - -void -vnic_bcast_init(void) -{ - vnic_bcast_grp_cache = kmem_cache_create("vnic_bcast_grp_cache", - sizeof (vnic_bcast_grp_t), 0, NULL, NULL, NULL, NULL, NULL, 0); -} - -void -vnic_bcast_fini(void) -{ - kmem_cache_destroy(vnic_bcast_grp_cache); -} - -/* - * Free the specific broadcast group. Invoked when the last reference - * to the group is released. - */ -static void -vnic_bcast_grp_free(vnic_bcast_grp_t *grp) -{ - vnic_mac_t *vnic_mac = grp->vbg_vnic_mac; - - if (grp->vbg_addrtype == MAC_ADDRTYPE_MULTICAST) { - /* - * The address is a multicast address, have the - * underlying NIC leave the multicast group. - */ - (void) mac_multicst_remove(vnic_mac->va_mh, grp->vbg_addr); - } - - ASSERT(grp->vbg_addr != NULL); - kmem_free(grp->vbg_addr, grp->vbg_vnic_mac->va_addr_len); - - ASSERT(grp->vbg_vnics != NULL); - kmem_free(grp->vbg_vnics, grp->vbg_nvnics_alloc * sizeof (vnic_t *)); - - kmem_cache_free(vnic_bcast_grp_cache, grp); -} - -void -vnic_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain) -{ - vnic_bcast_grp_t *grp = arg1; - vnic_t *sender_vnic = arg2, *vnic; - const vnic_flow_fn_info_t *fn_info; - krwlock_t *grp_lock = &grp->vbg_vnic_mac->va_bcast_grp_lock; - uint64_t gen; - uint_t i; - mblk_t *mp_chain1; - vnic_mac_t *vnic_mac; - - VNIC_BCAST_GRP_REFHOLD(grp); - rw_enter(grp_lock, RW_READER); - - if (grp->vbg_nvnics == 0) - goto bail; - vnic_mac = grp->vbg_vnics[0]->vn_vnic_mac; - - /* - * Pass a copy of the mp chain to every VNIC except the sender - * VNIC, if the packet was not received from the underlying NIC. - * - * The broadcast group lock across calls to the flow's callback - * function, since the same group could potentially be accessed - * from the same context. When the lock is reacquired, changes - * to the broadcast group while the lock was released - * are caught using a generation counter incremented each time - * the list of VNICs associated with the broadcast group - * is changed. - */ - for (i = 0; i < grp->vbg_nvnics; i++) { - vnic = grp->vbg_vnics[i]; - if (vnic == sender_vnic) - continue; - - /* - * If this consumer is in promiscuous mode then it - * will have already seen a copy of the packet. - */ - if (vnic->vn_promisc) - continue; - /* - * It is important to hold a reference on the - * flow_ent here. 
vnic_dev_delete() may be waiting - * to delete the vnic after removing it from grp. - */ - if ((mp_chain1 = vnic_copymsgchain_cksum(mp_chain)) == NULL) - break; - /* - * Fix the checksum for packets originating - * from the local machine. - */ - if ((sender_vnic != NULL) && - ((mp_chain1 = vnic_fix_cksum(mp_chain1)) == NULL)) - break; - VNIC_FLOW_REFHOLD(vnic->vn_flow_ent); - fn_info = vnic_classifier_get_fn_info(vnic->vn_flow_ent); - gen = grp->vbg_vnics_gen; - rw_exit(grp_lock); - (fn_info->ff_fn)(fn_info->ff_arg1, fn_info->ff_arg2, mp_chain1); - VNIC_FLOW_REFRELE(vnic->vn_flow_ent); - rw_enter(grp_lock, RW_READER); - - /* update stats */ - if (grp->vbg_addrtype == MAC_ADDRTYPE_MULTICAST) - vnic->vn_stat_multircv++; - else - vnic->vn_stat_brdcstrcv++; - - if (grp->vbg_vnics_gen != gen) { - /* - * The list of VNICs associated with the group - * was changed while the lock was released. - * Give up on the current packet. - */ - freemsgchain(mp_chain); - goto bail; - } - } - - if (sender_vnic != NULL) { - /* - * The packet was sent from one of the VNICs - * (vnic_active_tx()), or from the active MAC - * (vnic_active_tx()). In both cases, we need to send - * a copy of the packet to the underlying NIC so that - * it can be sent on the wire. - */ - const mac_txinfo_t *mtp = vnic_mac->va_txinfo; - mblk_t *rest; - - if ((mp_chain1 = vnic_copymsgchain_cksum(mp_chain)) != NULL) { - rw_exit(grp_lock); - rest = mtp->mt_fn(mtp->mt_arg, mp_chain1); - rw_enter(grp_lock, RW_READER); - if (rest != NULL) - freemsgchain(rest); - } - } - - if ((sender_vnic != (vnic_t *)-1) && (sender_vnic != NULL)) { - /* - * Called while sending a packet from one of the VNICs. - * Make sure the active interface gets its copy. - */ - mp_chain1 = (sender_vnic != NULL) ? vnic_fix_cksum(mp_chain) : - mp_chain; - if (mp_chain1 != NULL) { - rw_exit(grp_lock); - mac_active_rx(vnic_mac->va_mh, NULL, mp_chain1); - rw_enter(grp_lock, RW_READER); - } - } else { - freemsgchain(mp_chain); - } -bail: - rw_exit(grp_lock); - VNIC_BCAST_GRP_REFRELE(grp); -} - -/* - * Add the specified VNIC to the group corresponding to the specified - * broadcast or multicast address. - * Return 0 on success, or an errno value on failure. - */ -int -vnic_bcast_add(vnic_t *vnic, const uint8_t *addr, mac_addrtype_t addrtype) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - vnic_bcast_grp_t *grp = NULL, **last_grp; - int rc = 0; - - ASSERT(addrtype == MAC_ADDRTYPE_MULTICAST || - addrtype == MAC_ADDRTYPE_BROADCAST); - - rw_enter(&vnic_mac->va_bcast_grp_lock, RW_WRITER); - - /* - * Does a group with the specified broadcast address already - * exist for the underlying NIC? - */ - last_grp = &vnic_mac->va_bcast_grp; - for (grp = *last_grp; grp != NULL; - last_grp = &grp->vbg_next, grp = grp->vbg_next) { - if (bcmp(grp->vbg_addr, addr, vnic_mac->va_addr_len) == 0) - break; - } - - if (grp == NULL) { - /* - * The group does not yet exist, create it. - */ - grp = kmem_cache_alloc(vnic_bcast_grp_cache, KM_SLEEP); - bzero(grp, sizeof (vnic_bcast_grp_t)); - grp->vbg_next = NULL; - ASSERT(grp->vbg_refs == 0); - grp->vbg_vnic_mac = vnic_mac; - - grp->vbg_addr = kmem_zalloc(vnic_mac->va_addr_len, KM_SLEEP); - bcopy(addr, grp->vbg_addr, vnic_mac->va_addr_len); - grp->vbg_addrtype = addrtype; - - /* - * Add a new flow for the broadcast address. 
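The generation-count idea described in the removed vnic_bcast_send() comment deserves a standalone illustration: the group lock is dropped around each delivery callback, and a counter bumped on every membership change tells the sender whether its cursor over the member array is still valid. A compact userland model with invented names:

#include <pthread.h>
#include <stdint.h>

#define	GRP_MAX	8

typedef struct grp {
	pthread_rwlock_t	g_lock;
	uint64_t		g_gen;	/* bumped on membership change */
	int			g_nmembers;
	void			(*g_deliver[GRP_MAX])(int pkt);
} grp_t;

static void
grp_send(grp_t *grp, int pkt)
{
	(void) pthread_rwlock_rdlock(&grp->g_lock);
	for (int i = 0; i < grp->g_nmembers; i++) {
		void		(*fn)(int) = grp->g_deliver[i];
		uint64_t	gen = grp->g_gen;

		/* the callback must not run under the group lock */
		(void) pthread_rwlock_unlock(&grp->g_lock);
		fn(pkt);
		(void) pthread_rwlock_rdlock(&grp->g_lock);

		if (grp->g_gen != gen)
			break;		/* membership changed; give up */
	}
	(void) pthread_rwlock_unlock(&grp->g_lock);
}

A writer updating g_deliver or g_nmembers would take g_lock as writer and increment g_gen, which is what invalidates an in-flight send, just as vbg_vnics_gen does.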
- */ - grp->vbg_flow_ent = vnic_classifier_flow_create( - vnic_mac->va_addr_len, (uchar_t *)addr, grp, B_FALSE, - KM_NOSLEEP); - if (grp->vbg_flow_ent == NULL) { - rc = ENOMEM; - goto bail; - } - - /* - * When the multicast and broadcast packet is received - * by the underlying NIC, mac_rx_classify() will invoke - * vnic_bcast_send() with arg2=NULL, which will cause - * vnic_bcast_send() to send a copy of the packet(s) - * to every VNIC defined on top of the underlying MAC. - * - * When the vnic_bcast_send() function is invoked from - * the VNIC transmit path, it will specify the transmitting - * VNIC as the arg2 value, which will allow vnic_bcast_send() - * to skip that VNIC and not send it a copy of the packet. - * - * We program the classifier to dispatch matching broadcast - * packets to vnic_bcast_send(). - * We need a ring allocated for this bcast flow, so that - * later snooping of the underlying MAC uses the same scheme - * of intercepting the ring's receiver to mac_rx_promisc(). - * For the economy of hardware resources, we command the MAC - * classifier to use a soft ring for these broadcast and - * multicast flows. - */ - vnic_classifier_flow_add(vnic_mac, grp->vbg_flow_ent, - vnic_bcast_send, grp, NULL); - - /* - * For multicast addresses, have the underlying MAC - * join the corresponsing multicast group. - */ - if ((addrtype == MAC_ADDRTYPE_MULTICAST) && - ((rc = mac_multicst_add(vnic_mac->va_mh, addr)) != 0)) { - vnic_classifier_flow_remove(vnic->vn_vnic_mac, - grp->vbg_flow_ent); - vnic_classifier_flow_destroy(grp->vbg_flow_ent); - goto bail; - } - - *last_grp = grp; - } - - /* - * Add the VNIC to the list of VNICs associated with the group. - */ - if (grp->vbg_nvnics_alloc == grp->vbg_nvnics) { - vnic_t **new_vnics; - uint_t new_size = grp->vbg_nvnics+1; - - new_vnics = kmem_zalloc(new_size * sizeof (vnic_t *), - KM_SLEEP); - - if (grp->vbg_nvnics) { - ASSERT(grp->vbg_vnics != NULL); - bcopy(grp->vbg_vnics, new_vnics, grp->vbg_nvnics * - sizeof (vnic_t *)); - kmem_free(grp->vbg_vnics, grp->vbg_nvnics * - sizeof (vnic_t *)); - } - - grp->vbg_vnics = new_vnics; - grp->vbg_nvnics_alloc = new_size; - } - - grp->vbg_vnics[grp->vbg_nvnics++] = vnic; - - /* - * Since we're adding to the list of VNICs using that group, - * kick the generation count, which will allow vnic_bcast_send() - * to detect that condition. - */ - grp->vbg_vnics_gen++; - - VNIC_BCAST_GRP_REFHOLD(grp); - -bail: - if (rc != 0 && grp != NULL) { - kmem_free(grp->vbg_addr, vnic_mac->va_addr_len); - kmem_cache_free(vnic_bcast_grp_cache, grp); - } - - rw_exit(&vnic->vn_vnic_mac->va_bcast_grp_lock); - return (rc); -} - -/* - * Remove the specified VNIC from the group corresponding to - * the specific broadcast or multicast address. - * - * Note: vnic_bcast_delete() calls net_remove_flow() which - * will call cv_wait for fe_refcnt to drop to 0. So this function - * should not be called from interrupt or STREAMS context. The only - * callers are vnic_dev_delete() and vnic_m_multicst() (both of - * which are called from taskq thread context). 
- */ -void -vnic_bcast_delete(vnic_t *vnic, const uint8_t *addr) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - vnic_bcast_grp_t *grp, **prev; - uint_t i; - boolean_t removing_grp = B_FALSE; - - rw_enter(&vnic_mac->va_bcast_grp_lock, RW_WRITER); - - /* find the broadcast group */ - prev = &vnic_mac->va_bcast_grp; - for (grp = vnic_mac->va_bcast_grp; grp != NULL; prev = &grp->vbg_next, - grp = grp->vbg_next) { - if (bcmp(grp->vbg_addr, addr, vnic_mac->va_addr_len) == 0) - break; - } - ASSERT(grp != NULL); - - /* - * Remove the VNIC from the list of VNICs associated with that - * broadcast group. - * - * We keep the vbg_vnics[] always compact by repacing - * the removed vnic with the last non NULL element in that array. - */ - - for (i = 0; i < grp->vbg_nvnics; i++) { - if (grp->vbg_vnics[i] == vnic) - break; - } - - ASSERT(i < grp->vbg_nvnics); - - if (i == (grp->vbg_nvnics-1)) { - grp->vbg_vnics[i] = NULL; - } else { - grp->vbg_vnics[i] = grp->vbg_vnics[grp->vbg_nvnics-1]; - grp->vbg_vnics[grp->vbg_nvnics-1] = NULL; - } - - /* - * Since we're removing from the list of VNICs using that group, - * kick the generation count, which will allow vnic_bcast_send() - * to detect that condition. - */ - grp->vbg_vnics_gen++; - - if (--grp->vbg_nvnics == 0) { - /* - * Unlink the current group from the list of groups - * defined on top of the underlying NIC. The group - * structure will stay around until the last reference - * is dropped. - */ - *prev = grp->vbg_next; - removing_grp = B_TRUE; - } - - rw_exit(&vnic->vn_vnic_mac->va_bcast_grp_lock); - - /* - * If the group itself is being removed, remove the - * corresponding flow from the underlying NIC. - */ - if (removing_grp) { - vnic_classifier_flow_remove(vnic->vn_vnic_mac, - grp->vbg_flow_ent); - vnic_classifier_flow_destroy(grp->vbg_flow_ent); - } - - VNIC_BCAST_GRP_REFRELE(grp); -} diff --git a/usr/src/uts/common/io/vnic/vnic_cl.c b/usr/src/uts/common/io/vnic/vnic_cl.c deleted file mode 100644 index b7939f141d..0000000000 --- a/usr/src/uts/common/io/vnic/vnic_cl.c +++ /dev/null @@ -1,319 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/vnic.h> -#include <sys/vnic_impl.h> - -/* - * Virtual Network Interface Card (VNIC) classification. - * - * The VNIC implements a software classifier which is used to steer - * traffic (locally and externally generated) to the appropriate VNIC - * based on MAC addresses. 
- */ - -static kmem_cache_t *vnic_flow_cache; -static kmem_cache_t *vnic_flow_tab_cache; - -static void vnic_classifier_rx(void *, mac_resource_handle_t, mblk_t *); - -/* ARGSUSED */ -static int -vnic_classifier_flow_tab_ctor(void *buf, void *arg, int km_flag) -{ - vnic_flow_tab_t *flow_tab = buf; - - bzero(flow_tab, sizeof (vnic_flow_tab_t)); - rw_init(&flow_tab->vt_lock, NULL, RW_DRIVER, NULL); - return (0); -} - -/* ARGSUSED */ -static void -vnic_classifier_flow_tab_dtor(void *buf, void *arg) -{ - vnic_flow_tab_t *flow_tab = buf; - - rw_destroy(&flow_tab->vt_lock); -} - -/* ARGSUSED */ -static int -vnic_classifier_flow_ctor(void *buf, void *arg, int km_flag) -{ - vnic_flow_t *flow = buf; - - bzero(flow, sizeof (vnic_flow_t)); - mutex_init(&flow->vf_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&flow->vf_cv, NULL, CV_DRIVER, NULL); - return (0); -} - -/* ARGSUSED */ -static void -vnic_classifier_flow_dtor(void *buf, void *arg) -{ - vnic_flow_t *flow = buf; - - ASSERT(flow->vf_refs == 0); - mutex_destroy(&flow->vf_lock); - cv_destroy(&flow->vf_cv); -} - -void -vnic_classifier_init(void) -{ - vnic_flow_cache = kmem_cache_create("vnic_flow_cache", - sizeof (vnic_flow_t), 0, vnic_classifier_flow_ctor, - vnic_classifier_flow_dtor, NULL, NULL, NULL, 0); - vnic_flow_tab_cache = kmem_cache_create("vnic_flow_tab_cache", - sizeof (vnic_flow_tab_t), 0, vnic_classifier_flow_tab_ctor, - vnic_classifier_flow_tab_dtor, NULL, NULL, NULL, 0); -} - -void -vnic_classifier_fini(void) -{ - kmem_cache_destroy(vnic_flow_cache); - kmem_cache_destroy(vnic_flow_tab_cache); -} - -int -vnic_classifier_flow_tab_init(vnic_mac_t *vnic_mac, uint_t mac_len, - int km_flag) -{ - vnic_mac->va_flow_tab = kmem_cache_alloc(vnic_flow_tab_cache, km_flag); - if (vnic_mac->va_flow_tab == NULL) - return (ENOMEM); - vnic_mac->va_rx_hdl = mac_rx_add(vnic_mac->va_mh, vnic_classifier_rx, - vnic_mac); - vnic_mac->va_flow_tab->vt_addr_len = mac_len; - return (0); -} - -void -vnic_classifier_flow_tab_fini(vnic_mac_t *vnic_mac) -{ - vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab; - - ASSERT(flow_tab->vt_flow_list == NULL); - mac_rx_remove(vnic_mac->va_mh, vnic_mac->va_rx_hdl, B_TRUE); - kmem_cache_free(vnic_flow_tab_cache, flow_tab); - vnic_mac->va_flow_tab = NULL; -} - -vnic_flow_t * -vnic_classifier_flow_create(uint_t mac_len, uchar_t *mac_addr, - void *flow_cookie, boolean_t is_active, int km_flag) -{ - vnic_flow_t *flow; - - ASSERT(mac_len <= MAXMACADDRLEN); - - if ((flow = kmem_cache_alloc(vnic_flow_cache, km_flag)) == NULL) - return (NULL); - - flow->vf_addr_len = mac_len; - flow->vf_cookie = flow_cookie; - flow->vf_clearing = B_FALSE; - flow->vf_is_active = is_active; - bcopy(mac_addr, flow->vf_addr, mac_len); - return (flow); -} - -void -vnic_classifier_flow_destroy(vnic_flow_t *flow) -{ - kmem_cache_free(vnic_flow_cache, flow); -} - -void -vnic_classifier_flow_add(vnic_mac_t *vnic_mac, vnic_flow_t *flow, - vnic_rx_fn_t rx_fn, void *rx_arg1, void *rx_arg2) -{ - vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab; - vnic_flow_t **cur_flow; - - ASSERT(flow->vf_addr_len == flow_tab->vt_addr_len); - - /* initialize the flow structure */ - flow->vf_fn_info.ff_fn = rx_fn; - flow->vf_fn_info.ff_arg1 = rx_arg1; - flow->vf_fn_info.ff_arg2 = rx_arg2; - - /* add to the flow table */ - rw_enter(&flow_tab->vt_lock, RW_WRITER); - for (cur_flow = &flow_tab->vt_flow_list; - *cur_flow != NULL; - cur_flow = &(*cur_flow)->vf_next) - ; - *cur_flow = flow; - flow->vf_next = NULL; - rw_exit(&flow_tab->vt_lock); -} - -void 
-vnic_classifier_flow_remove(vnic_mac_t *vnic_mac, vnic_flow_t *flow) -{ - vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab; - vnic_flow_t **prev, *cur; - - /* unlink from list */ - rw_enter(&flow_tab->vt_lock, RW_WRITER); - prev = &flow_tab->vt_flow_list; - for (cur = *prev; cur != NULL && cur != flow; - prev = &cur->vf_next, cur = cur->vf_next) - ; - *prev = cur->vf_next; - rw_exit(&flow_tab->vt_lock); - - /* wait for all references to the flow to go away */ - mutex_enter(&flow->vf_lock); - flow->vf_clearing = B_TRUE; - while (flow->vf_refs > 0) - cv_wait(&flow->vf_cv, &flow->vf_lock); - mutex_exit(&flow->vf_lock); -} - -void -vnic_classifier_flow_update_addr(vnic_flow_t *flow, uchar_t *mac_addr) -{ - bcopy(mac_addr, flow->vf_addr, flow->vf_addr_len); -} - -void -vnic_classifier_flow_update_fn(vnic_flow_t *flow, vnic_rx_fn_t fn, - void *arg1, void *arg2) -{ - flow->vf_fn_info.ff_fn = fn; - flow->vf_fn_info.ff_arg1 = arg1; - flow->vf_fn_info.ff_arg2 = arg2; -} - -vnic_flow_t * -vnic_classifier_get_flow(vnic_mac_t *vnic_mac, mblk_t *mp) -{ - vnic_flow_tab_t *flow_tab = vnic_mac->va_flow_tab; - vnic_flow_t *flow; - mac_header_info_t hdr_info; - - if (mac_header_info(vnic_mac->va_mh, mp, &hdr_info) != 0) - return (NULL); - - rw_enter(&flow_tab->vt_lock, RW_READER); - for (flow = flow_tab->vt_flow_list; flow != NULL; - flow = flow->vf_next) { - if (bcmp(hdr_info.mhi_daddr, flow->vf_addr, - flow_tab->vt_addr_len) == 0) { - VNIC_FLOW_REFHOLD(flow); - break; - } - } - rw_exit(&flow_tab->vt_lock); - return (flow); -} - -void * -vnic_classifier_get_client_cookie(vnic_flow_t *flow) -{ - return (flow->vf_cookie); -} - -vnic_flow_fn_info_t * -vnic_classifier_get_fn_info(vnic_flow_t *flow) -{ - return (&flow->vf_fn_info); -} - -boolean_t -vnic_classifier_is_active(vnic_flow_t *flow) -{ - return (flow->vf_is_active); -} - -/* - * Receive function registered with the MAC layer. Classifies - * the packets, i.e. finds the flows matching the packets passed - * as argument, and invokes the callback functions associated with - * these flows. - */ -/*ARGSUSED*/ -static void -vnic_classifier_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp) -{ - vnic_mac_t *vnic_mac = arg; - vnic_flow_t *flow; - mblk_t *next_mp; - const vnic_flow_fn_info_t *fn_info; - - while (mp != NULL) { - next_mp = mp->b_next; - mp->b_next = NULL; - - vnic_promisc_rx(vnic_mac, NULL, mp); - - flow = vnic_classifier_get_flow(vnic_mac, mp); - if (flow == NULL) { - freemsg(mp); - } else { - if (flow->vf_is_active) { - /* - * Inbound packets are delivered to the - * active MAC through mac_rx() of the - * the NIC. - */ - freemsg(mp); - } else { - vnic_t *vnic; - - fn_info = vnic_classifier_get_fn_info(flow); - - /* - * If the vnic to which we would - * deliver this packet is in - * promiscuous mode then it already - * received the packet via - * vnic_promisc_rx(). - * - * XXX assumes that ff_arg2 is a - * vnic_t pointer if it is non-NULL - * (currently always true). 
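The flow removal path shown above is a reference-count quiesce: lookups take a hold on a flow, and removal unlinks it from the table, raises a clearing flag, and sleeps until all holds drain. A userland rendering of just that protocol, with illustrative names:

#include <pthread.h>

typedef struct flow {
	pthread_mutex_t	f_lock;
	pthread_cond_t	f_cv;
	int		f_refs;		/* taken by lookups */
	int		f_clearing;	/* set once removal starts */
} flow_t;

static void
flow_refrele(flow_t *f)
{
	(void) pthread_mutex_lock(&f->f_lock);
	if (--f->f_refs == 0 && f->f_clearing)
		(void) pthread_cond_broadcast(&f->f_cv);
	(void) pthread_mutex_unlock(&f->f_lock);
}

/* caller has already unlinked f from the lookup table */
static void
flow_quiesce(flow_t *f)
{
	(void) pthread_mutex_lock(&f->f_lock);
	f->f_clearing = 1;
	while (f->f_refs > 0)
		(void) pthread_cond_wait(&f->f_cv, &f->f_lock);
	(void) pthread_mutex_unlock(&f->f_lock);
	/* no lookup can now hold f; it is safe to destroy */
}

Because new lookups can no longer find the unlinked flow, the wait can only ever see the count go down, which is why the original code could safely destroy the flow afterwards.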
- */ - vnic = (vnic_t *)fn_info->ff_arg2; - if ((vnic != NULL) && vnic->vn_promisc) { - freemsg(mp); - } else { - (fn_info->ff_fn)(fn_info->ff_arg1, - fn_info->ff_arg2, mp); - } - } - VNIC_FLOW_REFRELE(flow); - } - mp = next_mp; - } -} diff --git a/usr/src/uts/common/io/vnic/vnic_ctl.c b/usr/src/uts/common/io/vnic/vnic_ctl.c index a2873c9601..d4f5554949 100644 --- a/usr/src/uts/common/io/vnic/vnic_ctl.c +++ b/usr/src/uts/common/io/vnic/vnic_ctl.c @@ -31,62 +31,35 @@ #include <sys/modctl.h> #include <sys/vnic.h> #include <sys/vnic_impl.h> -#include <inet/common.h> +#include <sys/priv_names.h> /* module description */ -#define VNIC_LINKINFO "VNIC MAC" +#define VNIC_LINKINFO "Virtual NIC" /* device info ptr, only one for instance 0 */ static dev_info_t *vnic_dip = NULL; static int vnic_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); static int vnic_attach(dev_info_t *, ddi_attach_cmd_t); static int vnic_detach(dev_info_t *, ddi_detach_cmd_t); -static dld_ioc_func_t vnic_ioc_create, vnic_ioc_modify, vnic_ioc_delete, - vnic_ioc_info; + +static int vnic_ioc_create(void *, intptr_t, int, cred_t *, int *); +static int vnic_ioc_delete(void *, intptr_t, int, cred_t *, int *); +static int vnic_ioc_info(void *, intptr_t, int, cred_t *, int *); +static int vnic_ioc_modify(void *, intptr_t, int, cred_t *, int *); static dld_ioc_info_t vnic_ioc_list[] = { - {VNIC_IOC_CREATE, DLDCOPYIN | DLDDLCONFIG, sizeof (vnic_ioc_create_t), - vnic_ioc_create}, - {VNIC_IOC_DELETE, DLDCOPYIN | DLDDLCONFIG, sizeof (vnic_ioc_delete_t), - vnic_ioc_delete}, + {VNIC_IOC_CREATE, DLDCOPYINOUT, sizeof (vnic_ioc_create_t), + vnic_ioc_create, {PRIV_SYS_DL_CONFIG}}, + {VNIC_IOC_DELETE, DLDCOPYIN, sizeof (vnic_ioc_delete_t), + vnic_ioc_delete, {PRIV_SYS_DL_CONFIG}}, {VNIC_IOC_INFO, DLDCOPYINOUT, sizeof (vnic_ioc_info_t), - vnic_ioc_info}, - {VNIC_IOC_MODIFY, DLDCOPYIN | DLDDLCONFIG, sizeof (vnic_ioc_modify_t), - vnic_ioc_modify} -}; - -static struct cb_ops vnic_cb_ops = { - nulldev, /* open */ - nulldev, /* close */ - nulldev, /* strategy */ - nulldev, /* print */ - nodev, /* dump */ - nodev, /* read */ - nodev, /* write */ - nodev, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, /* cb_prop_op */ - 0, /* streamtab */ - D_MP /* Driver compatibility flag */ + vnic_ioc_info, {NULL}}, + {VNIC_IOC_MODIFY, DLDCOPYIN, sizeof (vnic_ioc_modify_t), + vnic_ioc_modify, {PRIV_SYS_DL_CONFIG}}, }; -static struct dev_ops vnic_dev_ops = { - DEVO_REV, /* devo_rev */ - 0, /* refcnt */ - vnic_getinfo, /* get_dev_info */ - nulldev, /* identify */ - nulldev, /* probe */ - vnic_attach, /* attach */ - vnic_detach, /* detach */ - nodev, /* reset */ - &vnic_cb_ops, /* driver operations */ - NULL, /* bus operations */ - nodev, /* dev power */ - ddi_quiesce_not_supported, /* dev quiesce */ -}; +DDI_DEFINE_STREAM_OPS(vnic_dev_ops, nulldev, nulldev, vnic_attach, vnic_detach, + nodev, vnic_getinfo, D_MP, NULL, ddi_quiesce_not_supported); static struct modldrv vnic_modldrv = { &mod_driverops, /* Type of module. 
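The vnic_ioc_list[] table above hands the dld framework one record per command: the ioctl number, a copy mode, the argument size, the handler, and the privilege required. The table-driven dispatch idiom it relies on looks roughly like this in portable C (a sketch of the idiom, not the dld implementation):

#include <stddef.h>
#include <errno.h>

typedef int (*ioc_fn_t)(void *karg);

typedef struct ioc_info {
	int		ii_cmd;		/* ioctl number */
	size_t		ii_argsize;	/* expected argument size */
	ioc_fn_t	ii_fn;		/* handler */
} ioc_info_t;

static int
ioc_dispatch(const ioc_info_t *tab, size_t ntab, int cmd, void *arg,
    size_t len)
{
	for (size_t i = 0; i < ntab; i++) {
		if (tab[i].ii_cmd != cmd)
			continue;
		if (len < tab[i].ii_argsize)
			return (EINVAL);
		/*
		 * The real framework also copies the argument in and out
		 * and checks the caller's privilege before this call.
		 */
		return (tab[i].ii_fn(arg));
	}
	return (ENOTSUP);
}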
This one is a driver */ @@ -95,30 +68,32 @@ static struct modldrv vnic_modldrv = { }; static struct modlinkage modlinkage = { - MODREV_1, - &vnic_modldrv, - NULL + MODREV_1, &vnic_modldrv, NULL }; int _init(void) { - int err; + int status; mac_init_ops(&vnic_dev_ops, "vnic"); - if ((err = mod_install(&modlinkage)) != 0) + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) mac_fini_ops(&vnic_dev_ops); - return (err); + + return (status); } int _fini(void) { - int err; + int status; - if ((err = mod_remove(&modlinkage)) == 0) + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) mac_fini_ops(&vnic_dev_ops); - return (err); + + return (status); } int @@ -131,16 +106,12 @@ static void vnic_init(void) { vnic_dev_init(); - vnic_bcast_init(); - vnic_classifier_init(); } static void vnic_fini(void) { vnic_dev_fini(); - vnic_bcast_fini(); - vnic_classifier_fini(); } dev_info_t * @@ -159,7 +130,7 @@ vnic_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, *result = vnic_dip; return (DDI_SUCCESS); case DDI_INFO_DEVT2INSTANCE: - *result = 0; + *result = NULL; return (DDI_SUCCESS); } return (DDI_FAILURE); @@ -174,14 +145,12 @@ vnic_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) /* we only allow instance 0 to attach */ return (DDI_FAILURE); } - if (dld_ioc_register(VNIC_IOC, vnic_ioc_list, DLDIOCCNT(vnic_ioc_list)) != 0) return (DDI_FAILURE); vnic_dip = dip; vnic_init(); - return (DDI_SUCCESS); case DDI_RESUME: @@ -208,7 +177,6 @@ vnic_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) vnic_dip = NULL; vnic_fini(); dld_ioc_unregister(VNIC_IOC); - return (DDI_SUCCESS); case DDI_SUSPEND: @@ -220,129 +188,135 @@ vnic_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) } /* - * Process a VNIC_IOC_CREATE request. + * Process a VNICIOC_CREATE request. */ /* ARGSUSED */ static int -vnic_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred) +vnic_ioc_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { vnic_ioc_create_t *create_arg = karg; - int mac_len; + int err = 0, mac_len = 0, mac_slot; uchar_t mac_addr[MAXMACADDRLEN]; - datalink_id_t vnic_id, linkid; + uint_t mac_prefix_len; vnic_mac_addr_type_t mac_addr_type; - - /* - * VNIC link id - */ - vnic_id = create_arg->vc_vnic_id; - - /* - * Linkid of the link the VNIC is defined on top of. - */ - linkid = create_arg->vc_link_id; + vnic_ioc_diag_t diag = VNIC_IOC_DIAG_NONE; + boolean_t is_anchor = create_arg->vc_flags & VNIC_IOC_CREATE_ANCHOR; /* MAC address */ mac_addr_type = create_arg->vc_mac_addr_type; - mac_len = create_arg->vc_mac_len; + + if (is_anchor) + goto create; switch (mac_addr_type) { case VNIC_MAC_ADDR_TYPE_FIXED: + mac_len = create_arg->vc_mac_len; + /* + * Sanity check the MAC address length. vnic_dev_create() + * will perform additional checks to ensure that the + * address is a valid unicast address of the appropriate + * length. 
+ */ + if (mac_len == 0 || mac_len > MAXMACADDRLEN) { + err = EINVAL; + diag = VNIC_IOC_DIAG_MACADDRLEN_INVALID; + goto bail; + } + bcopy(create_arg->vc_mac_addr, mac_addr, MAXMACADDRLEN); + break; + case VNIC_MAC_ADDR_TYPE_FACTORY: + mac_slot = create_arg->vc_mac_slot; + /* sanity check the specified slot number */ + if (mac_slot < 0 && mac_slot != -1) { + err = EINVAL; + diag = VNIC_IOC_DIAG_MACFACTORYSLOTINVALID; + goto bail; + } + break; + case VNIC_MAC_ADDR_TYPE_AUTO: + mac_slot = -1; + /* FALLTHROUGH */ + case VNIC_MAC_ADDR_TYPE_RANDOM: + mac_prefix_len = create_arg->vc_mac_prefix_len; + if (mac_prefix_len > MAXMACADDRLEN) { + err = EINVAL; + diag = VNIC_IOC_DIAG_MACPREFIXLEN_INVALID; + goto bail; + } + mac_len = create_arg->vc_mac_len; + if (mac_len > MAXMACADDRLEN) { + err = EINVAL; + diag = VNIC_IOC_DIAG_MACADDRLEN_INVALID; + goto bail; + } bcopy(create_arg->vc_mac_addr, mac_addr, MAXMACADDRLEN); break; + case VNIC_MAC_ADDR_TYPE_PRIMARY: + /* + * We will get the primary address when we add this + * client + */ + break; default: - return (ENOTSUP); + err = ENOTSUP; + goto bail; } - return (vnic_dev_create(vnic_id, linkid, mac_len, mac_addr)); -} +create: + err = vnic_dev_create(create_arg->vc_vnic_id, create_arg->vc_link_id, + &mac_addr_type, &mac_len, mac_addr, &mac_slot, mac_prefix_len, + create_arg->vc_vid, &create_arg->vc_resource_props, + create_arg->vc_flags, &diag); + if (err != 0) + goto bail; -/* ARGSUSED */ -static int -vnic_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred) -{ - vnic_ioc_modify_t *modify_arg = karg; - datalink_id_t vnic_id; - uint_t modify_mask; - vnic_mac_addr_type_t mac_addr_type; - uint_t mac_len; - uchar_t mac_addr[MAXMACADDRLEN]; + create_arg->vc_mac_addr_type = mac_addr_type; - vnic_id = modify_arg->vm_vnic_id; - modify_mask = modify_arg->vm_modify_mask; + if (is_anchor) + goto bail; - if (modify_mask & VNIC_IOC_MODIFY_ADDR) { - mac_addr_type = modify_arg->vm_mac_addr_type; - mac_len = modify_arg->vm_mac_len; - bcopy(modify_arg->vm_mac_addr, mac_addr, MAXMACADDRLEN); + switch (mac_addr_type) { + case VNIC_MAC_ADDR_TYPE_FACTORY: + create_arg->vc_mac_slot = mac_slot; + break; + case VNIC_MAC_ADDR_TYPE_RANDOM: + bcopy(mac_addr, create_arg->vc_mac_addr, MAXMACADDRLEN); + create_arg->vc_mac_len = mac_len; + break; } - return (vnic_dev_modify(vnic_id, modify_mask, mac_addr_type, - mac_len, mac_addr)); +bail: + create_arg->vc_diag = diag; + create_arg->vc_status = err; + return (err); } /* ARGSUSED */ static int -vnic_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred) +vnic_ioc_modify(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { - vnic_ioc_delete_t *delete_arg = karg; + vnic_ioc_modify_t *modify_arg = karg; - return (vnic_dev_delete(delete_arg->vd_vnic_id)); + return (vnic_dev_modify(modify_arg->vm_vnic_id, + modify_arg->vm_modify_mask, modify_arg->vm_mac_addr_type, + modify_arg->vm_mac_len, modify_arg->vm_mac_addr, + modify_arg->vm_mac_slot, &modify_arg->vm_resource_props)); } -typedef struct vnic_ioc_info_state { - uint32_t bytes_left; - uchar_t *where; - int mode; -} vnic_ioc_info_state_t; - +/* ARGSUSED */ static int -vnic_ioc_info_new_vnic(void *arg, datalink_id_t id, - vnic_mac_addr_type_t addr_type, uint_t mac_len, uint8_t *mac_addr, - datalink_id_t linkid) +vnic_ioc_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) { - vnic_ioc_info_state_t *state = arg; - /*LINTED*/ - vnic_ioc_info_vnic_t *vn = (vnic_ioc_info_vnic_t *)state->where; - - if (state->bytes_left < sizeof (*vn)) - return 
(ENOSPC);
-
- vn->vn_vnic_id = id;
- vn->vn_link_id = linkid;
- vn->vn_mac_addr_type = addr_type;
- vn->vn_mac_len = mac_len;
- if (ddi_copyout(mac_addr, &(vn->vn_mac_addr), mac_len,
- state->mode) != 0)
- return (EFAULT);
-
- state->where += sizeof (*vn);
- state->bytes_left -= sizeof (*vn);
+ vnic_ioc_delete_t *delete_arg = karg;

- return (0);
+ return (vnic_dev_delete(delete_arg->vd_vnic_id, 0));
}

/* ARGSUSED */
static int
-vnic_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred)
+vnic_ioc_info(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
{
- vnic_ioc_info_t *info_argp = karg;
- uint32_t nvnics;
- datalink_id_t vnic_id, linkid;
- vnic_ioc_info_state_t state;
-
- /*
- * ID of the vnic to return or vnic device.
- * If zero, the call returns information
- * regarding all vnics currently defined.
- */
- vnic_id = info_argp->vi_vnic_id;
- linkid = info_argp->vi_linkid;
-
- state.bytes_left = info_argp->vi_size;
- state.where = (uchar_t *)(arg + sizeof (vnic_ioc_info_t));
- state.mode = mode;
-
- return (vnic_info(&nvnics, vnic_id, linkid, &state,
- vnic_ioc_info_new_vnic));
+ vnic_ioc_info_t *info_arg = karg;
+
+ return (vnic_info(&info_arg->vi_info));
}
diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c
index 7d98003a17..b76ddf678f 100644
--- a/usr/src/uts/common/io/vnic/vnic_dev.c
+++ b/usr/src/uts/common/io/vnic/vnic_dev.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
 */

-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
@@ -43,35 +41,50 @@
#include <sys/strsun.h>
#include <sys/dlpi.h>
#include <sys/mac.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
#include <sys/mac_ether.h>
#include <sys/dls.h>
#include <sys/pattr.h>
+#include <sys/time.h>
+#include <sys/vlan.h>
#include <sys/vnic.h>
#include <sys/vnic_impl.h>
-#include <sys/gld.h>
-#include <inet/ip.h>
+#include <sys/mac_flow_impl.h>
#include <inet/ip_impl.h>

+/*
+ * Note that for best performance, the VNIC is a passthrough design.
+ * Each VNIC corresponds to a MAC client of the underlying MAC (lower MAC).
+ * This MAC client is opened by the VNIC driver at VNIC creation,
+ * and closed when the VNIC is deleted.
+ * When a MAC client of the VNIC itself opens a VNIC, the MAC layer
+ * (upper MAC) detects that the MAC being opened is a VNIC. Instead
+ * of allocating a new MAC client, it asks the VNIC driver to return
+ * the lower MAC client handle associated with the VNIC, and that handle
+ * is returned to the upper MAC client directly. This allows upper MAC
+ * clients of the VNIC to have direct access to the lower MAC client
+ * for the control path and data path.
+ *
+ * Due to this passthrough, some of the entry points exported by the
+ * VNIC driver are never directly invoked. These entry points include
+ * vnic_m_start, vnic_m_stop, vnic_m_promisc, vnic_m_multicst, etc.
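The shape of that passthrough can be shown in a standalone sketch (every name below is invented for illustration; the real hook is the MAC_CAPAB_VNIC capability that appears further down in this patch). The upper layer queries the driver for a function that hands back the already-open lower client handle instead of allocating a new one:

    #include <stddef.h>

    typedef void *client_handle_t;

    /* What a driver would expose through the capability query. */
    typedef struct {
            void            *cv_arg;
            client_handle_t (*cv_client_handle)(void *);
    } capab_vnic_t;

    /* Driver side: return the lower client opened at creation time. */
    struct toy_vnic {
            client_handle_t lower_client;
    };

    static client_handle_t
    toy_client_handle(void *arg)
    {
            return (((struct toy_vnic *)arg)->lower_client);
    }

    /* Upper-layer side: reuse the handle rather than opening a new client. */
    static client_handle_t
    upper_open(capab_vnic_t *cap)
    {
            return (cap->cv_client_handle(cap->cv_arg));
    }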
+ */ + static int vnic_m_start(void *); static void vnic_m_stop(void *); static int vnic_m_promisc(void *, boolean_t); static int vnic_m_multicst(void *, boolean_t, const uint8_t *); static int vnic_m_unicst(void *, const uint8_t *); static int vnic_m_stat(void *, uint_t, uint64_t *); -static void vnic_m_resources(void *); +static void vnic_m_ioctl(void *, queue_t *, mblk_t *); static mblk_t *vnic_m_tx(void *, mblk_t *); static boolean_t vnic_m_capab_get(void *, mac_capab_t, void *); -static void vnic_mac_free(vnic_mac_t *); -static uint_t vnic_info_walker(mod_hash_key_t, mod_hash_val_t *, void *); static void vnic_notify_cb(void *, mac_notify_type_t); -static int vnic_modify_mac_addr(vnic_t *, uint_t, uchar_t *); -static mblk_t *vnic_active_tx(void *, mblk_t *); -static int vnic_promisc_set(vnic_t *, boolean_t); static kmem_cache_t *vnic_cache; -static kmem_cache_t *vnic_mac_cache; static krwlock_t vnic_lock; -static kmutex_t vnic_mac_lock; static uint_t vnic_count; /* hash of VNICs (vnic_t's), keyed by VNIC id */ @@ -79,39 +92,7 @@ static mod_hash_t *vnic_hash; #define VNIC_HASHSZ 64 #define VNIC_HASH_KEY(vnic_id) ((mod_hash_key_t)(uintptr_t)vnic_id) -/* - * Hash of underlying open MACs (vnic_mac_t's), keyed by the string - * "<device name><instance number>/<port number>". - */ -static mod_hash_t *vnic_mac_hash; -#define VNIC_MAC_HASHSZ 64 - -#define VNIC_MAC_REFHOLD(va) { \ - ASSERT(MUTEX_HELD(&vnic_mac_lock)); \ - (va)->va_refs++; \ - ASSERT((va)->va_refs != 0); \ -} - -#define VNIC_MAC_REFRELE(va) { \ - ASSERT(MUTEX_HELD(&vnic_mac_lock)); \ - ASSERT((va)->va_refs != 0); \ - if (--((va)->va_refs) == 0) \ - vnic_mac_free(va); \ -} - -static uchar_t vnic_brdcst_mac[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - -/* used by vnic_walker */ -typedef struct vnic_info_state { - datalink_id_t vs_vnic_id; - datalink_id_t vs_linkid; - boolean_t vs_vnic_found; - vnic_info_new_vnic_fn_t vs_new_vnic_fn; - void *vs_fn_arg; - int vs_rc; -} vnic_info_state_t; - -#define VNIC_M_CALLBACK_FLAGS (MC_RESOURCES | MC_GETCAPAB) +#define VNIC_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) static mac_callbacks_t vnic_m_callbacks = { VNIC_M_CALLBACK_FLAGS, @@ -122,54 +103,21 @@ static mac_callbacks_t vnic_m_callbacks = { vnic_m_multicst, vnic_m_unicst, vnic_m_tx, - vnic_m_resources, - NULL, /* m_ioctl */ + vnic_m_ioctl, vnic_m_capab_get }; -/* ARGSUSED */ -static int -vnic_mac_ctor(void *buf, void *arg, int kmflag) -{ - vnic_mac_t *vnic_mac = buf; - - bzero(vnic_mac, sizeof (vnic_mac_t)); - rw_init(&vnic_mac->va_bcast_grp_lock, NULL, RW_DRIVER, NULL); - rw_init(&vnic_mac->va_promisc_lock, NULL, RW_DRIVER, NULL); - - return (0); -} - -/* ARGSUSED */ -static void -vnic_mac_dtor(void *buf, void *arg) -{ - vnic_mac_t *vnic_mac = buf; - - rw_destroy(&vnic_mac->va_promisc_lock); - rw_destroy(&vnic_mac->va_bcast_grp_lock); -} - void vnic_dev_init(void) { vnic_cache = kmem_cache_create("vnic_cache", sizeof (vnic_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - vnic_mac_cache = kmem_cache_create("vnic_mac_cache", - sizeof (vnic_mac_t), 0, vnic_mac_ctor, vnic_mac_dtor, - NULL, NULL, NULL, 0); - vnic_hash = mod_hash_create_idhash("vnic_hash", VNIC_HASHSZ, mod_hash_null_valdtor); - vnic_mac_hash = mod_hash_create_idhash("vnic_mac_hash", - VNIC_MAC_HASHSZ, mod_hash_null_valdtor); - rw_init(&vnic_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&vnic_mac_lock, NULL, MUTEX_DEFAULT, NULL); - vnic_count = 0; } @@ -178,11 +126,8 @@ vnic_dev_fini(void) { ASSERT(vnic_count == 0); - mutex_destroy(&vnic_mac_lock); rw_destroy(&vnic_lock); - 
mod_hash_destroy_idhash(vnic_mac_hash); mod_hash_destroy_idhash(vnic_hash); - kmem_cache_destroy(vnic_mac_cache); kmem_cache_destroy(vnic_cache); } @@ -192,526 +137,162 @@ vnic_dev_count(void) return (vnic_count); } -static int -vnic_mac_open(datalink_id_t linkid, vnic_mac_t **vmp) -{ - int err; - vnic_mac_t *vnic_mac = NULL; - const mac_info_t *mip; - - *vmp = NULL; - - mutex_enter(&vnic_mac_lock); - - err = mod_hash_find(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid, - (mod_hash_val_t *)&vnic_mac); - if (err == 0) { - /* this MAC is already opened, increment reference count */ - VNIC_MAC_REFHOLD(vnic_mac); - mutex_exit(&vnic_mac_lock); - *vmp = vnic_mac; - return (0); - } - - vnic_mac = kmem_cache_alloc(vnic_mac_cache, KM_SLEEP); - if ((err = mac_open_by_linkid(linkid, &vnic_mac->va_mh)) != 0) { - vnic_mac->va_mh = NULL; - goto bail; - } - - /* - * For now, we do not support VNICs over legacy drivers. This will - * soon be changed. - */ - if (mac_is_legacy(vnic_mac->va_mh)) { - err = ENOTSUP; - goto bail; - } - - /* only ethernet support, for now */ - mip = mac_info(vnic_mac->va_mh); - if (mip->mi_media != DL_ETHER) { - err = ENOTSUP; - goto bail; - } - if (mip->mi_media != mip->mi_nativemedia) { - err = ENOTSUP; - goto bail; - } - - vnic_mac->va_linkid = linkid; - - /* add entry to hash table */ - err = mod_hash_insert(vnic_mac_hash, (mod_hash_key_t)(uintptr_t)linkid, - (mod_hash_val_t)vnic_mac); - ASSERT(err == 0); - - /* initialize the flow table associated with lower MAC */ - vnic_mac->va_addr_len = ETHERADDRL; - (void) vnic_classifier_flow_tab_init(vnic_mac, vnic_mac->va_addr_len, - KM_SLEEP); - - vnic_mac->va_txinfo = mac_vnic_tx_get(vnic_mac->va_mh); - vnic_mac->va_notify_hdl = mac_notify_add(vnic_mac->va_mh, - vnic_notify_cb, vnic_mac); - - VNIC_MAC_REFHOLD(vnic_mac); - *vmp = vnic_mac; - mutex_exit(&vnic_mac_lock); - return (0); - -bail: - if (vnic_mac != NULL) { - if (vnic_mac->va_mh != NULL) - mac_close(vnic_mac->va_mh); - kmem_cache_free(vnic_mac_cache, vnic_mac); +static vnic_ioc_diag_t +vnic_mac2vnic_diag(mac_diag_t diag) +{ + switch (diag) { + case MAC_DIAG_MACADDR_NIC: + return (VNIC_IOC_DIAG_MACADDR_NIC); + case MAC_DIAG_MACADDR_INUSE: + return (VNIC_IOC_DIAG_MACADDR_INUSE); + case MAC_DIAG_MACADDR_INVALID: + return (VNIC_IOC_DIAG_MACADDR_INVALID); + case MAC_DIAG_MACADDRLEN_INVALID: + return (VNIC_IOC_DIAG_MACADDRLEN_INVALID); + case MAC_DIAG_MACFACTORYSLOTINVALID: + return (VNIC_IOC_DIAG_MACFACTORYSLOTINVALID); + case MAC_DIAG_MACFACTORYSLOTUSED: + return (VNIC_IOC_DIAG_MACFACTORYSLOTUSED); + case MAC_DIAG_MACFACTORYSLOTALLUSED: + return (VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED); + case MAC_DIAG_MACFACTORYNOTSUP: + return (VNIC_IOC_DIAG_MACFACTORYNOTSUP); + case MAC_DIAG_MACPREFIX_INVALID: + return (VNIC_IOC_DIAG_MACPREFIX_INVALID); + case MAC_DIAG_MACPREFIXLEN_INVALID: + return (VNIC_IOC_DIAG_MACPREFIXLEN_INVALID); + case MAC_DIAG_MACNO_HWRINGS: + return (VNIC_IOC_DIAG_NO_HWRINGS); + default: + return (VNIC_IOC_DIAG_NONE); } - mutex_exit(&vnic_mac_lock); - return (err); } -/* - * Create a new flow for the active MAC client sharing the NIC - * with the VNICs. This allows the unicast packets for that NIC - * to be classified and passed up to the active MAC client. It - * also allows packets sent from a VNIC to the active link to - * be classified by the VNIC transmit function and delivered via - * the MAC module locally. Returns B_TRUE on success, B_FALSE on - * failure. 
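The new vnic_mac2vnic_diag() above is a one-to-one translation switch. The same mapping is sometimes written as a lookup table, which keeps future enum additions honest; a hypothetical equivalent with abbreviated enums (not the driver's actual types):

    #include <stddef.h>

    typedef enum { MD_NONE, MD_ADDR_INUSE, MD_SLOT_USED } mac_diag_t;
    typedef enum { VD_NONE, VD_ADDR_INUSE, VD_SLOT_USED } vnic_diag_t;

    static const struct {
            mac_diag_t      md;
            vnic_diag_t     vd;
    } diag_map[] = {
            { MD_ADDR_INUSE,        VD_ADDR_INUSE },
            { MD_SLOT_USED,         VD_SLOT_USED },
    };

    static vnic_diag_t
    mac2vnic_diag(mac_diag_t md)
    {
            size_t i;

            for (i = 0; i < sizeof (diag_map) / sizeof (diag_map[0]); i++) {
                    if (diag_map[i].md == md)
                            return (diag_map[i].vd);
            }
            return (VD_NONE);       /* unknown diags degrade gracefully */
    }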
- */ static int -vnic_init_active_rx(vnic_mac_t *vnic_mac) -{ - uchar_t nic_mac_addr[MAXMACADDRLEN]; - - if (vnic_mac->va_active_flow != NULL) - return (B_TRUE); - - mac_unicst_get(vnic_mac->va_mh, nic_mac_addr); - - vnic_mac->va_active_flow = vnic_classifier_flow_create( - vnic_mac->va_addr_len, nic_mac_addr, NULL, B_TRUE, KM_SLEEP); - - vnic_classifier_flow_add(vnic_mac, vnic_mac->va_active_flow, - (vnic_rx_fn_t)mac_active_rx, vnic_mac->va_mh, NULL); - return (B_TRUE); -} - -static void -vnic_fini_active_rx(vnic_mac_t *vnic_mac) -{ - if (vnic_mac->va_active_flow == NULL) - return; - - vnic_classifier_flow_remove(vnic_mac, vnic_mac->va_active_flow); - vnic_classifier_flow_destroy(vnic_mac->va_active_flow); - vnic_mac->va_active_flow = NULL; -} - -static void -vnic_update_active_rx(vnic_mac_t *vnic_mac) -{ - if (vnic_mac->va_active_flow == NULL) - return; - - vnic_fini_active_rx(vnic_mac); - (void) vnic_init_active_rx(vnic_mac); -} - -/* - * Copy an mblk, preserving its hardware checksum flags. - */ -mblk_t * -vnic_copymsg_cksum(mblk_t *mp) -{ - mblk_t *mp1; - uint32_t start, stuff, end, value, flags; - - mp1 = copymsg(mp); - if (mp1 == NULL) - return (NULL); - - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); - (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value, - flags, KM_NOSLEEP); - - return (mp1); -} - -/* - * Copy an mblk chain, presenting the hardware checksum flags of the - * individual mblks. - */ -mblk_t * -vnic_copymsgchain_cksum(mblk_t *mp) -{ - mblk_t *nmp = NULL; - mblk_t **nmpp = &nmp; - - for (; mp != NULL; mp = mp->b_next) { - if ((*nmpp = vnic_copymsg_cksum(mp)) == NULL) { - freemsgchain(nmp); - return (NULL); - } - - nmpp = &((*nmpp)->b_next); - } - - return (nmp); -} - - -/* - * Process the specified mblk chain for proper handling of hardware - * checksum offload. This routine is invoked for loopback VNIC traffic. - * The function handles a NULL mblk chain passed as argument. - */ -mblk_t * -vnic_fix_cksum(mblk_t *mp_chain) +vnic_unicast_add(vnic_t *vnic, vnic_mac_addr_type_t vnic_addr_type, + int *addr_slot, uint_t prefix_len, int *addr_len_ptr_arg, + uint8_t *mac_addr_arg, uint16_t flags, vnic_ioc_diag_t *diag, + uint16_t vid) { - mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; - uint32_t flags, start, stuff, end, value; - - for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { - uint16_t len; - uint32_t offset; - struct ether_header *ehp; - uint16_t sap; + mac_diag_t mac_diag; + uint16_t mac_flags = 0; + int err; + uint_t addr_len; - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, - &flags); - if (flags == 0) - continue; + if (flags & VNIC_IOC_CREATE_NODUPCHECK) + mac_flags |= MAC_UNICAST_NODUPCHECK; + switch (vnic_addr_type) { + case VNIC_MAC_ADDR_TYPE_FIXED: /* - * Since the processing of checksum offload for loopback - * traffic requires modification of the packet contents, - * ensure sure that we are always modifying our own copy. + * The MAC address value to assign to the VNIC + * is already provided in mac_addr_arg. addr_len_ptr_arg + * already contains the MAC address length. */ - if (DB_REF(mp) > 1) { - mp1 = copymsg(mp); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; - } + break; + case VNIC_MAC_ADDR_TYPE_RANDOM: /* - * Ethernet, and optionally VLAN header. + * Random MAC address. There are two sub-cases: + * + * 1 - If mac_len == 0, a new MAC address is generated. 
+ * The length of the MAC address to generate depends
+ * on the type of MAC used. The prefix to use for the MAC
+ * address is stored in the most significant bytes
+ * of the mac_addr argument, and its length is specified
+ * by the mac_prefix_len argument. This prefix can
+ * correspond to an IEEE OUI in the case of Ethernet,
+ * for example.
+ *
+ * 2 - If mac_len > 0, the address was already picked
+ * randomly, and is now passed back during VNIC
+ * re-creation. The mac_addr argument contains the MAC
+ * address that was generated. We distinguish this
+ * case from the fixed MAC address case, since we
+ * want userland consumers to know, when they query
+ * the list of VNICs, that a VNIC was assigned a
+ * random MAC address vs being assigned a fixed address
+ * specified by the user.
 */
- /*LINTED*/
- ehp = (struct ether_header *)mp->b_rptr;
- if (ntohs(ehp->ether_type) == VLAN_TPID) {
- struct ether_vlan_header *evhp;
-
- ASSERT(MBLKL(mp) >=
- sizeof (struct ether_vlan_header));
- /*LINTED*/
- evhp = (struct ether_vlan_header *)mp->b_rptr;
- sap = ntohs(evhp->ether_type);
- offset = sizeof (struct ether_vlan_header);
- } else {
- sap = ntohs(ehp->ether_type);
- offset = sizeof (struct ether_header);
- }
- if (MBLKL(mp) <= offset) {
- offset -= MBLKL(mp);
- if (mp->b_cont == NULL) {
- /* corrupted packet, skip it */
- if (prev != NULL)
- prev->b_next = mp->b_next;
- else
- new_chain = mp->b_next;
- mp1 = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- mp = mp1;
- continue;
- }
- mp = mp->b_cont;
- }
-
- if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) {
- ipha_t *ipha = NULL;
-
- /*
- * In order to compute the full and header
- * checksums, we need to find and parse
- * the IP and/or ULP headers.
- */
-
- sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap;
-
- /*
- * IP header.
- */
- if (sap != ETHERTYPE_IP)
- continue;
-
- ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t));
- /*LINTED*/
- ipha = (ipha_t *)(mp->b_rptr + offset);
-
- if (flags & HCK_FULLCKSUM) {
- ipaddr_t src, dst;
- uint32_t cksum;
- uint16_t *up;
- uint8_t proto;
-
- /*
- * Pointer to checksum field in ULP header.
- */
- proto = ipha->ipha_protocol;
- ASSERT(ipha->ipha_version_and_hdr_length ==
- IP_SIMPLE_HDR_VERSION);
- if (proto == IPPROTO_TCP) {
- /*LINTED*/
- up = IPH_TCPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- } else {
- ASSERT(proto == IPPROTO_UDP);
- /*LINTED*/
- up = IPH_UDPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- }
-
- /*
- * Pseudo-header checksum.
- */
- src = ipha->ipha_src;
- dst = ipha->ipha_dst;
- len = ntohs(ipha->ipha_length) -
- IP_SIMPLE_HDR_LENGTH;
-
- cksum = (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
- cksum += htons(len);
-
- /*
- * The checksum value stored in the packet needs
- * to be correct. Compute it here.
- */
- *up = 0;
- cksum += (((proto) == IPPROTO_UDP) ?
- IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP);
- cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH +
- offset, cksum);
- *(up) = (uint16_t)(cksum ?
cksum : ~cksum); - - flags |= HCK_FULLCKSUM_OK; - value = 0xffff; - } - - if (flags & HCK_IPV4_HDRCKSUM) { - ASSERT(ipha != NULL); - ipha->ipha_hdr_checksum = - (uint16_t)ip_csum_hdr(ipha); - } - } - - if (flags & HCK_PARTIALCKSUM) { - uint16_t *up, partial, cksum; - uchar_t *ipp; /* ptr to beginning of IP header */ - - if (mp->b_cont != NULL) { - mblk_t *mp1; - - mp1 = msgpullup(mp, offset + end); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; - } - - ipp = mp->b_rptr + offset; - /*LINTED*/ - up = (uint16_t *)((uchar_t *)ipp + stuff); - partial = *up; - *up = 0; - - cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, - end - start, partial); - cksum = ~cksum; - *up = cksum ? cksum : ~cksum; + /* + * If it's a pre-generated address, we're done. mac_addr_arg + * and addr_len_ptr_arg already contain the MAC address + * value and length. + */ + if (*addr_len_ptr_arg > 0) + break; - /* - * Since we already computed the whole checksum, - * indicate to the stack that it has already - * been verified by the hardware. - */ - flags &= ~HCK_PARTIALCKSUM; - flags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK); - value = 0xffff; + /* generate a new random MAC address */ + if ((err = mac_addr_random(vnic->vn_mch, + prefix_len, mac_addr_arg, &mac_diag)) != 0) { + *diag = vnic_mac2vnic_diag(mac_diag); + return (err); } + *addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh); + break; - (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, - value, flags, KM_NOSLEEP); - } - - return (new_chain); -} - -static void -vnic_mac_close(vnic_mac_t *vnic_mac) -{ - mutex_enter(&vnic_mac_lock); - VNIC_MAC_REFRELE(vnic_mac); - mutex_exit(&vnic_mac_lock); -} - -static void -vnic_mac_free(vnic_mac_t *vnic_mac) -{ - mod_hash_val_t val; - - ASSERT(MUTEX_HELD(&vnic_mac_lock)); - vnic_fini_active_rx(vnic_mac); - mac_notify_remove(vnic_mac->va_mh, vnic_mac->va_notify_hdl); - if (vnic_mac->va_mac_set) { - vnic_mac->va_mac_set = B_FALSE; - mac_vnic_clear(vnic_mac->va_mh); - } - vnic_classifier_flow_tab_fini(vnic_mac); - mac_close(vnic_mac->va_mh); - - (void) mod_hash_remove(vnic_mac_hash, - (mod_hash_key_t)(uintptr_t)vnic_mac->va_linkid, &val); - ASSERT(vnic_mac == (vnic_mac_t *)val); - - kmem_cache_free(vnic_mac_cache, vnic_mac); -} - -/* - * Initial VNIC receive routine. Invoked for packets that are steered - * to a VNIC but the VNIC has not been started yet. - */ -/* ARGSUSED */ -static void -vnic_rx_initial(void *arg1, void *arg2, mblk_t *mp_chain) -{ - vnic_t *vnic = arg1; - mblk_t *mp; - - /* update stats */ - for (mp = mp_chain; mp != NULL; mp = mp->b_next) - vnic->vn_stat_ierrors++; - freemsgchain(mp_chain); -} - -/* - * VNIC receive routine invoked after the classifier for the VNIC - * has been initialized and the VNIC has been started. - */ -/* ARGSUSED */ -void -vnic_rx(void *arg1, void *arg2, mblk_t *mp_chain) -{ - vnic_t *vnic = arg1; - mblk_t *mp; - - /* update stats */ - for (mp = mp_chain; mp != NULL; mp = mp->b_next) { - vnic->vn_stat_ipackets++; - vnic->vn_stat_rbytes += msgdsize(mp); - } - - /* pass packet up */ - mac_rx(vnic->vn_mh, NULL, mp_chain); -} - -/* - * Routine to create a MAC-based VNIC. Adds the passed MAC address - * to an unused slot in the NIC if one is available. Otherwise it - * sets the NIC in promiscuous mode and assigns the MAC address to - * a Rx ring if available or a soft ring. 
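The vnic_fix_cksum() logic removed above was doing RFC 1071 arithmetic in software for looped-back packets: sum the pseudo-header, sum the payload, then fold to 16 bits. For reference, the core computation in a self-contained form (this sketch is independent of the kernel's IP_CSUM macros):

    #include <stddef.h>
    #include <stdint.h>

    /* Fold a 32-bit accumulator into a 16-bit ones'-complement checksum. */
    static uint16_t
    csum_fold(uint32_t sum)
    {
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return ((uint16_t)~sum);
    }

    /* Sum a byte buffer in network order, starting from an initial value. */
    static uint16_t
    inet_cksum(const uint8_t *p, size_t len, uint32_t initial)
    {
            uint32_t sum = initial;

            while (len > 1) {
                    sum += ((uint32_t)p[0] << 8) | p[1];
                    p += 2;
                    len -= 2;
            }
            if (len == 1)
                    sum += (uint32_t)p[0] << 8;
            return (csum_fold(sum));
    }

The initial argument carries the pseudo-header sum, which is how the removed code seeded IP_CSUM() above.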
- */ -static int -vnic_add_unicstaddr(vnic_t *vnic, mac_multi_addr_t *maddr) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - int err; - - if (mac_unicst_verify(vnic_mac->va_mh, maddr->mma_addr, - maddr->mma_addrlen) == B_FALSE) - return (EINVAL); - - if (mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_MULTIADDRESS, - &(vnic->vn_mma_capab))) { - if (vnic->vn_maddr_naddrfree == 0) { - /* - * No free address slots available. - * Enable promiscuous mode. - */ - goto set_promisc; + case VNIC_MAC_ADDR_TYPE_FACTORY: + err = mac_addr_factory_reserve(vnic->vn_mch, addr_slot); + if (err != 0) { + if (err == EINVAL) + *diag = VNIC_IOC_DIAG_MACFACTORYSLOTINVALID; + if (err == EBUSY) + *diag = VNIC_IOC_DIAG_MACFACTORYSLOTUSED; + if (err == ENOSPC) + *diag = VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED; + return (err); } - err = vnic->vn_maddr_add(vnic->vn_maddr_handle, maddr); - if (err != 0) { - if (err == ENOSPC) { - /* - * There was a race to add addresses - * with other multiple address consumers, - * and we lost out. Use promisc mode. - */ - goto set_promisc; - } + mac_addr_factory_value(vnic->vn_lower_mh, *addr_slot, + mac_addr_arg, &addr_len, NULL, NULL); + *addr_len_ptr_arg = addr_len; + break; - return (err); + case VNIC_MAC_ADDR_TYPE_AUTO: + /* first try to allocate a factory MAC address */ + err = mac_addr_factory_reserve(vnic->vn_mch, addr_slot); + if (err == 0) { + mac_addr_factory_value(vnic->vn_lower_mh, *addr_slot, + mac_addr_arg, &addr_len, NULL, NULL); + vnic_addr_type = VNIC_MAC_ADDR_TYPE_FACTORY; + *addr_len_ptr_arg = addr_len; + break; } - vnic->vn_slot_id = maddr->mma_slot; - vnic->vn_multi_mac = B_TRUE; - } else { /* - * Either multiple MAC address support is not - * available or all available addresses have - * been used up. + * Allocating a factory MAC address failed, generate a + * random MAC address instead. */ - set_promisc: - if ((err = mac_promisc_set(vnic_mac->va_mh, B_TRUE, - MAC_DEVPROMISC)) != 0) { + if ((err = mac_addr_random(vnic->vn_mch, + prefix_len, mac_addr_arg, &mac_diag)) != 0) { + *diag = vnic_mac2vnic_diag(mac_diag); return (err); } - - vnic->vn_promisc_mac = B_TRUE; + *addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh); + vnic_addr_type = VNIC_MAC_ADDR_TYPE_RANDOM; + break; + case VNIC_MAC_ADDR_TYPE_PRIMARY: + /* + * We get the address here since we copy it in the + * vnic's vn_addr. + */ + mac_unicast_primary_get(vnic->vn_lower_mh, mac_addr_arg); + *addr_len_ptr_arg = mac_addr_len(vnic->vn_lower_mh); + mac_flags |= MAC_UNICAST_VNIC_PRIMARY; + break; } - return (err); -} -/* - * VNIC is getting deleted. Remove the MAC address from the slot. - * If promiscuous mode was being used, then unset the promiscuous mode. 
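The VNIC_MAC_ADDR_TYPE_AUTO case above is a try-then-fall-back: reserve a factory slot if one is free, otherwise degrade to a random locally-administered address. Isolated as a compilable toy (the pool and helper names are made up; the real calls are mac_addr_factory_reserve() and mac_addr_random()):

    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define ADDRLEN 6
    enum { TYPE_FACTORY, TYPE_RANDOM };

    /* Toy factory pool: two slots, slot 0 already taken. */
    static uint8_t factory_pool[2][ADDRLEN] = {
            { 0x00, 0x14, 0x4f, 0x00, 0x00, 0x01 },
            { 0x00, 0x14, 0x4f, 0x00, 0x00, 0x02 },
    };
    static int factory_used[2] = { 1, 0 };

    static int
    reserve_factory_slot(int *slot, uint8_t *addr)
    {
            int i;

            for (i = 0; i < 2; i++) {
                    if (!factory_used[i]) {
                            factory_used[i] = 1;
                            *slot = i;
                            memcpy(addr, factory_pool[i], ADDRLEN);
                            return (0);
                    }
            }
            return (ENOSPC);
    }

    static void
    random_mac(uint8_t *addr)
    {
            int i;

            for (i = 0; i < ADDRLEN; i++)
                    addr[i] = rand() & 0xff;
            addr[0] = (addr[0] | 0x02) & ~0x01; /* locally administered, unicast */
    }

    static int
    auto_assign(int *slot, uint8_t *addr, int *type)
    {
            /* Prefer a hardware-provided factory address if one is free. */
            if (reserve_factory_slot(slot, addr) == 0) {
                    *type = TYPE_FACTORY;
                    return (0);
            }
            /* Fall back to a random address, mirroring the AUTO case above. */
            random_mac(addr);
            *type = TYPE_RANDOM;
            return (0);
    }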
- */ -static int -vnic_remove_unicstaddr(vnic_t *vnic) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - int err; - - if (vnic->vn_multi_mac) { - ASSERT(vnic->vn_promisc_mac == B_FALSE); - err = vnic->vn_maddr_remove(vnic->vn_maddr_handle, - vnic->vn_slot_id); - vnic->vn_multi_mac = B_FALSE; - } + vnic->vn_addr_type = vnic_addr_type; - if (vnic->vn_promisc_mac) { - ASSERT(vnic->vn_multi_mac == B_FALSE); - err = mac_promisc_set(vnic_mac->va_mh, B_FALSE, MAC_DEVPROMISC); - vnic->vn_promisc_mac = B_FALSE; + err = mac_unicast_add(vnic->vn_mch, mac_addr_arg, mac_flags, + &vnic->vn_muh, vid, &mac_diag); + if (err != 0) { + if (vnic_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) { + /* release factory MAC address */ + mac_addr_factory_release(vnic->vn_mch, *addr_slot); + } + *diag = vnic_mac2vnic_diag(mac_diag); } return (err); @@ -721,21 +302,23 @@ vnic_remove_unicstaddr(vnic_t *vnic) * Create a new VNIC upon request from administrator. * Returns 0 on success, an errno on failure. */ +/* ARGSUSED */ int -vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len, - uchar_t *mac_addr) +vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, + vnic_mac_addr_type_t *vnic_addr_type, int *mac_len, uchar_t *mac_addr, + int *mac_slot, uint_t mac_prefix_len, uint16_t vid, + mac_resource_props_t *mrp, uint32_t flags, vnic_ioc_diag_t *diag) { - vnic_t *vnic = NULL; + vnic_t *vnic; mac_register_t *mac; int err; - vnic_mac_t *vnic_mac; - mac_multi_addr_t maddr; - mac_txinfo_t tx_info; + boolean_t is_anchor = ((flags & VNIC_IOC_CREATE_ANCHOR) != 0); + char vnic_name[MAXNAMELEN]; + const mac_info_t *minfop; + uint32_t req_hwgrp_flag = ((flags & VNIC_IOC_CREATE_REQ_HWRINGS) != 0) ? + MAC_OPEN_FLAGS_REQ_HWRINGS : 0; - if (mac_len != ETHERADDRL) { - /* currently only ethernet NICs are supported */ - return (EINVAL); - } + *diag = VNIC_IOC_DIAG_NONE; rw_enter(&vnic_lock, RW_WRITER); @@ -753,36 +336,86 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len, return (ENOMEM); } - /* open underlying MAC */ - err = vnic_mac_open(linkid, &vnic_mac); - if (err != 0) { - kmem_cache_free(vnic_cache, vnic); - rw_exit(&vnic_lock); - return (err); - } - bzero(vnic, sizeof (*vnic)); - vnic->vn_id = vnic_id; - vnic->vn_vnic_mac = vnic_mac; + vnic->vn_id = vnic_id; + vnic->vn_link_id = linkid; vnic->vn_started = B_FALSE; - vnic->vn_promisc = B_FALSE; - vnic->vn_multi_mac = B_FALSE; - vnic->vn_bcast_grp = B_FALSE; - - /* set the VNIC MAC address */ - maddr.mma_addrlen = mac_len; - maddr.mma_slot = 0; - maddr.mma_flags = 0; - bcopy(mac_addr, maddr.mma_addr, mac_len); - if ((err = vnic_add_unicstaddr(vnic, &maddr)) != 0) - goto bail; - bcopy(mac_addr, vnic->vn_addr, mac_len); - /* set the initial VNIC capabilities */ - if (!mac_vnic_capab_get(vnic_mac->va_mh, MAC_CAPAB_HCKSUM, - &vnic->vn_hcksum_txflags)) - vnic->vn_hcksum_txflags = 0; + if (!is_anchor) { + if (linkid == DATALINK_INVALID_LINKID) { + err = EINVAL; + goto bail; + } + + /* + * Open the lower MAC and assign its initial bandwidth and + * MAC address. We do this here during VNIC creation and + * do not wait until the upper MAC client open so that we + * can validate the VNIC creation parameters (bandwidth, + * MAC address, etc) and reserve a factory MAC address if + * one was requested. + */ + err = mac_open_by_linkid(linkid, &vnic->vn_lower_mh); + if (err != 0) + goto bail; + + /* + * VNIC(vlan) over VNICs(vlans) is not supported. 
+ */ + if (mac_is_vnic(vnic->vn_lower_mh)) { + err = EINVAL; + goto bail; + } + + /* only ethernet support for now */ + minfop = mac_info(vnic->vn_lower_mh); + if (minfop->mi_nativemedia != DL_ETHER) { + err = ENOTSUP; + goto bail; + } + + (void) dls_mgmt_get_linkinfo(vnic_id, vnic_name, NULL, NULL, + NULL); + err = mac_client_open(vnic->vn_lower_mh, &vnic->vn_mch, + vnic_name, MAC_OPEN_FLAGS_IS_VNIC | req_hwgrp_flag); + if (err != 0) + goto bail; + + if (mrp != NULL) { + err = mac_client_set_resources(vnic->vn_mch, mrp); + if (err != 0) + goto bail; + } + /* assign a MAC address to the VNIC */ + + err = vnic_unicast_add(vnic, *vnic_addr_type, mac_slot, + mac_prefix_len, mac_len, mac_addr, flags, diag, vid); + if (err != 0) { + vnic->vn_muh = NULL; + if (diag != NULL && req_hwgrp_flag != 0) + *diag = VNIC_IOC_DIAG_NO_HWRINGS; + goto bail; + } + + /* register to receive notification from underlying MAC */ + vnic->vn_mnh = mac_notify_add(vnic->vn_lower_mh, vnic_notify_cb, + vnic); + + *vnic_addr_type = vnic->vn_addr_type; + vnic->vn_addr_len = *mac_len; + vnic->vn_vid = vid; + + bcopy(mac_addr, vnic->vn_addr, vnic->vn_addr_len); + + if (vnic->vn_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) + vnic->vn_slot_id = *mac_slot; + + /* set the initial VNIC capabilities */ + if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_HCKSUM, + &vnic->vn_hcksum_txflags)) + vnic->vn_hcksum_txflags = 0; + } /* register with the MAC module */ if ((mac = mac_alloc(MAC_VERSION)) == NULL) @@ -795,27 +428,61 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len, mac->m_src_addr = vnic->vn_addr; mac->m_callbacks = &vnic_m_callbacks; - mac_sdu_get(vnic_mac->va_mh, &mac->m_min_sdu, &mac->m_max_sdu); + if (!is_anchor) { + /* + * If this is a VNIC based VLAN, then we check for the + * margin unless it has been created with the force + * flag. If we are configuring a VLAN over an etherstub, + * we don't check the margin even if force is not set. + */ + if (vid == 0 || (flags & VNIC_IOC_CREATE_FORCE) != 0) { + if (vid != VLAN_ID_NONE) + vnic->vn_force = B_TRUE; + /* + * As the current margin size of the underlying mac is + * used to determine the margin size of the VNIC + * itself, request the underlying mac not to change + * to a smaller margin size. + */ + err = mac_margin_add(vnic->vn_lower_mh, + &vnic->vn_margin, B_TRUE); + ASSERT(err == 0); + } else { + vnic->vn_margin = VLAN_TAGSZ; + err = mac_margin_add(vnic->vn_lower_mh, + &vnic->vn_margin, B_FALSE); + if (err != 0) { + mac_free(mac); + if (diag != NULL) + *diag = VNIC_IOC_DIAG_MACMARGIN_INVALID; + goto bail; + } + } + + mac_sdu_get(vnic->vn_lower_mh, &mac->m_min_sdu, + &mac->m_max_sdu); + } else { + vnic->vn_margin = VLAN_TAGSZ; + mac->m_min_sdu = 0; + mac->m_max_sdu = 9000; + } - /* - * As the current margin size of the underlying mac is used to - * determine the margin size of the VNIC itself, request the - * underlying mac not to change to a smaller margin size. 
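The creation path here funnels every failure through a single bail label (its cleanup appears a little further down), and the unwind releases only the handles that were actually acquired, relying on the earlier bzero() of the vnic_t to leave untouched handles NULL. The idiom, reduced to a compilable toy with stand-in open/close helpers:

    #include <errno.h>
    #include <stdlib.h>

    typedef struct res {
            void    *lower;         /* NULL until successfully opened */
            void    *client;
    } res_t;

    static void *open_lower(void)    { return (malloc(1)); }
    static void *open_client(void)   { return (malloc(1)); }
    static void close_lower(void *h)  { free(h); }
    static void close_client(void *h) { free(h); }

    static int
    res_create(res_t **out)
    {
            res_t *r;
            int err = 0;

            if ((r = calloc(1, sizeof (*r))) == NULL)
                    return (ENOMEM);
            if ((r->lower = open_lower()) == NULL) {
                    err = ENXIO;
                    goto bail;
            }
            if ((r->client = open_client()) == NULL) {
                    err = ENXIO;
                    goto bail;
            }
            *out = r;
            return (0);
    bail:
            /* Unwind in reverse order; NULL means "never acquired". */
            if (r->client != NULL)
                    close_client(r->client);
            if (r->lower != NULL)
                    close_lower(r->lower);
            free(r);
            return (err);
    }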
- */ - err = mac_margin_add(vnic_mac->va_mh, &(vnic->vn_margin), B_TRUE); - if (err != 0) - goto bail; mac->m_margin = vnic->vn_margin; + err = mac_register(mac, &vnic->vn_mh); mac_free(mac); if (err != 0) { - VERIFY(mac_margin_remove(vnic_mac->va_mh, + VERIFY(is_anchor || mac_margin_remove(vnic->vn_lower_mh, vnic->vn_margin) == 0); goto bail; } + /* Set the VNIC's MAC in the client */ + if (!is_anchor) + mac_set_upper_mac(vnic->vn_mch, vnic->vn_mh); + if ((err = dls_devnet_create(vnic->vn_mh, vnic->vn_id)) != 0) { - VERIFY(mac_margin_remove(vnic_mac->va_mh, + VERIFY(is_anchor || mac_margin_remove(vnic->vn_lower_mh, vnic->vn_margin) == 0); (void) mac_unregister(vnic->vn_mh); goto bail; @@ -829,69 +496,22 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, int mac_len, rw_exit(&vnic_lock); - /* Create a flow, initialized with the MAC address of the VNIC */ - if ((vnic->vn_flow_ent = vnic_classifier_flow_create(mac_len, mac_addr, - NULL, B_FALSE, KM_SLEEP)) == NULL) { - (void) vnic_dev_delete(vnic_id); - vnic = NULL; - err = ENOMEM; - goto bail_unlocked; - } - - vnic_classifier_flow_add(vnic_mac, vnic->vn_flow_ent, vnic_rx_initial, - vnic, vnic); - - /* setup VNIC to receive broadcast packets */ - err = vnic_bcast_add(vnic, vnic_brdcst_mac, MAC_ADDRTYPE_BROADCAST); - if (err != 0) { - (void) vnic_dev_delete(vnic_id); - vnic = NULL; - goto bail_unlocked; - } - vnic->vn_bcast_grp = B_TRUE; - - mutex_enter(&vnic_mac_lock); - if (!vnic_mac->va_mac_set) { - /* - * We want to MAC layer to call the VNIC tx outbound - * routine, so that local broadcast packets sent by - * the active interface sharing the underlying NIC (if - * any), can be broadcast to every VNIC. - */ - tx_info.mt_fn = vnic_active_tx; - tx_info.mt_arg = vnic_mac; - if (!mac_vnic_set(vnic_mac->va_mh, &tx_info, - vnic_m_capab_get, vnic)) { - mutex_exit(&vnic_mac_lock); - (void) vnic_dev_delete(vnic_id); - vnic = NULL; - err = EBUSY; - goto bail_unlocked; - } - vnic_mac->va_mac_set = B_TRUE; - } - mutex_exit(&vnic_mac_lock); - - /* allow passing packets to NIC's active MAC client */ - if (!vnic_init_active_rx(vnic_mac)) { - (void) vnic_dev_delete(vnic_id); - vnic = NULL; - err = ENOMEM; - goto bail_unlocked; - } - return (0); bail: - (void) vnic_remove_unicstaddr(vnic); - vnic_mac_close(vnic_mac); rw_exit(&vnic_lock); - -bail_unlocked: - if (vnic != NULL) { - kmem_cache_free(vnic_cache, vnic); + if (!is_anchor) { + if (vnic->vn_mnh != NULL) + (void) mac_notify_remove(vnic->vn_mnh, B_TRUE); + if (vnic->vn_muh != NULL) + (void) mac_unicast_remove(vnic->vn_mch, vnic->vn_muh); + if (vnic->vn_mch != NULL) + mac_client_close(vnic->vn_mch, MAC_CLOSE_FLAGS_IS_VNIC); + if (vnic->vn_lower_mh != NULL) + mac_close(vnic->vn_lower_mh); } + kmem_cache_free(vnic_cache, vnic); return (err); } @@ -901,11 +521,10 @@ bail_unlocked: /* ARGSUSED */ int vnic_dev_modify(datalink_id_t vnic_id, uint_t modify_mask, - vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr) + vnic_mac_addr_type_t mac_addr_type, uint_t mac_len, uchar_t *mac_addr, + uint_t mac_slot, mac_resource_props_t *mrp) { vnic_t *vnic = NULL; - int rv = 0; - boolean_t notify_mac_addr = B_FALSE; rw_enter(&vnic_lock, RW_WRITER); @@ -915,29 +534,19 @@ vnic_dev_modify(datalink_id_t vnic_id, uint_t modify_mask, return (ENOENT); } - if (modify_mask & VNIC_IOC_MODIFY_ADDR) { - rv = vnic_modify_mac_addr(vnic, mac_len, mac_addr); - if (rv == 0) - notify_mac_addr = B_TRUE; - } - rw_exit(&vnic_lock); - if (notify_mac_addr) - mac_unicst_update(vnic->vn_mh, mac_addr); - - return 
(rv); + return (0); } +/* ARGSUSED */ int -vnic_dev_delete(datalink_id_t vnic_id) +vnic_dev_delete(datalink_id_t vnic_id, uint32_t flags) { vnic_t *vnic = NULL; mod_hash_val_t val; - vnic_flow_t *flent; datalink_id_t tmpid; int rc; - vnic_mac_t *vnic_mac; rw_enter(&vnic_lock, RW_WRITER); @@ -947,7 +556,7 @@ vnic_dev_delete(datalink_id_t vnic_id) return (ENOENT); } - if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid)) != 0) { + if ((rc = dls_devnet_destroy(vnic->vn_mh, &tmpid, B_TRUE)) != 0) { rw_exit(&vnic_lock); return (rc); } @@ -957,317 +566,136 @@ vnic_dev_delete(datalink_id_t vnic_id) /* * We cannot unregister the MAC yet. Unregistering would * free up mac_impl_t which should not happen at this time. - * Packets could be entering vnic_rx() through the - * flow entry and so mac_impl_t cannot be NULL. So disable - * mac_impl_t by calling mac_disable(). This will prevent any - * new claims on mac_impl_t. + * So disable mac_impl_t by calling mac_disable(). This will prevent + * any new claims on mac_impl_t. */ - if (mac_disable(vnic->vn_mh) != 0) { + if ((rc = mac_disable(vnic->vn_mh)) != 0) { (void) dls_devnet_create(vnic->vn_mh, vnic_id); rw_exit(&vnic_lock); - return (EBUSY); + return (rc); } (void) mod_hash_remove(vnic_hash, VNIC_HASH_KEY(vnic_id), &val); ASSERT(vnic == (vnic_t *)val); - - if (vnic->vn_bcast_grp) - (void) vnic_bcast_delete(vnic, vnic_brdcst_mac); - - flent = vnic->vn_flow_ent; - if (flent != NULL) { - /* - * vnic_classifier_flow_destroy() ensures that the - * flow is no longer used. - */ - vnic_classifier_flow_remove(vnic->vn_vnic_mac, flent); - vnic_classifier_flow_destroy(flent); - } - - rc = mac_margin_remove(vnic->vn_vnic_mac->va_mh, vnic->vn_margin); - ASSERT(rc == 0); - rc = mac_unregister(vnic->vn_mh); - ASSERT(rc == 0); - (void) vnic_remove_unicstaddr(vnic); - vnic_mac = vnic->vn_vnic_mac; - kmem_cache_free(vnic_cache, vnic); vnic_count--; rw_exit(&vnic_lock); - vnic_mac_close(vnic_mac); - return (0); -} - -/* - * For the specified packet chain, return a sub-chain to be sent - * and the transmit function to be used to send the packet. Also - * return a pointer to the sub-chain of packets that should - * be re-classified. If the function returns NULL, the packet - * should be sent using the underlying NIC. - */ -static vnic_flow_t * -vnic_classify(vnic_mac_t *vnic_mac, mblk_t *mp, mblk_t **mp_chain_rest) -{ - vnic_flow_t *flow_ent; - - /* one packet at a time */ - *mp_chain_rest = mp->b_next; - mp->b_next = NULL; - - /* do classification on the packet */ - flow_ent = vnic_classifier_get_flow(vnic_mac, mp); - return (flow_ent); -} - -/* - * Send a packet chain to a local VNIC or an active MAC client. - */ -static void -vnic_local_tx(vnic_mac_t *vnic_mac, vnic_flow_t *flow_ent, mblk_t *mp_chain) -{ - mblk_t *mp1; - const vnic_flow_fn_info_t *fn_info; - vnic_t *vnic; - - if (!vnic_classifier_is_active(flow_ent) && - mac_promisc_get(vnic_mac->va_mh, MAC_PROMISC)) { - /* - * If the MAC is in promiscous mode, - * send a copy of the active client. - */ - if ((mp1 = vnic_copymsgchain_cksum(mp_chain)) == NULL) - goto sendit; - if ((mp1 = vnic_fix_cksum(mp1)) == NULL) - goto sendit; - mac_active_rx(vnic_mac->va_mh, NULL, mp1); - } -sendit: - fn_info = vnic_classifier_get_fn_info(flow_ent); /* - * If the vnic to which we would deliver this packet is in - * promiscuous mode then it already received the packet via - * vnic_promisc_rx(). - * - * XXX assumes that ff_arg2 is a vnic_t pointer if it is - * non-NULL (currently always true). 
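The delete path above is ordered deliberately: mac_disable() fails while the MAC still has consumers, the hash removal then makes the VNIC unreachable for new lookups, and only afterward are resources released. The same three-step shape in miniature (toy types, not the driver's):

    #include <errno.h>
    #include <stdlib.h>

    typedef struct obj {
            int     busy;   /* nonzero while consumers hold the object */
    } obj_t;

    static obj_t *table[64];        /* stand-in for the id hash */

    static int
    obj_delete(int id)
    {
            obj_t *o;

            if (id < 0 || id >= 64 || (o = table[id]) == NULL)
                    return (ENOENT);
            if (o->busy)
                    return (EBUSY); /* step 1: refuse while referenced */
            table[id] = NULL;       /* step 2: unhash, no new lookups */
            free(o);                /* step 3: tear down, now unreachable */
            return (0);
    }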
+ * XXX-nicolas shouldn't have a void cast here, if it's + * expected that the function will never fail, then we should + * have an ASSERT(). */ - vnic = (vnic_t *)fn_info->ff_arg2; - if ((vnic != NULL) && vnic->vn_promisc) - freemsg(mp_chain); - else if ((mp1 = vnic_fix_cksum(mp_chain)) != NULL) - (fn_info->ff_fn)(fn_info->ff_arg1, fn_info->ff_arg2, mp1); -} + (void) mac_unregister(vnic->vn_mh); -/* - * This function is invoked when a MAC client needs to send a packet - * to a NIC which is shared by VNICs. It is passed to the MAC layer - * by a call to mac_vnic_set() when the NIC is opened, and is returned - * to MAC clients by mac_tx_get() when VNICs are present. - */ -mblk_t * -vnic_active_tx(void *arg, mblk_t *mp_chain) -{ - vnic_mac_t *vnic_mac = arg; - mblk_t *mp, *extra_mp = NULL; - vnic_flow_t *flow_ent; - void *flow_cookie; - const mac_txinfo_t *mtp = vnic_mac->va_txinfo; - - for (mp = mp_chain; mp != NULL; mp = extra_mp) { - mblk_t *next; - - next = mp->b_next; - mp->b_next = NULL; - - vnic_promisc_rx(vnic_mac, (vnic_t *)-1, mp); - - flow_ent = vnic_classify(vnic_mac, mp, &extra_mp); - ASSERT(extra_mp == NULL); - extra_mp = next; - - if (flow_ent != NULL) { - flow_cookie = vnic_classifier_get_client_cookie( - flow_ent); - if (flow_cookie != NULL) { - /* - * Send a copy to every VNIC defined on the - * interface, as well as the underlying MAC. - */ - vnic_bcast_send(flow_cookie, (vnic_t *)-1, mp); - } else { - /* - * loopback the packet to a local VNIC or - * an active MAC client. - */ - vnic_local_tx(vnic_mac, flow_ent, mp); - } - VNIC_FLOW_REFRELE(flow_ent); - mp_chain = NULL; - } else { - /* - * Non-VNIC destination, send via the underlying - * NIC. In order to avoid a recursive call - * to this function, we ensured that mtp points - * to the unerlying NIC transmit function - * by inilizating through mac_vnic_tx_get(). - */ - mp_chain = mtp->mt_fn(mtp->mt_arg, mp); - if (mp_chain != NULL) - break; + if (vnic->vn_lower_mh != NULL) { + /* + * Check if MAC address for the vnic was obtained from the + * factory MAC addresses. If yes, release it. + */ + if (vnic->vn_addr_type == VNIC_MAC_ADDR_TYPE_FACTORY) { + (void) mac_addr_factory_release(vnic->vn_mch, + vnic->vn_slot_id); } + (void) mac_margin_remove(vnic->vn_lower_mh, vnic->vn_margin); + (void) mac_notify_remove(vnic->vn_mnh, B_TRUE); + (void) mac_unicast_remove(vnic->vn_mch, vnic->vn_muh); + mac_client_close(vnic->vn_mch, MAC_CLOSE_FLAGS_IS_VNIC); + mac_close(vnic->vn_lower_mh); } - if ((mp_chain != NULL) && (extra_mp != NULL)) { - ASSERT(mp_chain->b_next == NULL); - mp_chain->b_next = extra_mp; - } - return (mp_chain); + kmem_cache_free(vnic_cache, vnic); + return (0); } -/* - * VNIC transmit function. - */ +/* ARGSUSED */ mblk_t * vnic_m_tx(void *arg, mblk_t *mp_chain) { - vnic_t *vnic = arg; - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - mblk_t *mp, *extra_mp = NULL; - vnic_flow_t *flow_ent; - void *flow_cookie; - /* - * Update stats. + * This function could be invoked for an anchor VNIC when sending + * broadcast and multicast packets, and unicast packets which did + * not match any local known destination. 
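Since an anchor VNIC has no hardware underneath, the replacement transmit body that follows simply drops the chain. For readers unfamiliar with STREAMS message chains, freemsgchain() amounts to walking a b_next-linked list and freeing each message; a userland analogue:

    #include <stdlib.h>

    typedef struct msg {
            struct msg      *b_next;        /* chain linkage, as in mblk_t */
            void            *data;
    } msg_t;

    static void
    free_msg_chain(msg_t *chain)
    {
            while (chain != NULL) {
                    msg_t *next = chain->b_next;

                    free(chain->data);
                    free(chain);
                    chain = next;
            }
    }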
*/
- for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
- vnic->vn_stat_opackets++;
- vnic->vn_stat_obytes += msgdsize(mp);
- }
-
- for (mp = mp_chain; mp != NULL; mp = extra_mp) {
- mblk_t *next;
-
- next = mp->b_next;
- mp->b_next = NULL;
-
- vnic_promisc_rx(vnic->vn_vnic_mac, vnic, mp);
-
- flow_ent = vnic_classify(vnic->vn_vnic_mac, mp, &extra_mp);
- ASSERT(extra_mp == NULL);
- extra_mp = next;
-
- if (flow_ent != NULL) {
- flow_cookie = vnic_classifier_get_client_cookie(
- flow_ent);
- if (flow_cookie != NULL) {
- /*
- * The vnic_bcast_send function expects
- * to receive the sender VNIC as value
- * for arg2.
- */
- vnic_bcast_send(flow_cookie, vnic, mp);
- } else {
- /*
- * loopback the packet to a local VNIC or
- * an active MAC client.
- */
- vnic_local_tx(vnic_mac, flow_ent, mp);
- }
- VNIC_FLOW_REFRELE(flow_ent);
- mp_chain = NULL;
- } else {
- /*
- * Non-local destination, send via the underlying
- * NIC.
- */
- const mac_txinfo_t *mtp = vnic->vn_txinfo;
- mp_chain = mtp->mt_fn(mtp->mt_arg, mp);
- if (mp_chain != NULL)
- break;
- }
- }
-
- /* update stats to account for unsent packets */
- for (mp = mp_chain; mp != NULL; mp = mp->b_next) {
- vnic->vn_stat_opackets--;
- vnic->vn_stat_obytes -= msgdsize(mp);
- vnic->vn_stat_oerrors++;
- /*
- * link back in the last portion not counted due to bandwidth
- * control.
- */
- if (mp->b_next == NULL) {
- mp->b_next = extra_mp;
- break;
- }
- }
-
- return (mp_chain);
+ freemsgchain(mp_chain);
+ return (NULL);
}

-/* ARGSUSED */
+/*ARGSUSED*/
static void
-vnic_m_resources(void *arg)
+vnic_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
- /* no resources to advertise */
+ miocnak(q, mp, 0, ENOTSUP);
}

+/*
+ * This entry point cannot be passed through, since it is invoked
+ * for the per-VNIC kstats which must be exported independently
+ * of the existence of VNIC MAC clients.
+ */
static int
vnic_m_stat(void *arg, uint_t stat, uint64_t *val)
{
vnic_t *vnic = arg;
int rval = 0;

- rw_enter(&vnic_lock, RW_READER);
+ if (vnic->vn_lower_mh == NULL) {
+ /*
+ * It's an anchor VNIC, which does not have any
+ * statistics in itself.
+ */
+ return (ENOTSUP);
+ }
+
+ /*
+ * ENOTSUP must be reported for unsupported stats; the VNIC
+ * driver reports only a subset of the stats that would
+ * be returned by a real piece of hardware.
+ */
switch (stat) {
- case ETHER_STAT_LINK_DUPLEX:
- *val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
- ETHER_STAT_LINK_DUPLEX);
- break;
+ case MAC_STAT_LINK_STATE:
+ case MAC_STAT_LINK_UP:
+ case MAC_STAT_PROMISC:
case MAC_STAT_IFSPEED:
- *val = mac_stat_get(vnic->vn_vnic_mac->va_mh,
- MAC_STAT_IFSPEED);
- break;
case MAC_STAT_MULTIRCV:
- *val = vnic->vn_stat_multircv;
- break;
- case MAC_STAT_BRDCSTRCV:
- *val = vnic->vn_stat_brdcstrcv;
- break;
case MAC_STAT_MULTIXMT:
- *val = vnic->vn_stat_multixmt;
- break;
+ case MAC_STAT_BRDCSTRCV:
case MAC_STAT_BRDCSTXMT:
- *val = vnic->vn_stat_brdcstxmt;
- break;
+ case MAC_STAT_OPACKETS:
+ case MAC_STAT_OBYTES:
case MAC_STAT_IERRORS:
- *val = vnic->vn_stat_ierrors;
- break;
case MAC_STAT_OERRORS:
- *val = vnic->vn_stat_oerrors;
- break;
case MAC_STAT_RBYTES:
- *val = vnic->vn_stat_rbytes;
- break;
case MAC_STAT_IPACKETS:
- *val = vnic->vn_stat_ipackets;
- break;
- case MAC_STAT_OBYTES:
- *val = vnic->vn_stat_obytes;
- break;
- case MAC_STAT_OPACKETS:
- *val = vnic->vn_stat_opackets;
+ *val = mac_client_stat_get(vnic->vn_mch, stat);
break;
default:
rval = ENOTSUP;
}
- rw_exit(&vnic_lock);
return (rval);
}

/*
+ * Invoked by the upper MAC to retrieve the lower MAC client handle
+ * corresponding to a VNIC. A pointer to this function is obtained
+ * by the upper MAC via capability query.
+ *
+ * XXX-nicolas Note: this currently causes all VNIC MAC clients to
+ * receive the same MAC client handle for the same VNIC. This is ok
+ * as long as we have only one VNIC MAC client which sends and
+ * receives data, but we don't currently enforce this at the MAC layer.
+ */
+static void *
+vnic_mac_client_handle(void *vnic_arg)
+{
+ vnic_t *vnic = vnic_arg;
+
+ return (vnic->vn_mch);
+}
+
+
+/*
 * Return information about the specified capability.
 */
/* ARGSUSED */
@@ -1277,8 +705,6 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
vnic_t *vnic = arg;

switch (cap) {
- case MAC_CAPAB_POLL:
- return (B_TRUE);
case MAC_CAPAB_HCKSUM: {
uint32_t *hcksum_txflags = cap_data;
@@ -1287,331 +713,129 @@
HCKSUM_INET_PARTIAL);
break;
}
+ case MAC_CAPAB_VNIC: {
+ mac_capab_vnic_t *vnic_capab = cap_data;
+
+ if (vnic->vn_lower_mh == NULL) {
+ /*
+ * It's an anchor VNIC; we don't have an underlying
+ * NIC or a MAC client handle.
+ */ + return (B_FALSE); + } + + if (vnic_capab != NULL) { + vnic_capab->mcv_arg = vnic; + vnic_capab->mcv_mac_client_handle = + vnic_mac_client_handle; + } + break; + } + case MAC_CAPAB_ANCHOR_VNIC: { + /* since it's an anchor VNIC we don't have lower mac handle */ + if (vnic->vn_lower_mh == NULL) { + ASSERT(vnic->vn_link_id == 0); + return (B_TRUE); + } + return (B_FALSE); + } + case MAC_CAPAB_NO_NATIVEVLAN: + case MAC_CAPAB_NO_ZCOPY: + return (B_TRUE); default: return (B_FALSE); } return (B_TRUE); } +/* ARGSUSED */ static int vnic_m_start(void *arg) { - vnic_t *vnic = arg; - mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh; - int rc; - - rc = mac_start(lower_mh); - if (rc != 0) - return (rc); - - vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx, vnic, vnic); return (0); } +/* ARGSUSED */ static void vnic_m_stop(void *arg) { - vnic_t *vnic = arg; - mac_handle_t lower_mh = vnic->vn_vnic_mac->va_mh; - - vnic_classifier_flow_update_fn(vnic->vn_flow_ent, vnic_rx_initial, - vnic, vnic); - mac_stop(lower_mh); } /* ARGSUSED */ static int vnic_m_promisc(void *arg, boolean_t on) { - vnic_t *vnic = arg; - - return (vnic_promisc_set(vnic, on)); + return (0); } +/* ARGSUSED */ static int vnic_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) { - vnic_t *vnic = arg; - int rc = 0; - - if (add) - rc = vnic_bcast_add(vnic, addrp, MAC_ADDRTYPE_MULTICAST); - else - vnic_bcast_delete(vnic, addrp); - - return (rc); + return (0); } static int -vnic_m_unicst(void *arg, const uint8_t *mac_addr) +vnic_m_unicst(void *arg, const uint8_t *macaddr) { vnic_t *vnic = arg; - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - int rv; - rw_enter(&vnic_lock, RW_WRITER); - rv = vnic_modify_mac_addr(vnic, vnic_mac->va_addr_len, - (uchar_t *)mac_addr); - rw_exit(&vnic_lock); - - if (rv == 0) - mac_unicst_update(vnic->vn_mh, mac_addr); - return (0); + return (mac_vnic_unicast_set(vnic->vn_mch, macaddr)); } int -vnic_info(uint_t *nvnics, datalink_id_t vnic_id, datalink_id_t linkid, - void *fn_arg, vnic_info_new_vnic_fn_t new_vnic_fn) -{ - vnic_info_state_t state; - int rc = 0; - - rw_enter(&vnic_lock, RW_READER); - - *nvnics = vnic_count; - - bzero(&state, sizeof (state)); - state.vs_vnic_id = vnic_id; - state.vs_linkid = linkid; - state.vs_new_vnic_fn = new_vnic_fn; - state.vs_fn_arg = fn_arg; - - mod_hash_walk(vnic_hash, vnic_info_walker, &state); - - if ((rc = state.vs_rc) == 0 && vnic_id != DATALINK_ALL_LINKID && - !state.vs_vnic_found) - rc = ENOENT; - - rw_exit(&vnic_lock); - return (rc); -} - -/* - * Walker invoked when building a list of vnics that must be passed - * up to user space. - */ -/*ARGSUSED*/ -static uint_t -vnic_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) -{ - vnic_t *vnic; - vnic_info_state_t *state = arg; - - if (state->vs_rc != 0) - return (MH_WALK_TERMINATE); /* terminate walk */ - - vnic = (vnic_t *)val; - - if (state->vs_vnic_id != DATALINK_ALL_LINKID && - vnic->vn_id != state->vs_vnic_id) { - goto bail; - } - - state->vs_vnic_found = B_TRUE; - - state->vs_rc = state->vs_new_vnic_fn(state->vs_fn_arg, - vnic->vn_id, vnic->vn_addr_type, vnic->vn_vnic_mac->va_addr_len, - vnic->vn_addr, vnic->vn_vnic_mac->va_linkid); -bail: - return ((state->vs_rc == 0) ? MH_WALK_CONTINUE : MH_WALK_TERMINATE); -} - -/* - * vnic_notify_cb() and vnic_notify_walker() below are used to - * process events received from an underlying NIC and, if needed, - * forward these events to the VNICs defined on top of that NIC. 
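The rebuilt vnic_info() just below replaces the old walker: lookups are now by id under vnic_lock, and the fields are copied out while the lock pins the entry. The same reader discipline in a pthread sketch (toy types; the kernel uses krwlock_t and mod_hash):

    #include <errno.h>
    #include <pthread.h>
    #include <string.h>

    typedef struct entry {
            int     id;
            char    info[64];
    } entry_t;

    static pthread_rwlock_t tbl_lock = PTHREAD_RWLOCK_INITIALIZER;
    static entry_t *table[64];

    /* outlen must be at least 1. */
    static int
    info_get(int id, char *out, size_t outlen)
    {
            entry_t *e;

            pthread_rwlock_rdlock(&tbl_lock);
            if (id < 0 || id >= 64 || (e = table[id]) == NULL) {
                    pthread_rwlock_unlock(&tbl_lock);
                    return (ENOENT);
            }
            /* Copy while the lock guarantees e cannot be deleted. */
            (void) strncpy(out, e->info, outlen - 1);
            out[outlen - 1] = '\0';
            pthread_rwlock_unlock(&tbl_lock);
            return (0);
    }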
- */ - -typedef struct vnic_notify_state { - mac_notify_type_t vo_type; - vnic_mac_t *vo_vnic_mac; -} vnic_notify_state_t; - -/* ARGSUSED */ -static uint_t -vnic_notify_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg) +vnic_info(vnic_info_t *info) { - vnic_t *vnic = (vnic_t *)val; - vnic_notify_state_t *state = arg; + vnic_t *vnic; + int err; - /* ignore VNICs that don't use the specified underlying MAC */ - if (vnic->vn_vnic_mac != state->vo_vnic_mac) - return (MH_WALK_CONTINUE); + rw_enter(&vnic_lock, RW_WRITER); - switch (state->vo_type) { - case MAC_NOTE_TX: - mac_tx_update(vnic->vn_mh); - break; - case MAC_NOTE_LINK: - /* - * The VNIC link state must be up regardless of - * the link state of the underlying NIC to maintain - * connectivity between VNICs on the same host. - */ - mac_link_update(vnic->vn_mh, LINK_STATE_UP); - break; - case MAC_NOTE_UNICST: - vnic_update_active_rx(vnic->vn_vnic_mac); - break; - case MAC_NOTE_VNIC: - /* only for clients which share a NIC with a VNIC */ - break; - case MAC_NOTE_PROMISC: - mutex_enter(&vnic_mac_lock); - vnic->vn_vnic_mac->va_txinfo = mac_vnic_tx_get( - vnic->vn_vnic_mac->va_mh); - mutex_exit(&vnic_mac_lock); - break; + err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(info->vn_vnic_id), + (mod_hash_val_t *)&vnic); + if (err != 0) { + rw_exit(&vnic_lock); + return (ENOENT); } - return (MH_WALK_CONTINUE); -} - -static void -vnic_notify_cb(void *arg, mac_notify_type_t type) -{ - vnic_mac_t *vnic = arg; - vnic_notify_state_t state; + info->vn_link_id = vnic->vn_link_id; + info->vn_mac_addr_type = vnic->vn_addr_type; + info->vn_mac_len = vnic->vn_addr_len; + bcopy(vnic->vn_addr, info->vn_mac_addr, MAXMACADDRLEN); + info->vn_mac_slot = vnic->vn_slot_id; + info->vn_mac_prefix_len = 0; + info->vn_vid = vnic->vn_vid; + info->vn_force = vnic->vn_force; - state.vo_type = type; - state.vo_vnic_mac = vnic; + bzero(&info->vn_resource_props, sizeof (mac_resource_props_t)); + if (vnic->vn_mch != NULL) + mac_resource_ctl_get(vnic->vn_mch, &info->vn_resource_props); - rw_enter(&vnic_lock, RW_READER); - mod_hash_walk(vnic_hash, vnic_notify_walker, &state); rw_exit(&vnic_lock); -} - -static int -vnic_modify_mac_addr(vnic_t *vnic, uint_t mac_len, uchar_t *mac_addr) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - vnic_flow_t *vnic_flow = vnic->vn_flow_ent; - - ASSERT(RW_WRITE_HELD(&vnic_lock)); - - if (mac_len != vnic_mac->va_addr_len) - return (EINVAL); - - vnic_classifier_flow_update_addr(vnic_flow, mac_addr); return (0); } -static int -vnic_promisc_set(vnic_t *vnic, boolean_t on) -{ - vnic_mac_t *vnic_mac = vnic->vn_vnic_mac; - int r = -1; - - if (vnic->vn_promisc == on) - return (0); - - if (on) { - if ((r = mac_promisc_set(vnic_mac->va_mh, B_TRUE, - MAC_DEVPROMISC)) != 0) { - return (r); - } - - rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER); - vnic->vn_promisc_next = vnic_mac->va_promisc; - vnic_mac->va_promisc = vnic; - vnic_mac->va_promisc_gen++; - - vnic->vn_promisc = B_TRUE; - rw_exit(&vnic_mac->va_promisc_lock); - - return (0); - } else { - vnic_t *loop, *prev = NULL; - - rw_enter(&vnic_mac->va_promisc_lock, RW_WRITER); - loop = vnic_mac->va_promisc; - - while ((loop != NULL) && (loop != vnic)) { - prev = loop; - loop = loop->vn_promisc_next; - } - - if ((loop != NULL) && - ((r = mac_promisc_set(vnic_mac->va_mh, B_FALSE, - MAC_DEVPROMISC)) == 0)) { - if (prev != NULL) - prev->vn_promisc_next = loop->vn_promisc_next; - else - vnic_mac->va_promisc = loop->vn_promisc_next; - vnic_mac->va_promisc_gen++; - - vnic->vn_promisc = B_FALSE; - } - 
rw_exit(&vnic_mac->va_promisc_lock); - - return (r); - } -} - -void -vnic_promisc_rx(vnic_mac_t *vnic_mac, vnic_t *sender, mblk_t *mp) +static void +vnic_notify_cb(void *arg, mac_notify_type_t type) { - vnic_t *loop; - vnic_flow_t *flow; - const vnic_flow_fn_info_t *fn_info; - mac_header_info_t hdr_info; - boolean_t dst_must_match = B_TRUE; - - ASSERT(mp->b_next == NULL); - - rw_enter(&vnic_mac->va_promisc_lock, RW_READER); - if (vnic_mac->va_promisc == NULL) - goto done; - - if (mac_header_info(vnic_mac->va_mh, mp, &hdr_info) != 0) - goto done; + vnic_t *vnic = arg; /* - * If this is broadcast or multicast then the destination - * address need not match for us to deliver it. + * Only the VLAN VNIC needs to be notified with primary MAC + * address change. */ - if ((hdr_info.mhi_dsttype == MAC_ADDRTYPE_BROADCAST) || - (hdr_info.mhi_dsttype == MAC_ADDRTYPE_MULTICAST)) - dst_must_match = B_FALSE; - - for (loop = vnic_mac->va_promisc; - loop != NULL; - loop = loop->vn_promisc_next) { - if (loop == sender) - continue; - - if (dst_must_match && - (bcmp(hdr_info.mhi_daddr, loop->vn_addr, - sizeof (loop->vn_addr)) != 0)) - continue; - - flow = loop->vn_flow_ent; - ASSERT(flow != NULL); - - if (!flow->vf_is_active) { - mblk_t *copy; - uint64_t gen; - - if ((copy = vnic_copymsg_cksum(mp)) == NULL) - break; - if ((sender != NULL) && - ((copy = vnic_fix_cksum(copy)) == NULL)) - break; - - VNIC_FLOW_REFHOLD(flow); - gen = vnic_mac->va_promisc_gen; - rw_exit(&vnic_mac->va_promisc_lock); - - fn_info = vnic_classifier_get_fn_info(flow); - (fn_info->ff_fn)(fn_info->ff_arg1, - fn_info->ff_arg2, copy); - - VNIC_FLOW_REFRELE(flow); - rw_enter(&vnic_mac->va_promisc_lock, RW_READER); - if (vnic_mac->va_promisc_gen != gen) - break; - } + if (vnic->vn_addr_type != VNIC_MAC_ADDR_TYPE_PRIMARY) + return; + + switch (type) { + case MAC_NOTE_UNICST: + /* the unicast MAC address value */ + mac_unicast_primary_get(vnic->vn_lower_mh, vnic->vn_addr); + + /* notify its upper layer MAC about MAC address change */ + mac_unicst_update(vnic->vn_mh, (const uint8_t *)vnic->vn_addr); + break; + default: + break; } -done: - rw_exit(&vnic_mac->va_promisc_lock); } diff --git a/usr/src/uts/common/io/wpi/wpi.c b/usr/src/uts/common/io/wpi/wpi.c index 00878f64ce..bd817f22c5 100644 --- a/usr/src/uts/common/io/wpi/wpi.c +++ b/usr/src/uts/common/io/wpi/wpi.c @@ -42,7 +42,7 @@ #include <sys/modctl.h> #include <sys/devops.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_wifi.h> #include <sys/net80211.h> #include <sys/net80211_proto.h> @@ -371,7 +371,6 @@ mac_callbacks_t wpi_m_callbacks = { wpi_m_multicst, wpi_m_unicst, wpi_m_tx, - NULL, wpi_m_ioctl, NULL, NULL, diff --git a/usr/src/uts/common/io/xge/drv/xge.c b/usr/src/uts/common/io/xge/drv/xge.c index c41f82d706..6ee52f4262 100644 --- a/usr/src/uts/common/io/xge/drv/xge.c +++ b/usr/src/uts/common/io/xge/drv/xge.c @@ -65,34 +65,6 @@ ddi_device_acc_attr_t xge_dev_attr = { ddi_device_acc_attr_t *p_xge_dev_attr = &xge_dev_attr; /* - * xge_event - * - * This function called by HAL to notify upper layer that some any - * event been produced. 
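The slimmed-down vnic_notify_cb() above is a narrow observer: it ignores every notification except MAC_NOTE_UNICST, and acts only for primary-address VNICs. The callback shape as a standalone sketch (types and the refresh step are stand-ins, not the MAC layer's API):

    #include <stdio.h>

    typedef enum { NOTE_UNICST, NOTE_LINK, NOTE_TX } note_t;
    typedef enum { ADDR_FIXED, ADDR_PRIMARY } addr_type_t;

    typedef struct toy_vnic {
            addr_type_t     addr_type;
            unsigned char   addr[6];
    } toy_vnic_t;

    static void
    notify_cb(void *arg, note_t type)
    {
            toy_vnic_t *v = arg;

            /* Only primary-address VNICs track the lower MAC's address. */
            if (v->addr_type != ADDR_PRIMARY)
                    return;

            switch (type) {
            case NOTE_UNICST:
                    /* refresh v->addr from below, then notify upward */
                    (void) printf("address changed, propagating up\n");
                    break;
            default:
                    break;
            }
    }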
- */ -void -xge_event(xge_queue_item_t *item) -{ - xgell_fifo_t *fifo = item->context; - xgelldev_t *lldev = fifo->lldev; - - switch (item->event_type) { - case XGELL_EVENT_RESCHED_NEEDED: - if (lldev->is_initialized) { - if (xge_hal_channel_dtr_count(fifo->channelh) - >= XGELL_TX_LEVEL_HIGH) { - mac_tx_update(lldev->mh); - xge_debug_osdep(XGE_TRACE, "%s", - "mac_tx_update happened!"); - } - } - break; - default: - break; - } -} - -/* * xgell_callback_crit_err * * This function called by HAL on Serious Error event. XGE_HAL_EVENT_SERR. @@ -139,18 +111,6 @@ xge_xpak_alarm_log(void *userdata, xge_hal_xpak_alarm_type_e type) } /* - * xge_queue_produce context - */ -static void -xge_callback_event_queued(xge_hal_device_h devh, int event_type) -{ - if (event_type == XGELL_EVENT_RESCHED_NEEDED) { - (void) taskq_dispatch(system_taskq, xge_device_poll_now, devh, - TQ_NOSLEEP); - } -} - -/* * xge_driver_init_hal * * To initialize HAL portion of driver. @@ -167,8 +127,8 @@ xge_driver_init_hal(void) uld_callbacks.link_up = xgell_callback_link_up; uld_callbacks.link_down = xgell_callback_link_down; uld_callbacks.crit_err = xge_callback_crit_err; - uld_callbacks.event = xge_event; - uld_callbacks.event_queued = xge_callback_event_queued; + uld_callbacks.event = NULL; + uld_callbacks.event_queued = NULL; uld_callbacks.before_device_poll = NULL; uld_callbacks.after_device_poll = NULL; uld_callbacks.sched_timer = NULL; @@ -241,7 +201,6 @@ _info(struct modinfo *pModinfo) return (mod_info(&modlinkage, pModinfo)); } -/* ARGSUSED */ /* * xge_isr * @arg: pointer to device private strucutre(hldev) @@ -249,6 +208,7 @@ _info(struct modinfo *pModinfo) * This is the ISR scheduled by the OS to indicate to the * driver that the receive/transmit operation is completed. */ +/* ARGSUSED */ static uint_t xge_isr(caddr_t arg0, caddr_t arg1) { @@ -308,262 +268,263 @@ xge_ring_msix_isr(caddr_t arg0, caddr_t arg1) * Configure single ring */ static void -xge_ring_config(dev_info_t *dev_info, - xge_hal_device_config_t *device_config, int num) +xge_ring_config(dev_info_t *dev_info, xge_hal_device_config_t *device_config, + int index) { char msg[MSG_SIZE]; - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_configured", num); - device_config->ring.queue[num].configured = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_configured", index); + device_config->ring.queue[index].configured = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, - msg, num < XGELL_MAX_RING_DEFAULT ? 1 : 0); + msg, index < XGELL_RX_RING_NUM_MAX ? 
1 : 0); /* no point to configure it further if unconfigured */ - if (!device_config->ring.queue[num].configured) + if (!device_config->ring.queue[index].configured) return; #if defined(__sparc) - device_config->ring.queue[num].no_snoop_bits = 1; + device_config->ring.queue[index].no_snoop_bits = 1; #endif - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max", num); - device_config->ring.queue[num].max = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max", index); + device_config->ring.queue[index].max = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_initial", num); - device_config->ring.queue[num].initial = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_initial", index); + device_config->ring.queue[index].initial = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); - if (device_config->ring.queue[num].initial == + if (device_config->ring.queue[index].initial == XGE_HAL_DEFAULT_USE_HARDCODE) { - if (device_config->mtu > XGE_HAL_DEFAULT_MTU) { - device_config->ring.queue[num].initial = - device_config->ring.queue[num].max = - XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_J; - } else { - device_config->ring.queue[num].initial = - device_config->ring.queue[num].max = - XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_N; - } + device_config->ring.queue[index].initial = + device_config->ring.queue[index].max = + XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS; } - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_buffer_mode", num); - device_config->ring.queue[num].buffer_mode = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_buffer_mode", index); + device_config->ring.queue[index].buffer_mode = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_RING_QUEUE_BUFFER_MODE_DEFAULT); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_dram_size_mb", num); - device_config->ring.queue[num].dram_size_mb = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_dram_size_mb", index); + device_config->ring.queue[index].dram_size_mb = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); (void) xge_os_snprintf(msg, MSG_SIZE, - "ring%d_backoff_interval_us", num); - device_config->ring.queue[num].backoff_interval_us = + "ring%d_backoff_interval_us", index); + device_config->ring.queue[index].backoff_interval_us = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_BACKOFF_INTERVAL_US); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max_frm_len", num); - device_config->ring.queue[num].max_frm_len = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_max_frm_len", index); + device_config->ring.queue[index].max_frm_len = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_RING_USE_MTU); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_priority", num); - device_config->ring.queue[num].priority = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_priority", index); + device_config->ring.queue[index].priority = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RING_PRIORITY); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_a", num); - device_config->ring.queue[num].rti.urange_a = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_a", index); + device_config->ring.queue[index].rti.urange_a = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_URANGE_A); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_a", num); - 
device_config->ring.queue[num].rti.ufc_a = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_a", index); + device_config->ring.queue[index].rti.ufc_a = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_UFC_A); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_b", num); - device_config->ring.queue[num].rti.urange_b = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_b", index); + device_config->ring.queue[index].rti.urange_b = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_URANGE_B); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_b", num); - device_config->ring.queue[num].rti.ufc_b = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_b", index); + device_config->ring.queue[index].rti.ufc_b = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, device_config->mtu > XGE_HAL_DEFAULT_MTU ? XGE_HAL_DEFAULT_RX_UFC_B_J: XGE_HAL_DEFAULT_RX_UFC_B_N); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_c", num); - device_config->ring.queue[num].rti.urange_c = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_urange_c", index); + device_config->ring.queue[index].rti.urange_c = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_URANGE_C); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_c", num); - device_config->ring.queue[num].rti.ufc_c = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_c", index); + device_config->ring.queue[index].rti.ufc_c = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, device_config->mtu > XGE_HAL_DEFAULT_MTU ? XGE_HAL_DEFAULT_RX_UFC_C_J: XGE_HAL_DEFAULT_RX_UFC_C_N); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_d", num); - device_config->ring.queue[num].rti.ufc_d = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_ufc_d", index); + device_config->ring.queue[index].rti.ufc_d = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_UFC_D); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_val", num); - device_config->ring.queue[num].rti.timer_val_us = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_val", index); + device_config->ring.queue[index].rti.timer_val_us = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_TIMER_VAL); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_ac_en", num); - device_config->ring.queue[num].rti.timer_ac_en = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_timer_ac_en", index); + device_config->ring.queue[index].rti.timer_ac_en = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_RX_TIMER_AC_EN); - (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_indicate_max_pkts", num); - device_config->ring.queue[num].indicate_max_pkts = + (void) xge_os_snprintf(msg, MSG_SIZE, "ring%d_indicate_max_pkts", + index); + device_config->ring.queue[index].indicate_max_pkts = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, (device_config->bimodal_interrupts ? XGE_HAL_DEFAULT_INDICATE_MAX_PKTS_B : XGE_HAL_DEFAULT_INDICATE_MAX_PKTS_N)); - if (device_config->ring.queue[num].configured) { - /* enable RTH steering by default */ - device_config->ring.queue[num].rth_en = 1; - device_config->rth_en = XGE_HAL_RTH_ENABLE; - device_config->rth_bucket_size = XGE_HAL_MAX_RTH_BUCKET_SIZE; - device_config->rth_spdm_en = XGE_HAL_RTH_SPDM_DISABLE; - device_config->rth_spdm_use_l4 = XGE_HAL_RTH_SPDM_USE_L4; - } + /* + * Enable RTH steering on this ring when RTH is enabled device-wide.
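+ * The device-wide rth_en flag is chosen by the grouping policy in + * xge_configuration_init(); here the individual ring is only opted in.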
+ */ + if (device_config->rth_en == XGE_HAL_RTH_ENABLE) + device_config->ring.queue[index].rth_en = 1; } /* * Configure single fifo */ static void -xge_fifo_config(dev_info_t *dev_info, - xge_hal_device_config_t *device_config, int num) +xge_fifo_config(dev_info_t *dev_info, xge_hal_device_config_t *device_config, + int index) { char msg[MSG_SIZE]; - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_configured", num); - device_config->fifo.queue[num].configured = + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_configured", index); + device_config->fifo.queue[index].configured = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, - msg, num < XGELL_MAX_FIFO_DEFAULT ? 1 : 0); + msg, index < XGELL_TX_RING_NUM_MAX ? 1 : 0); /* no point to configure it further */ - if (!device_config->fifo.queue[num].configured) + if (!device_config->fifo.queue[index].configured) return; #if defined(__sparc) - device_config->fifo.queue[num].no_snoop_bits = 1; + device_config->fifo.queue[index].no_snoop_bits = 1; #endif - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_max", num); - device_config->fifo.queue[num].max = ddi_prop_get_int(DDI_DEV_T_ANY, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_max", index); + device_config->fifo.queue[index].max = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_initial", num); - device_config->fifo.queue[num].initial = ddi_prop_get_int(DDI_DEV_T_ANY, - dev_info, DDI_PROP_DONTPASS, msg, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_initial", index); + device_config->fifo.queue[index].initial = + ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_USE_HARDCODE); - if (device_config->fifo.queue[num].initial == +#if 0 + if (device_config->fifo.queue[index].initial == XGE_HAL_DEFAULT_USE_HARDCODE) { if (device_config->mtu > XGE_HAL_DEFAULT_MTU) { - device_config->fifo.queue[num].initial = - device_config->fifo.queue[num].max = + device_config->fifo.queue[index].initial = + device_config->fifo.queue[index].max = XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_J; } else { - device_config->fifo.queue[num].initial = - device_config->fifo.queue[num].max = + device_config->fifo.queue[index].initial = + device_config->fifo.queue[index].max = XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_N; } } +#else + if (device_config->fifo.queue[index].initial == + XGE_HAL_DEFAULT_USE_HARDCODE) { + device_config->fifo.queue[index].max = + device_config->fifo.queue[index].initial = + XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_A; + } +#endif - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_intr", num); - device_config->fifo.queue[num].intr = ddi_prop_get_int(DDI_DEV_T_ANY, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_intr", index); + device_config->fifo.queue[index].intr = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_FIFO_QUEUE_INTR); /* * TTI 0 configuration */ - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_enable", num); - device_config->fifo.queue[num].tti[num].enabled = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_enable", index); + device_config->fifo.queue[index].tti[index].enabled = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, 1); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_a", num); - device_config->fifo.queue[num].tti[num].urange_a = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_a", index); + device_config->fifo.queue[index].tti[index].urange_a = ddi_prop_get_int( 
DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_URANGE_A); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_a", num); - device_config->fifo.queue[num].tti[num].ufc_a = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_a", index); + device_config->fifo.queue[index].tti[index].ufc_a = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_UFC_A); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_b", num); - device_config->fifo.queue[num].tti[num].urange_b = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_b", index); + device_config->fifo.queue[index].tti[index].urange_b = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_URANGE_B); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_b", num); - device_config->fifo.queue[num].tti[num].ufc_b = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_b", index); + device_config->fifo.queue[index].tti[index].ufc_b = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_UFC_B); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_c", num); - device_config->fifo.queue[num].tti[num].urange_c = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_urange_c", index); + device_config->fifo.queue[index].tti[index].urange_c = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_URANGE_C); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_c", num); - device_config->fifo.queue[num].tti[num].ufc_c = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_c", index); + device_config->fifo.queue[index].tti[index].ufc_c = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_UFC_C); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_d", num); - device_config->fifo.queue[num].tti[num].ufc_d = ddi_prop_get_int( + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_ufc_d", index); + device_config->fifo.queue[index].tti[index].ufc_d = ddi_prop_get_int( DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_UFC_D); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_timer_ac_en", num); - device_config->fifo.queue[num].tti[num].timer_ac_en = ddi_prop_get_int( - DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_timer_ac_en", index); + device_config->fifo.queue[index].tti[index].timer_ac_en = + ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_TIMER_AC_EN); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_val", num); - device_config->fifo.queue[num].tti[num].timer_val_us = ddi_prop_get_int( - DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_val", index); + device_config->fifo.queue[index].tti[index].timer_val_us = + ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_TIMER_VAL); - (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_ci_en", num); - device_config->fifo.queue[num].tti[num].timer_ci_en = ddi_prop_get_int( - DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, + (void) xge_os_snprintf(msg, MSG_SIZE, "fifo%d_tti_timer_ci_en", index); + device_config->fifo.queue[index].tti[index].timer_ci_en = + ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, msg, XGE_HAL_DEFAULT_TX_TIMER_CI_EN); } @@ -577,11 +538,57 @@ xge_fifo_config(dev_info_t *dev_info, 
*/ static void xge_configuration_init(dev_info_t *dev_info, - xge_hal_device_config_t *device_config, xgell_config_t *ll_config) + xge_hal_device_config_t *device_config, xgell_config_t *xgell_config) { int i, rings_configured = 0, fifos_configured = 0; /* + * Initialize link layer configuration first + */ + xgell_config->rx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "rx_dma_lowat", XGELL_RX_DMA_LOWAT); + xgell_config->rx_pkt_burst = ddi_prop_get_int(DDI_DEV_T_ANY, + dev_info, DDI_PROP_DONTPASS, "rx_pkt_burst", XGELL_RX_PKT_BURST); + xgell_config->tx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "tx_dma_lowat", XGELL_TX_DMA_LOWAT); + xgell_config->lso_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "lso_enable", XGELL_CONF_ENABLE_BY_DEFAULT); + xgell_config->msix_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "msix_enable", XGELL_CONF_ENABLE_BY_DEFAULT); + + xgell_config->grouping = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, + DDI_PROP_DONTPASS, "grouping", XGELL_CONF_GROUP_POLICY_DEFAULT); + + switch (xgell_config->grouping) { + case XGELL_CONF_GROUP_POLICY_VIRT: + /* + * Enable layer 2 steering for better virtualization + */ + device_config->rth_en = XGE_HAL_RTH_DISABLE; + device_config->rts_mac_en = XGE_HAL_RTS_MAC_ENABLE; + break; + case XGELL_CONF_GROUP_POLICY_PERF: + /* + * Configure layer 4 RTH to hashing inbound traffic + */ + device_config->rth_en = XGE_HAL_RTH_ENABLE; + device_config->rth_bucket_size = XGE_HAL_MAX_RTH_BUCKET_SIZE; + device_config->rth_spdm_en = XGE_HAL_RTH_SPDM_DISABLE; + device_config->rth_spdm_use_l4 = XGE_HAL_RTH_SPDM_USE_L4; + + device_config->rts_mac_en = XGE_HAL_RTS_MAC_DISABLE; + break; + case XGELL_CONF_GROUP_POLICY_BASIC: + default: + /* + * Disable both RTS and RTH for single ring configuration + */ + device_config->rth_en = XGE_HAL_RTH_DISABLE; + device_config->rts_mac_en = XGE_HAL_RTS_MAC_DISABLE; + break; + } + + /* * Initialize common properties */ device_config->mtu = ddi_prop_get_int(DDI_DEV_T_ANY, @@ -634,12 +641,6 @@ xge_configuration_init(dev_info_t *dev_info, XGE_HAL_DEFAULT_BIMODAL_TIMER_HI_US); /* - * MSI-X switch - */ - ll_config->msix_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, - DDI_PROP_DONTPASS, "msix_enable", XGELL_CONF_ENABLE_BY_DEFAULT); - - /* * Go through all possibly configured rings. Each ring could be * configured individually. To enable/disable specific ring, just * set ring->configured = [1|0]. 
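 * For example, a hypothetical xge.conf fragment built from the * properties read above could turn ring 7 off and cap ring 0: * ring7_configured = 0; * ring0_max = 16;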
@@ -740,30 +741,20 @@ xge_configuration_init(dev_info_t *dev_info, XGE_HAL_DEFAULT_LRO_FRM_LEN); /* - * Initialize link layer configuration + * Initialize other link layer configuration first */ - ll_config->rx_buffer_total = ddi_prop_get_int(DDI_DEV_T_ANY, + xgell_config->rx_buffer_total = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, "rx_buffer_total", - device_config->ring.queue[XGELL_RING_MAIN_QID].initial * + device_config->ring.queue[XGELL_RX_RING_MAIN].initial * XGELL_RX_BUFFER_TOTAL); - ll_config->rx_buffer_total += XGELL_RX_BUFFER_RECYCLE_CACHE; - ll_config->rx_buffer_post_hiwat = ddi_prop_get_int(DDI_DEV_T_ANY, + xgell_config->rx_buffer_total += XGELL_RX_BUFFER_RECYCLE_CACHE; + xgell_config->rx_buffer_post_hiwat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, DDI_PROP_DONTPASS, "rx_buffer_post_hiwat", - device_config->ring.queue[XGELL_RING_MAIN_QID].initial * + device_config->ring.queue[XGELL_RX_RING_MAIN].initial * XGELL_RX_BUFFER_POST_HIWAT); - ll_config->rx_buffer_post_hiwat += XGELL_RX_BUFFER_RECYCLE_CACHE; - ll_config->rx_pkt_burst = ddi_prop_get_int(DDI_DEV_T_ANY, - dev_info, DDI_PROP_DONTPASS, "rx_pkt_burst", - XGELL_RX_PKT_BURST); - ll_config->rx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, - DDI_PROP_DONTPASS, "rx_dma_lowat", XGELL_RX_DMA_LOWAT); - ll_config->tx_dma_lowat = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, - DDI_PROP_DONTPASS, "tx_dma_lowat", XGELL_TX_DMA_LOWAT); - ll_config->lso_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dev_info, - DDI_PROP_DONTPASS, "lso_enable", XGELL_CONF_ENABLE_BY_DEFAULT); + xgell_config->rx_buffer_post_hiwat += XGELL_RX_BUFFER_RECYCLE_CACHE; } - /* * xge_alloc_intrs: * @@ -847,6 +838,7 @@ _err_exit2: } _err_exit1: kmem_free(lldev->intr_table, lldev->intr_table_size); + lldev->intr_table = NULL; _err_exit0: if (lldev->intr_type == DDI_INTR_TYPE_MSIX) (void) ddi_prop_remove(DDI_DEV_T_NONE, dip, "#msix-request"); @@ -869,6 +861,7 @@ xge_free_intrs(xgelldev_t *lldev) (void) ddi_intr_free(lldev->intr_table[i]); } kmem_free(lldev->intr_table, lldev->intr_table_size); + lldev->intr_table = NULL; if (lldev->intr_type == DDI_INTR_TYPE_MSIX) (void) ddi_prop_remove(DDI_DEV_T_NONE, dip, "#msix-request"); @@ -889,9 +882,10 @@ xge_add_intrs(xgelldev_t *lldev) xge_hal_fifo_config_t *fifo_conf = &hal_conf->fifo; xge_list_t *item; int msix_idx = 1; /* 0 by default is reserved for Alarms. 
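 * Ring and fifo channels are then bound to vectors 1..n in the * order they are walked below.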
*/ - xge_hal_channel_t *assigned[XGELL_MAX_RING_DEFAULT + - XGELL_MAX_FIFO_DEFAULT + 1]; + xge_hal_channel_t *assigned[XGELL_RX_RING_NUM_MAX + + XGELL_TX_RING_NUM_MAX + 1]; + xge_assert(lldev->intr_table != NULL); switch (lldev->intr_type) { case DDI_INTR_TYPE_FIXED: ret = ddi_intr_add_handler(lldev->intr_table[0], @@ -1054,6 +1048,8 @@ xge_rem_intrs(xgelldev_t *lldev) { int i; + xge_assert(lldev->intr_table != NULL); + /* Call ddi_intr_remove_handler() */ for (i = 0; i < lldev->intr_cnt; i++) { (void) ddi_intr_remove_handler(lldev->intr_table[i]); @@ -1079,11 +1075,11 @@ static int xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) { xgelldev_t *ll; + xgell_config_t *xgell_config; xge_hal_device_config_t *device_config; xge_hal_device_t *hldev; xge_hal_device_attr_t attr; xge_hal_status_e status; - xgell_config_t ll_config; int ret, intr_types, i; xge_debug_osdep(XGE_TRACE, "XGE_ATTACH cmd %d", cmd); @@ -1104,10 +1100,13 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) goto _exit0; } + xgell_config = kmem_zalloc(sizeof (xgell_config_t), KM_SLEEP); device_config = kmem_zalloc(sizeof (xge_hal_device_config_t), KM_SLEEP); - /* Init device_config by lookup up properties from .conf file */ - xge_configuration_init(dev_info, device_config, &ll_config); + /* + * Initialize all configurations + */ + xge_configuration_init(dev_info, device_config, xgell_config); /* Determine which types of interrupts supported */ ret = ddi_intr_get_supported_types(dev_info, &intr_types); @@ -1161,7 +1160,34 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) goto _exit3; } - if (ll_config.msix_enable && intr_types & DDI_INTR_TYPE_MSIX) { + /* + * Init multiple rings configuration + */ + switch (xgell_config->grouping) { + case XGELL_CONF_GROUP_POLICY_VIRT: + ll->init_rx_rings = XGELL_RX_RING_NUM_MAX; /* 8 */ + ll->init_tx_rings = XGELL_TX_RING_NUM_MAX; /* 8 */ + ll->init_rx_groups = ll->init_rx_rings; + break; + case XGELL_CONF_GROUP_POLICY_PERF: + ll->init_rx_rings = XGELL_RX_RING_NUM_MAX; /* 8 */ + ll->init_tx_rings = XGELL_TX_RING_NUM_MAX; /* 8 */ + ll->init_rx_groups = 1; + break; + case XGELL_CONF_GROUP_POLICY_BASIC: + ll->init_rx_rings = XGELL_RX_RING_NUM_MIN; /* 1 */ + ll->init_tx_rings = XGELL_TX_RING_NUM_MIN; /* 1 */ + ll->init_rx_groups = ll->init_rx_rings; + break; + default: + ASSERT(0); + break; + } + + /* + * Init MSI-X configuration + */ + if (xgell_config->msix_enable && intr_types & DDI_INTR_TYPE_MSIX) { ll->intr_type = DDI_INTR_TYPE_MSIX; ll->intr_cnt = 1; for (i = 0; i < XGE_HAL_MAX_FIFO_NUM; i++) @@ -1175,9 +1201,12 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) ll->intr_cnt = 1; } + /* + * Allocate interrupt(s) + */ while ((ret = xge_alloc_intrs(ll)) != DDI_SUCCESS) { if (ll->intr_type == DDI_INTR_TYPE_MSIX) { - ll_config.msix_enable = 0; + xgell_config->msix_enable = 0; ll->intr_type = DDI_INTR_TYPE_FIXED; ll->intr_cnt = 1; device_config->intr_mode = XGE_HAL_INTR_MODE_IRQLINE; @@ -1231,7 +1260,7 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) goto _exit4; /* allocate and register Link Layer */ - ret = xgell_device_register(ll, &ll_config); + ret = xgell_device_register(ll, xgell_config); if (ret != DDI_SUCCESS) { goto _exit5; } @@ -1240,6 +1269,7 @@ xge_attach(dev_info_t *dev_info, ddi_attach_cmd_t cmd) xge_hal_device_private_set(hldev, ll); kmem_free(device_config, sizeof (xge_hal_device_config_t)); + kmem_free(xgell_config, sizeof (xgell_config_t)); return (DDI_SUCCESS); @@ -1263,6 +1293,7 @@ _exit1: ddi_regs_map_free(&attr.regh0); _exit0a: 
kmem_free(device_config, sizeof (xge_hal_device_config_t)); + kmem_free(xgell_config, sizeof (xgell_config_t)); _exit0: return (ret); } @@ -1298,7 +1329,7 @@ xge_quiesce(dev_info_t *dev_info) * This function is called by OS when the system is about * to shutdown or when the super user tries to unload * the driver. This function frees all the memory allocated - * during xge_attch() and also unregisters the Xframe + * during xge_attach() and also unregisters the Xframe * device instance from the GLD framework. */ static int diff --git a/usr/src/uts/common/io/xge/drv/xge_osdep.h b/usr/src/uts/common/io/xge/drv/xge_osdep.h index 18923972ee..4b09b0f983 100644 --- a/usr/src/uts/common/io/xge/drv/xge_osdep.h +++ b/usr/src/uts/common/io/xge/drv/xge_osdep.h @@ -37,8 +37,6 @@ #ifndef _SYS_XGE_OSDEP_H #define _SYS_XGE_OSDEP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/varargs.h> diff --git a/usr/src/uts/common/io/xge/drv/xgell.c b/usr/src/uts/common/io/xge/drv/xgell.c index 85db35ddcc..4ec1117750 100644 --- a/usr/src/uts/common/io/xge/drv/xgell.c +++ b/usr/src/uts/common/io/xge/drv/xgell.c @@ -24,10 +24,8 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* - * Copyright (c) 2002-2005 Neterion, Inc. + * Copyright (c) 2002-2008 Neterion, Inc. * All right Reserved. * * FileName : xgell.c @@ -100,9 +98,7 @@ static int xgell_m_start(void *); static void xgell_m_stop(void *); static int xgell_m_promisc(void *, boolean_t); static int xgell_m_multicst(void *, boolean_t, const uint8_t *); -static int xgell_m_unicst(void *, const uint8_t *); static void xgell_m_ioctl(void *, queue_t *, mblk_t *); -static mblk_t *xgell_m_tx(void *, mblk_t *); static boolean_t xgell_m_getcapab(void *, mac_capab_t, void *); #define XGELL_M_CALLBACK_FLAGS (MC_IOCTL | MC_GETCAPAB) @@ -114,8 +110,7 @@ static mac_callbacks_t xgell_m_callbacks = { xgell_m_stop, xgell_m_promisc, xgell_m_multicst, - xgell_m_unicst, - xgell_m_tx, + NULL, NULL, xgell_m_ioctl, xgell_m_getcapab @@ -124,7 +119,7 @@ static mac_callbacks_t xgell_m_callbacks = { /* * xge_device_poll * - * Cyclic should call me every 1s. xge_callback_event_queued should call me + * Timeout should call me every 1s. xge_callback_event_queued should call me * when HAL hope event was rescheduled. */ /*ARGSUSED*/ @@ -194,32 +189,34 @@ xgell_callback_link_down(void *userdata) * xgell_rx_buffer_replenish_all * * To replenish all freed dtr(s) with buffers in free pool. It's called by - * xgell_rx_buffer_recycle() or xgell_rx_1b_compl(). + * xgell_rx_buffer_recycle() or xgell_rx_1b_callback(). * Must be called with pool_lock held. 
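+ * Each buffer taken off the free list is bound to a freshly reserved + * descriptor (dtr) and re-posted to the hardware ring; the loop stops + * when either the free list or the descriptor space runs out.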
*/ static void -xgell_rx_buffer_replenish_all(xgell_ring_t *ring) +xgell_rx_buffer_replenish_all(xgell_rx_ring_t *ring) { + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xge_hal_dtr_h dtr; xgell_rx_buffer_t *rx_buffer; xgell_rxd_priv_t *rxd_priv; - xge_assert(mutex_owned(&ring->bf_pool.pool_lock)); + xge_assert(mutex_owned(&bf_pool->pool_lock)); + + while ((bf_pool->free > 0) && + (xge_hal_ring_dtr_reserve(ring->channelh, &dtr) == XGE_HAL_OK)) { + xge_assert(bf_pool->head); - while ((ring->bf_pool.free > 0) && - (xge_hal_ring_dtr_reserve(ring->channelh, &dtr) == - XGE_HAL_OK)) { - rx_buffer = ring->bf_pool.head; - ring->bf_pool.head = rx_buffer->next; - ring->bf_pool.free--; + rx_buffer = bf_pool->head; + + bf_pool->head = rx_buffer->next; + bf_pool->free--; - xge_assert(rx_buffer); xge_assert(rx_buffer->dma_addr); rxd_priv = (xgell_rxd_priv_t *) xge_hal_ring_dtr_private(ring->channelh, dtr); xge_hal_ring_dtr_1b_set(dtr, rx_buffer->dma_addr, - ring->bf_pool.size); + bf_pool->size); rxd_priv->rx_buffer = rx_buffer; xge_hal_ring_dtr_post(ring->channelh, dtr); @@ -235,15 +232,16 @@ xgell_rx_buffer_replenish_all(xgell_ring_t *ring) static void xgell_rx_buffer_release(xgell_rx_buffer_t *rx_buffer) { - xgell_ring_t *ring = rx_buffer->ring; + xgell_rx_ring_t *ring = rx_buffer->ring; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; - xge_assert(mutex_owned(&ring->bf_pool.pool_lock)); + xge_assert(mutex_owned(&bf_pool->pool_lock)); /* Put the buffer back to pool */ - rx_buffer->next = ring->bf_pool.head; - ring->bf_pool.head = rx_buffer; + rx_buffer->next = bf_pool->head; + bf_pool->head = rx_buffer; - ring->bf_pool.free++; + bf_pool->free++; } /* @@ -266,7 +264,7 @@ static void xgell_rx_buffer_recycle(char *arg) { xgell_rx_buffer_t *rx_buffer = (xgell_rx_buffer_t *)arg; - xgell_ring_t *ring = rx_buffer->ring; + xgell_rx_ring_t *ring = rx_buffer->ring; xgelldev_t *lldev = ring->lldev; xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; @@ -282,18 +280,17 @@ xgell_rx_buffer_recycle(char *arg) * Before finding a good way to set this hiwat, just always call to * replenish_all. *TODO* */ - if ((lldev->is_initialized != 0) && + if ((lldev->is_initialized != 0) && (ring->live) && (bf_pool->recycle >= XGELL_RX_BUFFER_RECYCLE_CACHE)) { - if (mutex_tryenter(&bf_pool->pool_lock)) { - bf_pool->recycle_tail->next = bf_pool->head; - bf_pool->head = bf_pool->recycle_head; - bf_pool->recycle_head = bf_pool->recycle_tail = NULL; - bf_pool->post -= bf_pool->recycle; - bf_pool->free += bf_pool->recycle; - bf_pool->recycle = 0; - xgell_rx_buffer_replenish_all(ring); - mutex_exit(&bf_pool->pool_lock); - } + mutex_enter(&bf_pool->pool_lock); + bf_pool->recycle_tail->next = bf_pool->head; + bf_pool->head = bf_pool->recycle_head; + bf_pool->recycle_head = bf_pool->recycle_tail = NULL; + bf_pool->post -= bf_pool->recycle; + bf_pool->free += bf_pool->recycle; + bf_pool->recycle = 0; + xgell_rx_buffer_replenish_all(ring); + mutex_exit(&bf_pool->pool_lock); } mutex_exit(&bf_pool->recycle_lock); @@ -306,8 +303,10 @@ xgell_rx_buffer_recycle(char *arg) * Return NULL if failed. 
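 * The DMA-able area is laid out as HEADROOM | packet data | * xgell_rx_buffer_t, so each buffer carries its own recycle * bookkeeping at its tail.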
*/ static xgell_rx_buffer_t * -xgell_rx_buffer_alloc(xgell_ring_t *ring) +xgell_rx_buffer_alloc(xgell_rx_ring_t *ring) { + xgelldev_t *lldev = ring->lldev; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xge_hal_device_t *hldev; void *vaddr; ddi_dma_handle_t dma_handle; @@ -318,7 +317,6 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring) size_t real_size; extern ddi_device_acc_attr_t *p_xge_dev_attr; xgell_rx_buffer_t *rx_buffer; - xgelldev_t *lldev = ring->lldev; hldev = (xge_hal_device_t *)lldev->devh; @@ -330,7 +328,7 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring) } /* reserve some space at the end of the buffer for recycling */ - if (ddi_dma_mem_alloc(dma_handle, HEADROOM + ring->bf_pool.size + + if (ddi_dma_mem_alloc(dma_handle, HEADROOM + bf_pool->size + sizeof (xgell_rx_buffer_t), p_xge_dev_attr, DDI_DMA_STREAMING, DDI_DMA_SLEEP, 0, (caddr_t *)&vaddr, &real_size, &dma_acch) != DDI_SUCCESS) { @@ -339,7 +337,7 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring) goto mem_failed; } - if (HEADROOM + ring->bf_pool.size + sizeof (xgell_rx_buffer_t) > + if (HEADROOM + bf_pool->size + sizeof (xgell_rx_buffer_t) > real_size) { xge_debug_ll(XGE_ERR, "%s%d: can not allocate DMA-able memory", XGELL_IFNAME, lldev->instance); @@ -347,14 +345,14 @@ xgell_rx_buffer_alloc(xgell_ring_t *ring) } if (ddi_dma_addr_bind_handle(dma_handle, NULL, (char *)vaddr + HEADROOM, - ring->bf_pool.size, DDI_DMA_READ | DDI_DMA_STREAMING, + bf_pool->size, DDI_DMA_READ | DDI_DMA_STREAMING, DDI_DMA_SLEEP, 0, &dma_cookie, &ncookies) != DDI_SUCCESS) { xge_debug_ll(XGE_ERR, "%s%d: out of mapping for mblk", XGELL_IFNAME, lldev->instance); goto bind_failed; } - if (ncookies != 1 || dma_cookie.dmac_size < ring->bf_pool.size) { + if (ncookies != 1 || dma_cookie.dmac_size < bf_pool->size) { xge_debug_ll(XGE_ERR, "%s%d: can not handle partial DMA", XGELL_IFNAME, lldev->instance); goto check_failed; @@ -393,64 +391,77 @@ handle_failed: * Destroy buffer pool. If there is still any buffer held by the upper layer, * recorded by bf_pool.post, return B_FALSE to reject the unload. */ -static int -xgell_rx_destroy_buffer_pool(xgell_ring_t *ring) +static boolean_t +xgell_rx_destroy_buffer_pool(xgell_rx_ring_t *ring) { + xgelldev_t *lldev = ring->lldev; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xgell_rx_buffer_t *rx_buffer; ddi_dma_handle_t dma_handle; ddi_acc_handle_t dma_acch; - xgelldev_t *lldev = ring->lldev; int i; - if (ring->bf_pool.recycle > 0) { - ring->bf_pool.recycle_tail->next = ring->bf_pool.head; - ring->bf_pool.head = ring->bf_pool.recycle_head; - ring->bf_pool.recycle_tail = - ring->bf_pool.recycle_head = NULL; - ring->bf_pool.post -= ring->bf_pool.recycle; - ring->bf_pool.free += ring->bf_pool.recycle; - ring->bf_pool.recycle = 0; + /* + * If the pool has been destroyed, just return B_TRUE + */ + if (!bf_pool->live) + return (B_TRUE); + + mutex_enter(&bf_pool->recycle_lock); + if (bf_pool->recycle > 0) { + mutex_enter(&bf_pool->pool_lock); + bf_pool->recycle_tail->next = bf_pool->head; + bf_pool->head = bf_pool->recycle_head; + bf_pool->recycle_tail = bf_pool->recycle_head = NULL; + bf_pool->post -= bf_pool->recycle; + bf_pool->free += bf_pool->recycle; + bf_pool->recycle = 0; + mutex_exit(&bf_pool->pool_lock); } + mutex_exit(&bf_pool->recycle_lock); /* * If there is any posted buffer, the driver must refuse to be * detached. Need to notify the upper layer to release them.
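 * (bf_pool.post counts the buffers still loaned out to the stack; the * pool cannot be torn down until it drains back to zero.)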
*/ - if (ring->bf_pool.post != 0) { + if (bf_pool->post != 0) { xge_debug_ll(XGE_ERR, "%s%d has some buffers not be recycled, try later!", XGELL_IFNAME, lldev->instance); - return (DDI_FAILURE); + return (B_FALSE); } /* - * Relase buffers one by one. + * Release buffers one by one. */ - for (i = ring->bf_pool.total; i > 0; i--) { - rx_buffer = ring->bf_pool.head; + for (i = bf_pool->total; i > 0; i--) { + rx_buffer = bf_pool->head; xge_assert(rx_buffer != NULL); - ring->bf_pool.head = rx_buffer->next; + bf_pool->head = rx_buffer->next; dma_handle = rx_buffer->dma_handle; dma_acch = rx_buffer->dma_acch; if (ddi_dma_unbind_handle(dma_handle) != DDI_SUCCESS) { - xge_debug_ll(XGE_ERR, "%s", - "failed to unbind DMA handle!"); - ring->bf_pool.head = rx_buffer; - return (DDI_FAILURE); + xge_debug_ll(XGE_ERR, "failed to unbind DMA handle!"); + bf_pool->head = rx_buffer; + return (B_FALSE); } ddi_dma_mem_free(&dma_acch); ddi_dma_free_handle(&dma_handle); - ring->bf_pool.total--; - ring->bf_pool.free--; + bf_pool->total--; + bf_pool->free--; } - mutex_destroy(&ring->bf_pool.recycle_lock); - mutex_destroy(&ring->bf_pool.pool_lock); - return (DDI_SUCCESS); + xge_assert(!mutex_owned(&bf_pool->pool_lock)); + + mutex_destroy(&bf_pool->recycle_lock); + mutex_destroy(&bf_pool->pool_lock); + bf_pool->live = B_FALSE; + + return (B_TRUE); } /* @@ -458,29 +469,34 @@ xgell_rx_destroy_buffer_pool(xgell_ring_t *ring) * * Initialize RX buffer pool for all RX rings. Refer to rx_buffer_pool_t. */ -static int -xgell_rx_create_buffer_pool(xgell_ring_t *ring) +static boolean_t +xgell_rx_create_buffer_pool(xgell_rx_ring_t *ring) { + xgelldev_t *lldev = ring->lldev; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xge_hal_device_t *hldev; xgell_rx_buffer_t *rx_buffer; - xgelldev_t *lldev = ring->lldev; int i; + if (bf_pool->live) + return (B_TRUE); + hldev = (xge_hal_device_t *)lldev->devh; - ring->bf_pool.total = 0; - ring->bf_pool.size = XGELL_MAX_FRAME_SIZE(hldev); - ring->bf_pool.head = NULL; - ring->bf_pool.free = 0; - ring->bf_pool.post = 0; - ring->bf_pool.post_hiwat = lldev->config.rx_buffer_post_hiwat; - ring->bf_pool.recycle = 0; - ring->bf_pool.recycle_head = NULL; - ring->bf_pool.recycle_tail = NULL; - - mutex_init(&ring->bf_pool.pool_lock, NULL, MUTEX_DRIVER, + bf_pool->total = 0; + bf_pool->size = XGELL_MAX_FRAME_SIZE(hldev); + bf_pool->head = NULL; + bf_pool->free = 0; + bf_pool->post = 0; + bf_pool->post_hiwat = lldev->config.rx_buffer_post_hiwat; + bf_pool->recycle = 0; + bf_pool->recycle_head = NULL; + bf_pool->recycle_tail = NULL; + bf_pool->live = B_TRUE; + + mutex_init(&bf_pool->pool_lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(hldev->irqh)); - mutex_init(&ring->bf_pool.recycle_lock, NULL, MUTEX_DRIVER, + mutex_init(&bf_pool->recycle_lock, NULL, MUTEX_DRIVER, DDI_INTR_PRI(hldev->irqh)); /* @@ -491,17 +507,17 @@ xgell_rx_create_buffer_pool(xgell_ring_t *ring) for (i = 0; i < lldev->config.rx_buffer_total; i++) { if ((rx_buffer = xgell_rx_buffer_alloc(ring)) == NULL) { (void) xgell_rx_destroy_buffer_pool(ring); - return (DDI_FAILURE); + return (B_FALSE); } - rx_buffer->next = ring->bf_pool.head; - ring->bf_pool.head = rx_buffer; + rx_buffer->next = bf_pool->head; + bf_pool->head = rx_buffer; - ring->bf_pool.total++; - ring->bf_pool.free++; + bf_pool->total++; + bf_pool->free++; } - return (DDI_SUCCESS); + return (B_TRUE); } /* @@ -514,23 +530,26 @@ xge_hal_status_e xgell_rx_dtr_replenish(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, int index, void *userdata, xge_hal_channel_reopen_e reopen) { - 
xgell_ring_t *ring = userdata; + xgell_rx_ring_t *ring = userdata; + xgell_rx_buffer_pool_t *bf_pool = &ring->bf_pool; xgell_rx_buffer_t *rx_buffer; xgell_rxd_priv_t *rxd_priv; - if (ring->bf_pool.head == NULL) { - xge_debug_ll(XGE_ERR, "%s", "no more available rx DMA buffer!"); + mutex_enter(&bf_pool->pool_lock); + if (bf_pool->head == NULL) { + xge_debug_ll(XGE_ERR, "no more available rx DMA buffer!"); return (XGE_HAL_FAIL); } - rx_buffer = ring->bf_pool.head; - ring->bf_pool.head = rx_buffer->next; - ring->bf_pool.free--; - + rx_buffer = bf_pool->head; xge_assert(rx_buffer); xge_assert(rx_buffer->dma_addr); + bf_pool->head = rx_buffer->next; + bf_pool->free--; + mutex_exit(&bf_pool->pool_lock); + rxd_priv = (xgell_rxd_priv_t *)xge_hal_ring_dtr_private(channelh, dtr); - xge_hal_ring_dtr_1b_set(dtr, rx_buffer->dma_addr, ring->bf_pool.size); + xge_hal_ring_dtr_1b_set(dtr, rx_buffer->dma_addr, bf_pool->size); rxd_priv->rx_buffer = rx_buffer; @@ -637,9 +656,10 @@ xgell_rx_hcksum_assoc(mblk_t *mp, char *vaddr, int pkt_length, * new message and copy the payload in. */ static mblk_t * -xgell_rx_1b_msg_alloc(xgelldev_t *lldev, xgell_rx_buffer_t *rx_buffer, +xgell_rx_1b_msg_alloc(xgell_rx_ring_t *ring, xgell_rx_buffer_t *rx_buffer, int pkt_length, xge_hal_dtr_info_t *ext_info, boolean_t *copyit) { + xgelldev_t *lldev = ring->lldev; mblk_t *mp; char *vaddr; @@ -676,24 +696,25 @@ xgell_rx_1b_msg_alloc(xgelldev_t *lldev, xgell_rx_buffer_t *rx_buffer, } /* - * xgell_rx_1b_compl + * xgell_rx_1b_callback * * If the interrupt is because of a received frame or if the receive ring * contains fresh as yet un-processed frames, this function is called. */ static xge_hal_status_e -xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, +xgell_rx_1b_callback(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, void *userdata) { - xgell_ring_t *ring = (xgell_ring_t *)userdata; + xgell_rx_ring_t *ring = (xgell_rx_ring_t *)userdata; xgelldev_t *lldev = ring->lldev; xgell_rx_buffer_t *rx_buffer; mblk_t *mp_head = NULL; mblk_t *mp_end = NULL; int pkt_burst = 0; - mutex_enter(&ring->bf_pool.pool_lock); + xge_debug_ll(XGE_TRACE, "xgell_rx_1b_callback on ring %d", ring->index); + mutex_enter(&ring->bf_pool.pool_lock); do { int pkt_length; dma_addr_t dma_data; @@ -744,7 +765,7 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, copyit = B_FALSE; } - mp = xgell_rx_1b_msg_alloc(lldev, rx_buffer, pkt_length, + mp = xgell_rx_1b_msg_alloc(ring, rx_buffer, pkt_length, &ext_info, ©it); xge_hal_ring_dtr_free(channelh, dtr); @@ -771,8 +792,10 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, * Associate cksum_flags per packet type and h/w * cksum flags. */ - xgell_rx_hcksum_assoc(mp, (char *)rx_buffer->vaddr + - HEADROOM, pkt_length, &ext_info); + xgell_rx_hcksum_assoc(mp, (char *)rx_buffer->vaddr + HEADROOM, + pkt_length, &ext_info); + + ring->received_bytes += pkt_length; if (mp_head == NULL) { mp_head = mp; @@ -782,6 +805,26 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, mp_end = mp; } + /* + * Inlined implemented polling function. + */ + if ((ring->poll_mp == NULL) && (ring->poll_bytes > 0)) { + ring->poll_mp = mp_head; + } + if (ring->poll_mp != NULL) { + if ((ring->poll_bytes -= pkt_length) <= 0) { + /* have polled enough packets. */ + break; + } else { + /* continue polling packets. 
*/ + continue; + } + } + + /* + * We're not in polling mode, so try to chain more messages + * or send the chain up according to pkt_burst. + */ if (++pkt_burst < lldev->config.rx_pkt_burst) continue; @@ -791,8 +834,8 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, } mutex_exit(&ring->bf_pool.pool_lock); if (mp_head != NULL) { - mac_rx(lldev->mh, ((xgell_ring_t *)userdata)->handle, - mp_head); + mac_rx_ring(lldev->mh, ring->ring_handle, mp_head, + ring->ring_gen_num); } mp_head = mp_end = NULL; pkt_burst = 0; @@ -807,13 +850,39 @@ xgell_rx_1b_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, xgell_rx_buffer_replenish_all(ring); mutex_exit(&ring->bf_pool.pool_lock); - if (mp_head != NULL) { - mac_rx(lldev->mh, ((xgell_ring_t *)userdata)->handle, mp_head); + /* + * If we're not in polling cycle, call mac_rx(), otherwise + * just return while leaving packets chained to ring->poll_mp. + */ + if ((ring->poll_mp == NULL) && (mp_head != NULL)) { + mac_rx_ring(lldev->mh, ring->ring_handle, mp_head, + ring->ring_gen_num); } return (XGE_HAL_OK); } +mblk_t * +xgell_rx_poll(void *arg, int bytes_to_pickup) +{ + xgell_rx_ring_t *ring = (xgell_rx_ring_t *)arg; + int got_rx = 0; + mblk_t *mp; + + xge_debug_ll(XGE_TRACE, "xgell_rx_poll on ring %d", ring->index); + + ring->poll_mp = NULL; + ring->poll_bytes = bytes_to_pickup; + (void) xge_hal_device_poll_rx_channel(ring->channelh, &got_rx); + + mp = ring->poll_mp; + ring->poll_bytes = -1; + ring->polled_bytes += got_rx; + ring->poll_mp = NULL; + + return (mp); +} + /* * xgell_xmit_compl * @@ -826,8 +895,8 @@ static xge_hal_status_e xgell_xmit_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, void *userdata) { - xgell_fifo_t *fifo = (xgell_fifo_t *)userdata; - xgelldev_t *lldev = fifo->lldev; + xgell_tx_ring_t *ring = userdata; + xgelldev_t *lldev = ring->lldev; do { xgell_txd_priv_t *txd_priv = ((xgell_txd_priv_t *) @@ -861,58 +930,36 @@ xgell_xmit_compl(xge_hal_channel_h channelh, xge_hal_dtr_h dtr, u8 t_code, txd_priv->mblk = NULL; } - lldev->resched_avail++; - } while (xge_hal_fifo_dtr_next_completed(channelh, &dtr, &t_code) == XGE_HAL_OK); - if (lldev->resched_retry && - xge_queue_produce_context(xge_hal_device_queue(lldev->devh), - XGELL_EVENT_RESCHED_NEEDED, fifo) == XGE_QUEUE_OK) { - xge_debug_ll(XGE_TRACE, "%s%d: IRQ produced event for queue %d", - XGELL_IFNAME, lldev->instance, - ((xge_hal_channel_t *)channelh)->post_qid); - lldev->resched_send = lldev->resched_avail; - lldev->resched_retry = 0; - } + if (ring->need_resched) + mac_tx_ring_update(lldev->mh, ring->ring_handle); return (XGE_HAL_OK); } -/* - * xgell_send - * @hldev: pointer to xge_hal_device_t strucutre - * @mblk: pointer to network buffer, i.e. mblk_t structure - * - * Called by the xgell_m_tx to transmit the packet to the XFRAME firmware. - * A pointer to an M_DATA message that contains the packet is passed to - * this routine. 
- */ -static boolean_t -xgell_send(xgelldev_t *lldev, mblk_t *mp) +mblk_t * +xgell_ring_tx(void *arg, mblk_t *mp) { + xgell_tx_ring_t *ring = (xgell_tx_ring_t *)arg; mblk_t *bp; - boolean_t retry; + xgelldev_t *lldev = ring->lldev; xge_hal_device_t *hldev = lldev->devh; xge_hal_status_e status; xge_hal_dtr_h dtr; xgell_txd_priv_t *txd_priv; uint32_t hckflags; + uint32_t lsoflags; uint32_t mss; int handle_cnt, frag_cnt, ret, i, copied; boolean_t used_copy; - xgell_fifo_t *fifo; - xge_hal_channel_h fifo_channel; _begin: - retry = B_FALSE; handle_cnt = frag_cnt = 0; if (!lldev->is_initialized || lldev->in_reset) - return (B_FALSE); - - fifo = &lldev->fifos[0]; - fifo_channel = fifo->channelh; + return (mp); /* * If the free Tx dtrs count reaches the lower threshold, @@ -921,23 +968,17 @@ _begin: * gld through gld_sched call, when the free dtrs count exceeds * the higher threshold. */ - if (xge_hal_channel_dtr_count(fifo_channel) + if (xge_hal_channel_dtr_count(ring->channelh) <= XGELL_TX_LEVEL_LOW) { - if (++fifo->level_low > XGELL_TX_LEVEL_CHECK) { - xge_debug_ll(XGE_TRACE, "%s%d: queue %d: err on xmit," - "free descriptors count at low threshold %d", - XGELL_IFNAME, lldev->instance, - ((xge_hal_channel_t *)fifo_channel)->post_qid, - XGELL_TX_LEVEL_LOW); - fifo->level_low = 0; - retry = B_TRUE; - goto _exit; - } - } else { - fifo->level_low = 0; + xge_debug_ll(XGE_TRACE, "%s%d: queue %d: err on xmit," + "free descriptors count at low threshold %d", + XGELL_IFNAME, lldev->instance, + ((xge_hal_channel_t *)ring->channelh)->post_qid, + XGELL_TX_LEVEL_LOW); + goto _exit; } - status = xge_hal_fifo_dtr_reserve(fifo_channel, &dtr); + status = xge_hal_fifo_dtr_reserve(ring->channelh, &dtr); if (status != XGE_HAL_OK) { switch (status) { case XGE_HAL_INF_CHANNEL_IS_NOT_READY: @@ -945,19 +986,17 @@ _begin: "%s%d: channel %d is not ready.", XGELL_IFNAME, lldev->instance, ((xge_hal_channel_t *) - fifo_channel)->post_qid); - retry = B_TRUE; + ring->channelh)->post_qid); goto _exit; case XGE_HAL_INF_OUT_OF_DESCRIPTORS: xge_debug_ll(XGE_TRACE, "%s%d: queue %d: error in xmit," " out of descriptors.", XGELL_IFNAME, lldev->instance, ((xge_hal_channel_t *) - fifo_channel)->post_qid); - retry = B_TRUE; + ring->channelh)->post_qid); goto _exit; default: - return (B_FALSE); + return (mp); } } @@ -1002,6 +1041,8 @@ _begin: continue; } + ring->sent_bytes += mblen; + /* * Check the message length to decide to DMA or bcopy() data * to tx descriptor(s). 
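 * Fragments below tx_dma_lowat are bcopy()ed into the descriptor via * xge_hal_fifo_dtr_buffer_append(); larger fragments get their own * DMA binding and are attached with xge_hal_fifo_dtr_buffer_set().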
@@ -1009,7 +1050,7 @@ _begin: if (mblen < lldev->config.tx_dma_lowat && (copied + mblen) < lldev->tx_copied_max) { xge_hal_status_e rc; - rc = xge_hal_fifo_dtr_buffer_append(fifo_channel, + rc = xge_hal_fifo_dtr_buffer_append(ring->channelh, dtr, bp->b_rptr, mblen); if (rc == XGE_HAL_OK) { used_copy = B_TRUE; @@ -1017,11 +1058,11 @@ _begin: continue; } else if (used_copy) { xge_hal_fifo_dtr_buffer_finalize( - fifo_channel, dtr, frag_cnt++); + ring->channelh, dtr, frag_cnt++); used_copy = B_FALSE; } } else if (used_copy) { - xge_hal_fifo_dtr_buffer_finalize(fifo_channel, + xge_hal_fifo_dtr_buffer_finalize(ring->channelh, dtr, frag_cnt++); used_copy = B_FALSE; } @@ -1075,7 +1116,7 @@ _begin: /* setup the descriptors for this data buffer */ while (ncookies) { - xge_hal_fifo_dtr_buffer_set(fifo_channel, dtr, + xge_hal_fifo_dtr_buffer_set(ring->channelh, dtr, frag_cnt++, dma_cookie.dmac_laddress, dma_cookie.dmac_size); if (--ncookies) { @@ -1108,7 +1149,7 @@ _begin: /* finalize unfinished copies */ if (used_copy) { - xge_hal_fifo_dtr_buffer_finalize(fifo_channel, dtr, + xge_hal_fifo_dtr_buffer_finalize(ring->channelh, dtr, frag_cnt++); } @@ -1118,11 +1159,14 @@ _begin: * If LSO is required, just call xge_hal_fifo_dtr_mss_set(dtr, mss) to * do all necessary work. */ - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, &mss, &hckflags); - if ((hckflags & HW_LSO) && (mss != 0)) { + lso_info_get(mp, &mss, &lsoflags); + + if (lsoflags & HW_LSO) { + xge_assert((mss != 0) && (mss <= XGE_HAL_DEFAULT_MTU)); xge_hal_fifo_dtr_mss_set(dtr, mss); } + hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags); if (hckflags & HCK_IPV4_HDRCKSUM) { xge_hal_fifo_dtr_cksum_set_bits(dtr, XGE_HAL_TXD_TX_CKO_IPV4_EN); @@ -1132,63 +1176,376 @@ _begin: XGE_HAL_TXD_TX_CKO_UDP_EN); } - xge_hal_fifo_dtr_post(fifo_channel, dtr); + xge_hal_fifo_dtr_post(ring->channelh, dtr); - return (B_TRUE); + return (NULL); _exit_cleanup: - + /* + * Could not successfully transmit but have changed the message, + * so just free it and return NULL + */ for (i = 0; i < handle_cnt; i++) { (void) ddi_dma_unbind_handle(txd_priv->dma_handles[i]); ddi_dma_free_handle(&txd_priv->dma_handles[i]); txd_priv->dma_handles[i] = 0; } - xge_hal_fifo_dtr_free(fifo_channel, dtr); + xge_hal_fifo_dtr_free(ring->channelh, dtr); + + freemsg(mp); + return (NULL); _exit: - if (retry) { - if (lldev->resched_avail != lldev->resched_send && - xge_queue_produce_context(xge_hal_device_queue(lldev->devh), - XGELL_EVENT_RESCHED_NEEDED, fifo) == XGE_QUEUE_OK) { - lldev->resched_send = lldev->resched_avail; - return (B_FALSE); - } else { - lldev->resched_retry = 1; + ring->need_resched = B_TRUE; + return (mp); +} + +/* + * xgell_ring_macaddr_init + */ +static void +xgell_rx_ring_maddr_init(xgell_rx_ring_t *ring) +{ + int i; + xgelldev_t *lldev = ring->lldev; + xge_hal_device_t *hldev = lldev->devh; + int slot_start; + + xge_debug_ll(XGE_TRACE, "%s", "xgell_rx_ring_maddr_init"); + + ring->mmac.naddr = XGE_RX_MULTI_MAC_ADDRESSES_MAX; + ring->mmac.naddrfree = ring->mmac.naddr; + + /* + * For the default rx ring, the first MAC address is the factory one. + * This will be set by the framework, so need to clear it for now. + */ + (void) xge_hal_device_macaddr_clear(hldev, 0); + + /* + * Read the MAC address Configuration Memory from HAL. + * The first slot will hold a factory MAC address, contents in other + * slots will be FF:FF:FF:FF:FF:FF. 
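+ * Each rx ring (group) owns a 32-slot window of that memory, + * starting at slot ring->index * 32.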
+ */ + slot_start = ring->index * 32; + for (i = 0; i < ring->mmac.naddr; i++) { + (void) xge_hal_device_macaddr_get(hldev, slot_start + i, + ring->mmac.mac_addr + i); + ring->mmac.mac_addr_set[i] = B_FALSE; + } +} + +static int xgell_maddr_set(xgelldev_t *, int, uint8_t *); + +static int +xgell_addmac(void *arg, const uint8_t *mac_addr) +{ + xgell_rx_ring_t *ring = arg; + xgelldev_t *lldev = ring->lldev; + xge_hal_device_t *hldev = lldev->devh; + int slot; + int slot_start; + + xge_debug_ll(XGE_TRACE, "%s", "xgell_addmac"); + + mutex_enter(&lldev->genlock); + + if (ring->mmac.naddrfree == 0) { + mutex_exit(&lldev->genlock); + return (ENOSPC); + } + + /* First slot is for factory MAC address */ + for (slot = 0; slot < ring->mmac.naddr; slot++) { + if (ring->mmac.mac_addr_set[slot] == B_FALSE) { + break; } } - if (mp) - freemsg(mp); - return (B_TRUE); + ASSERT(slot < ring->mmac.naddr); + + slot_start = ring->index * 32; + + if (xgell_maddr_set(lldev, slot_start + slot, (uint8_t *)mac_addr) != + 0) { + mutex_exit(&lldev->genlock); + return (EIO); + } + + /* Simply enable RTS for the whole section. */ + (void) xge_hal_device_rts_section_enable(hldev, slot_start + slot); + + /* + * Read back the MAC address from HAL to keep the array up to date. + */ + if (xge_hal_device_macaddr_get(hldev, slot_start + slot, + ring->mmac.mac_addr + slot) != XGE_HAL_OK) { + (void) xge_hal_device_macaddr_clear(hldev, slot_start + slot); + return (EIO); + } + + ring->mmac.mac_addr_set[slot] = B_TRUE; + ring->mmac.naddrfree--; + + mutex_exit(&lldev->genlock); + + return (0); +} + +static int +xgell_remmac(void *arg, const uint8_t *mac_addr) +{ + xgell_rx_ring_t *ring = arg; + xgelldev_t *lldev = ring->lldev; + xge_hal_device_t *hldev = lldev->devh; + xge_hal_status_e status; + int slot; + int slot_start; + + xge_debug_ll(XGE_TRACE, "%s", "xgell_remmac"); + + slot = xge_hal_device_macaddr_find(hldev, (uint8_t *)mac_addr); + if (slot == -1) + return (EINVAL); + + slot_start = ring->index * 32; + + /* + * Adjust slot to the offset in the MAC array of this ring (group). + */ + slot -= slot_start; + + /* + * Only can remove a pre-set MAC address for this ring (group). + */ + if (slot < 0 || slot >= ring->mmac.naddr) + return (EINVAL); + + + xge_assert(ring->mmac.mac_addr_set[slot]); + + mutex_enter(&lldev->genlock); + if (!ring->mmac.mac_addr_set[slot]) { + mutex_exit(&lldev->genlock); + /* + * The result will be unexpected when reach here. WARNING! + */ + xge_debug_ll(XGE_ERR, + "%s%d: caller is trying to remove an unset MAC address", + XGELL_IFNAME, lldev->instance); + return (ENXIO); + } + + status = xge_hal_device_macaddr_clear(hldev, slot_start + slot); + if (status != XGE_HAL_OK) { + mutex_exit(&lldev->genlock); + return (EIO); + } + + ring->mmac.mac_addr_set[slot] = B_FALSE; + ring->mmac.naddrfree++; + + /* + * TODO: Disable MAC RTS if all addresses have been cleared. + */ + + /* + * Read back the MAC address from HAL to keep the array up to date. + */ + (void) xge_hal_device_macaddr_get(hldev, slot_start + slot, + ring->mmac.mac_addr + slot); + mutex_exit(&lldev->genlock); + + return (0); } /* - * xge_m_tx - * @arg: pointer to the xgelldev_t structure - * @resid: resource id - * @mp: pointer to the message buffer + * Temporarily calling hal function. * - * Called by MAC Layer to send a chain of packets + * With MSI-X implementation, no lock is needed, so that the interrupt + * handling could be faster. 
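+ * Note the inversion: "enabling" the ring interrupt switches channel + * polling off, and "disabling" it switches polling on, after which the + * MAC layer pulls packets itself through xgell_rx_poll().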
*/ -static mblk_t * -xgell_m_tx(void *arg, mblk_t *mp) +int +xgell_rx_ring_intr_enable(mac_intr_handle_t ih) { - xgelldev_t *lldev = arg; - mblk_t *next; + xgell_rx_ring_t *ring = (xgell_rx_ring_t *)ih; - while (mp != NULL) { - next = mp->b_next; - mp->b_next = NULL; + mutex_enter(&ring->ring_lock); + xge_hal_device_rx_channel_disable_polling(ring->channelh); + mutex_exit(&ring->ring_lock); - if (!xgell_send(lldev, mp)) { - mp->b_next = next; - break; - } - mp = next; + return (0); +} + +int +xgell_rx_ring_intr_disable(mac_intr_handle_t ih) +{ + xgell_rx_ring_t *ring = (xgell_rx_ring_t *)ih; + + mutex_enter(&ring->ring_lock); + xge_hal_device_rx_channel_enable_polling(ring->channelh); + mutex_exit(&ring->ring_lock); + + return (0); +} + +static int +xgell_rx_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num) +{ + xgell_rx_ring_t *rx_ring = (xgell_rx_ring_t *)rh; + + rx_ring->ring_gen_num = mr_gen_num; + + return (0); +} + +/*ARGSUSED*/ +static void +xgell_rx_ring_stop(mac_ring_driver_t rh) +{ +} + +/*ARGSUSED*/ +static int +xgell_tx_ring_start(mac_ring_driver_t rh, uint64_t useless) +{ + return (0); +} + +/*ARGSUSED*/ +static void +xgell_tx_ring_stop(mac_ring_driver_t rh) +{ +} + +/* + * Callback funtion for MAC layer to register all rings. + * + * Xframe hardware doesn't support grouping explicitly, so the driver needs + * to pretend having resource groups. We may also optionally group all 8 rx + * rings into a single group for increased scalability on CMT architectures, + * or group one rx ring per group for maximum virtualization. + * + * TX grouping is actually done by framework, so, just register all TX + * resources without grouping them. + */ +void +xgell_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, + const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) +{ + xgelldev_t *lldev = (xgelldev_t *)arg; + mac_intr_t *mintr; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + xgell_rx_ring_t *rx_ring; + + xge_assert(index < lldev->init_rx_rings); + xge_assert(rg_index < lldev->init_rx_groups); + + /* + * Performance vs. 
Virtualization + */ + if (lldev->init_rx_rings == lldev->init_rx_groups) + rx_ring = lldev->rx_ring + rg_index; + else + rx_ring = lldev->rx_ring + index; + + rx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)rx_ring; + infop->mri_start = xgell_rx_ring_start; + infop->mri_stop = xgell_rx_ring_stop; + infop->mri_poll = xgell_rx_poll; + + mintr = &infop->mri_intr; + mintr->mi_handle = (mac_intr_handle_t)rx_ring; + mintr->mi_enable = xgell_rx_ring_intr_enable; + mintr->mi_disable = xgell_rx_ring_intr_disable; + + break; } + case MAC_RING_TYPE_TX: { + xgell_tx_ring_t *tx_ring; - return (mp); + xge_assert(rg_index == -1); + + xge_assert((index >= 0) && (index < lldev->init_tx_rings)); + + tx_ring = lldev->tx_ring + index; + tx_ring->ring_handle = rh; + + infop->mri_driver = (mac_ring_driver_t)tx_ring; + infop->mri_start = xgell_tx_ring_start; + infop->mri_stop = xgell_tx_ring_stop; + infop->mri_tx = xgell_ring_tx; + + break; + } + default: + break; + } +} + +void +xgell_fill_group(void *arg, mac_ring_type_t rtype, const int index, + mac_group_info_t *infop, mac_group_handle_t gh) +{ + xgelldev_t *lldev = (xgelldev_t *)arg; + + switch (rtype) { + case MAC_RING_TYPE_RX: { + xgell_rx_ring_t *rx_ring; + + xge_assert(index < lldev->init_rx_groups); + + rx_ring = lldev->rx_ring + index; + + rx_ring->group_handle = gh; + + infop->mgi_driver = (mac_group_driver_t)rx_ring; + infop->mgi_start = NULL; + infop->mgi_stop = NULL; + infop->mgi_addmac = xgell_addmac; + infop->mgi_remmac = xgell_remmac; + infop->mgi_count = lldev->init_rx_rings / lldev->init_rx_groups; + + break; + } + case MAC_RING_TYPE_TX: + xge_assert(0); + break; + default: + break; + } +} + +/* + * xgell_macaddr_set + */ +static int +xgell_maddr_set(xgelldev_t *lldev, int index, uint8_t *macaddr) +{ + xge_hal_device_t *hldev = lldev->devh; + xge_hal_status_e status; + + xge_debug_ll(XGE_TRACE, "%s", "xgell_maddr_set"); + + xge_debug_ll(XGE_TRACE, + "setting macaddr: 0x%02x-%02x-%02x-%02x-%02x-%02x", + macaddr[0], macaddr[1], macaddr[2], + macaddr[3], macaddr[4], macaddr[5]); + + status = xge_hal_device_macaddr_set(hldev, index, (uchar_t *)macaddr); + + if (status != XGE_HAL_OK) { + xge_debug_ll(XGE_ERR, "%s%d: can not set mac address", + XGELL_IFNAME, lldev->instance); + return (EIO); + } + + return (0); } /* @@ -1201,12 +1558,13 @@ static void xgell_rx_dtr_term(xge_hal_channel_h channelh, xge_hal_dtr_h dtrh, xge_hal_dtr_state_e state, void *userdata, xge_hal_channel_reopen_e reopen) { - xgell_ring_t *ring = (xgell_ring_t *)userdata; xgell_rxd_priv_t *rxd_priv = ((xgell_rxd_priv_t *)xge_hal_ring_dtr_private(channelh, dtrh)); xgell_rx_buffer_t *rx_buffer = rxd_priv->rx_buffer; if (state == XGE_HAL_DTR_STATE_POSTED) { + xgell_rx_ring_t *ring = rx_buffer->ring; + mutex_enter(&ring->bf_pool.pool_lock); xge_hal_ring_dtr_free(channelh, dtrh); xgell_rx_buffer_release(rx_buffer); @@ -1215,6 +1573,137 @@ xgell_rx_dtr_term(xge_hal_channel_h channelh, xge_hal_dtr_h dtrh, } /* + * To open a rx ring. 
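+ * This creates the ring's buffer pool, opens the HAL channel and + * initializes the per-ring MAC address window before marking the + * ring live.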
+ */ +static boolean_t +xgell_rx_ring_open(xgell_rx_ring_t *rx_ring) +{ + xge_hal_status_e status; + xge_hal_channel_attr_t attr; + xgelldev_t *lldev = rx_ring->lldev; + xge_hal_device_t *hldev = lldev->devh; + + if (rx_ring->live) + return (B_TRUE); + + /* Create the buffer pool first */ + if (!xgell_rx_create_buffer_pool(rx_ring)) { + xge_debug_ll(XGE_ERR, "can not create buffer pool for ring: %d", + rx_ring->index); + return (B_FALSE); + } + + /* Default ring initialization */ + attr.post_qid = rx_ring->index; + attr.compl_qid = 0; + attr.callback = xgell_rx_1b_callback; + attr.per_dtr_space = sizeof (xgell_rxd_priv_t); + attr.flags = 0; + attr.type = XGE_HAL_CHANNEL_TYPE_RING; + attr.dtr_init = xgell_rx_dtr_replenish; + attr.dtr_term = xgell_rx_dtr_term; + attr.userdata = rx_ring; + + status = xge_hal_channel_open(lldev->devh, &attr, &rx_ring->channelh, + XGE_HAL_CHANNEL_OC_NORMAL); + if (status != XGE_HAL_OK) { + xge_debug_ll(XGE_ERR, "%s%d: cannot open Rx channel got status " + " code %d", XGELL_IFNAME, lldev->instance, status); + (void) xgell_rx_destroy_buffer_pool(rx_ring); + return (B_FALSE); + } + + xgell_rx_ring_maddr_init(rx_ring); + + mutex_init(&rx_ring->ring_lock, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(hldev->irqh)); + + rx_ring->received_bytes = 0; + rx_ring->poll_bytes = -1; + rx_ring->polled_bytes = 0; + rx_ring->poll_mp = NULL; + rx_ring->live = B_TRUE; + + xge_debug_ll(XGE_TRACE, "RX ring [%d] is opened successfully", + rx_ring->index); + + return (B_TRUE); +} + +static void +xgell_rx_ring_close(xgell_rx_ring_t *rx_ring) +{ + if (!rx_ring->live) + return; + xge_hal_channel_close(rx_ring->channelh, XGE_HAL_CHANNEL_OC_NORMAL); + rx_ring->channelh = NULL; + /* This may not clean up all used buffers, driver will handle it */ + if (xgell_rx_destroy_buffer_pool(rx_ring)) + rx_ring->live = B_FALSE; + + mutex_destroy(&rx_ring->ring_lock); +} + +/* + * xgell_rx_open + * @lldev: the link layer object + * + * Initialize and open all RX channels. 
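+ * Rings are opened in index order; on failure the caller unwinds + * through xgell_rx_close(), which skips rings that never went live.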
+ */ +static boolean_t +xgell_rx_open(xgelldev_t *lldev) +{ + xgell_rx_ring_t *rx_ring; + int i; + + if (lldev->live_rx_rings != 0) + return (B_TRUE); + + lldev->live_rx_rings = 0; + + /* + * Initialize all rings + */ + for (i = 0; i < lldev->init_rx_rings; i++) { + rx_ring = &lldev->rx_ring[i]; + rx_ring->index = i; + rx_ring->lldev = lldev; + rx_ring->live = B_FALSE; + + if (!xgell_rx_ring_open(rx_ring)) + return (B_FALSE); + + lldev->live_rx_rings++; + } + + return (B_TRUE); +} + +static void +xgell_rx_close(xgelldev_t *lldev) +{ + xgell_rx_ring_t *rx_ring; + int i; + + if (lldev->live_rx_rings == 0) + return; + + /* + * Close all rx rings + */ + for (i = 0; i < lldev->init_rx_rings; i++) { + rx_ring = &lldev->rx_ring[i]; + + if (rx_ring->live) { + xgell_rx_ring_close(rx_ring); + lldev->live_rx_rings--; + } + } + + xge_assert(lldev->live_rx_rings == 0); +} + +/* * xgell_tx_term * * Function will be called by HAL to terminate all DTRs for @@ -1252,215 +1741,105 @@ xgell_tx_term(xge_hal_channel_h channelh, xge_hal_dtr_h dtrh, } } -/* - * xgell_tx_close - * @lldev: the link layer object - * - * Close all Tx channels - */ -static void -xgell_tx_close(xgelldev_t *lldev) -{ - xge_list_t *item, *list; - xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; - - list = &hldev->fifo_channels; - while (!xge_list_is_empty(list)) { - item = xge_list_first_get(list); - xge_hal_channel_t *channel = xge_container_of(item, - xge_hal_channel_t, item); - - xge_hal_channel_close(channel, XGE_HAL_CHANNEL_OC_NORMAL); - } -} - -/* - * xgell_tx_open - * @lldev: the link layer object - * - * Initialize and open all Tx channels; - */ static boolean_t -xgell_tx_open(xgelldev_t *lldev) +xgell_tx_ring_open(xgell_tx_ring_t *tx_ring) { xge_hal_status_e status; - u64 adapter_status; xge_hal_channel_attr_t attr; - xge_list_t *item; - xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; + xgelldev_t *lldev = tx_ring->lldev; + + if (tx_ring->live) + return (B_TRUE); - attr.post_qid = 0; + attr.post_qid = tx_ring->index; attr.compl_qid = 0; attr.callback = xgell_xmit_compl; attr.per_dtr_space = sizeof (xgell_txd_priv_t); attr.flags = 0; attr.type = XGE_HAL_CHANNEL_TYPE_FIFO; - attr.userdata = lldev; attr.dtr_init = NULL; attr.dtr_term = xgell_tx_term; + attr.userdata = tx_ring; - if (xge_hal_device_status(lldev->devh, &adapter_status)) { - xge_debug_ll(XGE_ERR, "%s%d: device is not ready " - "adaper status reads 0x%"PRIx64, XGELL_IFNAME, - lldev->instance, (uint64_t)adapter_status); + status = xge_hal_channel_open(lldev->devh, &attr, &tx_ring->channelh, + XGE_HAL_CHANNEL_OC_NORMAL); + if (status != XGE_HAL_OK) { + xge_debug_ll(XGE_ERR, "%s%d: cannot open Tx channel got status " + "code %d", XGELL_IFNAME, lldev->instance, status); return (B_FALSE); } - /* - * Open only configured channels. HAL structures are static, - * so, no worries here.. 
- */ -_next_channel: - xge_list_for_each(item, &hldev->free_channels) { - xge_hal_channel_t *channel = xge_container_of(item, - xge_hal_channel_t, item); - xgell_fifo_t *fifo; - - /* filter on FIFO channels */ - if (channel->type != XGE_HAL_CHANNEL_TYPE_FIFO) - continue; - - fifo = &lldev->fifos[attr.post_qid]; - fifo->lldev = lldev; - attr.userdata = fifo; - - status = xge_hal_channel_open(lldev->devh, &attr, - &fifo->channelh, XGE_HAL_CHANNEL_OC_NORMAL); - if (status != XGE_HAL_OK) { - xge_debug_ll(XGE_ERR, "%s%d: cannot open Tx channel " - "got status code %d", XGELL_IFNAME, - lldev->instance, status); - /* unwind */ - xgell_tx_close(lldev); - return (B_FALSE); - } - - attr.post_qid++; - - /* - * because channel_open() moves xge_list entry - * to the fifos_channels - */ - goto _next_channel; - } + tx_ring->sent_bytes = 0; + tx_ring->live = B_TRUE; return (B_TRUE); } -/* - * xgell_rx_close - * @lldev: the link layer object - * - * Close all Rx channels - */ static void -xgell_rx_close(xgelldev_t *lldev) +xgell_tx_ring_close(xgell_tx_ring_t *tx_ring) { - xge_list_t *item, *list; - xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; - - list = &hldev->ring_channels; - while (!xge_list_is_empty(list)) { - item = xge_list_first_get(list); - xge_hal_channel_t *channel = xge_container_of(item, - xge_hal_channel_t, item); - xgell_ring_t *ring = xge_hal_channel_userdata(channel); - - xge_hal_channel_close(channel, XGE_HAL_CHANNEL_OC_NORMAL); - - /* - * destroy Ring's buffer pool - */ - if (xgell_rx_destroy_buffer_pool(ring) != DDI_SUCCESS) { - xge_debug_ll(XGE_ERR, "unable to destroy Ring%d " - "buffer pool", channel->post_qid); - } - list = &hldev->ring_channels; - } + if (!tx_ring->live) + return; + xge_hal_channel_close(tx_ring->channelh, XGE_HAL_CHANNEL_OC_NORMAL); + tx_ring->live = B_FALSE; } /* - * xgell_rx_open + * xgell_tx_open * @lldev: the link layer object * - * Initialize and open all Rx channels; + * Initialize and open all TX channels. */ static boolean_t -xgell_rx_open(xgelldev_t *lldev) +xgell_tx_open(xgelldev_t *lldev) { - xge_hal_status_e status; - u64 adapter_status; - xge_hal_channel_attr_t attr; - xge_list_t *item; - xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; + xgell_tx_ring_t *tx_ring; + int i; - attr.post_qid = 0; - attr.compl_qid = 0; - attr.callback = xgell_rx_1b_compl; - attr.per_dtr_space = sizeof (xgell_rxd_priv_t); - attr.flags = 0; - attr.type = XGE_HAL_CHANNEL_TYPE_RING; - attr.dtr_init = xgell_rx_dtr_replenish; - attr.dtr_term = xgell_rx_dtr_term; + if (lldev->live_tx_rings != 0) + return (B_TRUE); - if (xge_hal_device_status(lldev->devh, &adapter_status)) { - xge_debug_ll(XGE_ERR, - "%s%d: device is not ready adaper status reads 0x%"PRIx64, - XGELL_IFNAME, lldev->instance, - (uint64_t)adapter_status); - return (B_FALSE); - } + lldev->live_tx_rings = 0; /* - * Open only configured channels. HAL structures are static, - * so, no worries here.. + * Enable rings by reserve sequence to match the h/w sequences. 
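+ * (That is, the rings are opened in increasing index order, and each
+ * software ring is bound to the hardware FIFO with the same number,
+ * since xgell_tx_ring_open() above sets attr.post_qid = tx_ring->index.)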
*/ -_next_channel: - xge_list_for_each(item, &hldev->free_channels) { - xge_hal_channel_t *channel = xge_container_of(item, - xge_hal_channel_t, item); - xgell_ring_t *ring; - - /* filter on RING channels */ - if (channel->type != XGE_HAL_CHANNEL_TYPE_RING) - continue; - - ring = &lldev->rings[attr.post_qid]; - ring->lldev = lldev; - attr.userdata = ring; - - if (xgell_rx_create_buffer_pool(ring) != DDI_SUCCESS) { - xge_debug_ll(XGE_ERR, "unable to create Ring%d " - "buffer pool", attr.post_qid); - /* unwind */ - xgell_rx_close(lldev); - return (B_FALSE); - } + for (i = 0; i < lldev->init_tx_rings; i++) { + tx_ring = &lldev->tx_ring[i]; + tx_ring->index = i; + tx_ring->lldev = lldev; + tx_ring->live = B_FALSE; - status = xge_hal_channel_open(lldev->devh, &attr, - &ring->channelh, XGE_HAL_CHANNEL_OC_NORMAL); - if (status != XGE_HAL_OK) { - xge_debug_ll(XGE_ERR, "%s%d: cannot open Rx channel " - "got status got status code %d", XGELL_IFNAME, - lldev->instance, status); - /* unwind */ - (void) xgell_rx_destroy_buffer_pool(ring); - xgell_rx_close(lldev); + if (!xgell_tx_ring_open(tx_ring)) return (B_FALSE); - } - attr.post_qid++; - - /* - * because chhannel_open() moves xge_list entry - * to the rings channels - */ - goto _next_channel; + lldev->live_tx_rings++; } return (B_TRUE); } +static void +xgell_tx_close(xgelldev_t *lldev) +{ + xgell_tx_ring_t *tx_ring; + int i; + + if (lldev->live_tx_rings == 0) + return; + + /* + * Enable rings by reserve sequence to match the h/w sequences. + */ + for (i = 0; i < lldev->init_tx_rings; i++) { + tx_ring = &lldev->tx_ring[i]; + if (tx_ring->live) { + xgell_tx_ring_close(tx_ring); + lldev->live_tx_rings--; + } + } +} + static int xgell_initiate_start(xgelldev_t *lldev) { @@ -1485,13 +1864,13 @@ xgell_initiate_start(xgelldev_t *lldev) } /* tune jumbo/normal frame UFC counters */ - hldev->config.ring.queue[XGELL_RING_MAIN_QID].rti.ufc_b = \ - maxpkt > XGE_HAL_DEFAULT_MTU ? + hldev->config.ring.queue[XGELL_RX_RING_MAIN].rti.ufc_b = + (maxpkt > XGE_HAL_DEFAULT_MTU) ? XGE_HAL_DEFAULT_RX_UFC_B_J : XGE_HAL_DEFAULT_RX_UFC_B_N; - hldev->config.ring.queue[XGELL_RING_MAIN_QID].rti.ufc_c = \ - maxpkt > XGE_HAL_DEFAULT_MTU ? + hldev->config.ring.queue[XGELL_RX_RING_MAIN].rti.ufc_c = + (maxpkt > XGE_HAL_DEFAULT_MTU) ? XGE_HAL_DEFAULT_RX_UFC_C_J : XGE_HAL_DEFAULT_RX_UFC_C_N; @@ -1515,6 +1894,7 @@ xgell_initiate_start(xgelldev_t *lldev) XGELL_IFNAME, lldev->instance, (uint64_t)adapter_status, status); } + xgell_rx_close(lldev); xge_os_mdelay(1500); return (ENOMEM); } @@ -1531,9 +1911,9 @@ xgell_initiate_start(xgelldev_t *lldev) XGELL_IFNAME, lldev->instance, (uint64_t)adapter_status, status); } - xge_os_mdelay(1500); + xgell_tx_close(lldev); xgell_rx_close(lldev); - + xge_os_mdelay(1500); return (ENOMEM); } @@ -1686,46 +2066,6 @@ xgell_onerr_reset(xgelldev_t *lldev) return (rc); } - -/* - * xgell_m_unicst - * @arg: pointer to device private strucutre(hldev) - * @mac_addr: - * - * This function is called by MAC Layer to set the physical address - * of the XFRAME firmware. 
- */ -static int -xgell_m_unicst(void *arg, const uint8_t *macaddr) -{ - xge_hal_status_e status; - xgelldev_t *lldev = (xgelldev_t *)arg; - xge_hal_device_t *hldev = lldev->devh; - xge_debug_ll(XGE_TRACE, "%s", "MAC_UNICST"); - - xge_debug_ll(XGE_TRACE, "%s", "M_UNICAST"); - - mutex_enter(&lldev->genlock); - - xge_debug_ll(XGE_TRACE, - "setting macaddr: 0x%02x-%02x-%02x-%02x-%02x-%02x", - macaddr[0], macaddr[1], macaddr[2], - macaddr[3], macaddr[4], macaddr[5]); - - status = xge_hal_device_macaddr_set(hldev, 0, (uchar_t *)macaddr); - if (status != XGE_HAL_OK) { - xge_debug_ll(XGE_ERR, "%s%d: can not set mac address", - XGELL_IFNAME, lldev->instance); - mutex_exit(&lldev->genlock); - return (EIO); - } - - mutex_exit(&lldev->genlock); - - return (0); -} - - /* * xgell_m_multicst * @arg: pointer to device private strucutre(hldev) @@ -2039,12 +2379,14 @@ xgell_m_ioctl(void *arg, queue_t *wq, mblk_t *mp) } } -/* ARGSUSED */ + static boolean_t xgell_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { xgelldev_t *lldev = arg; + xge_debug_ll(XGE_TRACE, "xgell_m_getcapab: %x", cap); + switch (cap) { case MAC_CAPAB_HCKSUM: { uint32_t *hcksum_txflags = cap_data; @@ -2063,6 +2405,29 @@ xgell_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) return (B_FALSE); } } + case MAC_CAPAB_RINGS: { + mac_capab_rings_t *cap_rings = cap_data; + + switch (cap_rings->mr_type) { + case MAC_RING_TYPE_RX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = lldev->init_rx_rings; + cap_rings->mr_gnum = lldev->init_rx_groups; + cap_rings->mr_rget = xgell_fill_ring; + cap_rings->mr_gget = xgell_fill_group; + break; + case MAC_RING_TYPE_TX: + cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; + cap_rings->mr_rnum = lldev->init_tx_rings; + cap_rings->mr_gnum = 0; + cap_rings->mr_rget = xgell_fill_ring; + cap_rings->mr_gget = NULL; + break; + default: + break; + } + break; + } default: return (B_FALSE); } @@ -2320,8 +2685,7 @@ xgell_devconfig_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp) return (ENOSPC); } status = xge_hal_aux_device_config_read(lldev->devh, - XGELL_DEVCONF_BUFSIZE, - buf, &retsize); + XGELL_DEVCONF_BUFSIZE, buf, &retsize); if (status != XGE_HAL_OK) { kmem_free(buf, XGELL_DEVCONF_BUFSIZE); xge_debug_ll(XGE_ERR, "device_config_read(): status %d", @@ -2349,6 +2713,9 @@ xgell_device_register(xgelldev_t *lldev, xgell_config_t *config) mac_register_t *macp = NULL; xge_hal_device_t *hldev = (xge_hal_device_t *)lldev->devh; + /* + * Initialize some NDD interface for internal debug. + */ if (nd_load(&lldev->ndp, "pciconf", xgell_pciconf_get, NULL, (caddr_t)lldev) == B_FALSE) goto xgell_ndd_fail; @@ -2393,11 +2760,11 @@ xgell_device_register(xgelldev_t *lldev, xgell_config_t *config) macp->m_min_sdu = 0; macp->m_max_sdu = hldev->config.mtu; macp->m_margin = VLAN_TAGSZ; + macp->m_v12n = MAC_VIRT_LEVEL1; + /* - * Finally, we're ready to register ourselves with the Nemo - * interface; if this succeeds, we're all ready to start() + * MAC Registration. 
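+ * With m_v12n set to MAC_VIRT_LEVEL1 above, a successful mac_register()
+ * lets the MAC layer query MAC_CAPAB_RINGS and drive the rings and
+ * groups through the xgell_fill_ring()/xgell_fill_group() callbacks.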
*/ - if (mac_register(macp, &lldev->mh) != 0) goto xgell_register_fail; diff --git a/usr/src/uts/common/io/xge/drv/xgell.h b/usr/src/uts/common/io/xge/drv/xgell.h index aa8bcc43ff..93845bb655 100644 --- a/usr/src/uts/common/io/xge/drv/xgell.h +++ b/usr/src/uts/common/io/xge/drv/xgell.h @@ -60,7 +60,7 @@ #include <sys/pattr.h> #include <sys/strsun.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #ifdef __cplusplus @@ -69,11 +69,6 @@ extern "C" { #define XGELL_DESC "Xframe I/II 10Gb Ethernet" #define XGELL_IFNAME "xge" -#define XGELL_TX_LEVEL_LOW 8 -#define XGELL_TX_LEVEL_HIGH 32 -#define XGELL_TX_LEVEL_CHECK 3 -#define XGELL_MAX_RING_DEFAULT 8 -#define XGELL_MAX_FIFO_DEFAULT 1 #include <xgehal.h> @@ -93,25 +88,64 @@ extern "C" { #define XGELL_RX_BUFFER_TOTAL XGE_HAL_RING_RXDS_PER_BLOCK(1) * 6 #define XGELL_RX_BUFFER_POST_HIWAT XGE_HAL_RING_RXDS_PER_BLOCK(1) * 5 -/* Control driver to copy or DMA received packets */ -#define XGELL_RX_DMA_LOWAT 256 +/* + * Multiple rings configuration + */ +#define XGELL_RX_RING_MAIN 0 +#define XGELL_TX_RING_MAIN 0 + +#define XGELL_RX_RING_NUM_MIN 1 +#define XGELL_TX_RING_NUM_MIN 1 +#define XGELL_RX_RING_NUM_MAX 8 +#define XGELL_TX_RING_NUM_MAX 1 /* TODO */ +#define XGELL_RX_RING_NUM_DEFAULT XGELL_RX_RING_NUM_MAX +#define XGELL_TX_RING_NUM_DEFAULT XGELL_TX_RING_NUM_MAX + +#define XGELL_MINTR_NUM_MIN 1 +#define XGELL_MINTR_NUM_MAX \ + (XGELL_RX_RING_NUM_MAX + XGELL_TX_RING_NUM_MAX + 1) +#define XGELL_MINTR_NUM_DEFAULT XGELL_MINTR_NUM_MAX + +#define XGELL_CONF_GROUP_POLICY_BASIC 0 +#define XGELL_CONF_GROUP_POLICY_VIRT 1 +#define XGELL_CONF_GROUP_POLICY_PERF 2 +#if 0 +#if defined(__sparc) +#define XGELL_CONF_GROUP_POLICY_DEFAULT XGELL_CONF_GROUP_POLICY_PERF +#else +#define XGELL_CONF_GROUP_POLICY_DEFAULT XGELL_CONF_GROUP_POLICY_VIRT +#endif +#else +/* + * The _PERF configuration enable a fat group of all rx rings, as approachs + * better fanout performance of the primary interface. + */ +#define XGELL_CONF_GROUP_POLICY_DEFAULT XGELL_CONF_GROUP_POLICY_PERF +#endif -#define XGELL_RING_MAIN_QID 0 +#define XGELL_TX_LEVEL_LOW 8 +#define XGELL_TX_LEVEL_HIGH 32 +#define XGELL_TX_LEVEL_CHECK 3 +#define XGELL_MAX_RING_DEFAULT 8 +#define XGELL_MAX_FIFO_DEFAULT 1 -#if defined(__x86) -#define XGELL_TX_DMA_LOWAT 128 +/* Control driver to copy or DMA inbound/outbound packets */ +#if defined(__sparc) +#define XGELL_RX_DMA_LOWAT 256 +#define XGELL_TX_DMA_LOWAT 512 #else -#define XGELL_TX_DMA_LOWAT 512 +#define XGELL_RX_DMA_LOWAT 256 +#define XGELL_TX_DMA_LOWAT 128 #endif /* * Try to collapse up to XGELL_RX_PKT_BURST packets into single mblk * sequence before mac_rx() is called. 
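 * (At the default XGELL_RX_PKT_BURST of 32, a full burst of 1500-byte
 * frames amounts to roughly 48KB handed to mac_rx() in one call.)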
*/ -#define XGELL_RX_PKT_BURST 32 +#define XGELL_RX_PKT_BURST 32 /* About 1s */ -#define XGE_DEV_POLL_TICKS drv_usectohz(1000000) +#define XGE_DEV_POLL_TICKS drv_usectohz(1000000) #define XGELL_LSO_MAXLEN 65535 #define XGELL_CONF_ENABLE_BY_DEFAULT 1 @@ -157,6 +191,7 @@ extern "C" { #define XGE_HAL_DEFAULT_RX_TIMER_AC_EN 1 #define XGE_HAL_DEFAULT_RX_TIMER_VAL 384 +#define XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_A 1024 #define XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_J 2048 #define XGE_HAL_DEFAULT_FIFO_QUEUE_LENGTH_N 4096 #define XGE_HAL_DEFAULT_FIFO_QUEUE_INTR 0 @@ -171,15 +206,14 @@ extern "C" { */ #define XGE_HAL_DEFAULT_FIFO_ALIGNMENT_SIZE 4096 #define XGE_HAL_DEFAULT_FIFO_MAX_ALIGNED_FRAGS 1 -#if defined(__x86) -#define XGE_HAL_DEFAULT_FIFO_FRAGS 128 -#else +#if defined(__sparc) #define XGE_HAL_DEFAULT_FIFO_FRAGS 64 +#else +#define XGE_HAL_DEFAULT_FIFO_FRAGS 128 #endif #define XGE_HAL_DEFAULT_FIFO_FRAGS_THRESHOLD 18 -#define XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_J 2 -#define XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS_N 2 +#define XGE_HAL_DEFAULT_RING_QUEUE_BLOCKS 2 #define XGE_HAL_RING_QUEUE_BUFFER_MODE_DEFAULT 1 #define XGE_HAL_DEFAULT_BACKOFF_INTERVAL_US 64 #define XGE_HAL_DEFAULT_RING_PRIORITY 0 @@ -202,18 +236,15 @@ extern "C" { #define XGE_HAL_DEFAULT_STATS_REFRESH_TIME 1 #if defined(__sparc) -#define XGE_HAL_DEFAULT_MMRB_COUNT \ - XGE_HAL_MAX_MMRB_COUNT -#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION \ - XGE_HAL_EIGHT_SPLIT_TRANSACTION +#define XGE_HAL_DEFAULT_MMRB_COUNT XGE_HAL_MAX_MMRB_COUNT +#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION XGE_HAL_EIGHT_SPLIT_TRANSACTION #else #define XGE_HAL_DEFAULT_MMRB_COUNT 1 /* 1k */ -#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION \ - XGE_HAL_TWO_SPLIT_TRANSACTION +#define XGE_HAL_DEFAULT_SPLIT_TRANSACTION XGE_HAL_TWO_SPLIT_TRANSACTION #endif /* - * default the size of buffers allocated for ndd interface functions + * Default the size of buffers allocated for ndd interface functions */ #define XGELL_STATS_BUFSIZE 8192 #define XGELL_PCICONF_BUFSIZE 2048 @@ -222,17 +253,12 @@ extern "C" { #define XGELL_DEVCONF_BUFSIZE 8192 /* - * xgell_event_e + * Multiple mac address definitions * - * This enumeration derived from xgehal_event_e. It extends it - * for the reason to get serialized context. + * We'll use whole MAC Addresses Configuration Memory for unicast addresses, + * since current multicast implementation in HAL is by enabling promise mode. 
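+ *
+ * As a rough bound: at 8 addresses per ring group, even one group per RX
+ * ring (8 rings maximum) consumes only 64 unicast slots, well within the
+ * 256 entries XGE_HAL_MAX_MAC_ADDRESSES grows to elsewhere in this patch.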
*/ -/* Renamb the macro from HAL */ -#define XGELL_EVENT_BASE XGE_LL_EVENT_BASE -typedef enum xgell_event_e { - /* LL events */ - XGELL_EVENT_RESCHED_NEEDED = XGELL_EVENT_BASE + 1, -} xgell_event_e; +#define XGE_RX_MULTI_MAC_ADDRESSES_MAX 8 /* per ring group */ typedef struct { int rx_pkt_burst; @@ -240,24 +266,27 @@ typedef struct { int rx_buffer_post_hiwat; int rx_dma_lowat; int tx_dma_lowat; - int msix_enable; int lso_enable; + int msix_enable; + int grouping; } xgell_config_t; -typedef struct xgell_ring xgell_ring_t; -typedef struct xgell_fifo xgell_fifo_t; +typedef struct xgell_multi_mac xgell_multi_mac_t; +typedef struct xgell_rx_ring xgell_rx_ring_t; +typedef struct xgell_tx_ring xgell_tx_ring_t; +typedef struct xgelldev xgelldev_t; typedef struct xgell_rx_buffer_t { - struct xgell_rx_buffer_t *next; - void *vaddr; - dma_addr_t dma_addr; - ddi_dma_handle_t dma_handle; - ddi_acc_handle_t dma_acch; - xgell_ring_t *ring; - frtn_t frtn; + struct xgell_rx_buffer_t *next; + void *vaddr; + dma_addr_t dma_addr; + ddi_dma_handle_t dma_handle; + ddi_acc_handle_t dma_acch; + xgell_rx_ring_t *ring; + frtn_t frtn; } xgell_rx_buffer_t; -/* Buffer pool for all rings */ +/* Buffer pool for one rx ring */ typedef struct xgell_rx_buffer_pool_t { uint_t total; /* total buffers */ uint_t size; /* buffer size */ @@ -266,50 +295,92 @@ typedef struct xgell_rx_buffer_pool_t { uint_t post; /* posted buffers */ uint_t post_hiwat; /* hiwat to stop post */ spinlock_t pool_lock; /* buffer pool lock */ + boolean_t live; /* pool status */ xgell_rx_buffer_t *recycle_head; /* recycle list's head */ xgell_rx_buffer_t *recycle_tail; /* recycle list's tail */ uint_t recycle; /* # of rx buffers recycled */ spinlock_t recycle_lock; /* buffer recycle lock */ } xgell_rx_buffer_pool_t; -typedef struct xgelldev xgelldev_t; +struct xgell_multi_mac { + int naddr; /* total supported addresses */ + int naddrfree; /* free addresses slots */ + ether_addr_t mac_addr[XGE_RX_MULTI_MAC_ADDRESSES_MAX]; + boolean_t mac_addr_set[XGE_RX_MULTI_MAC_ADDRESSES_MAX]; +}; -struct xgell_ring { - xge_hal_channel_h channelh; - xgelldev_t *lldev; - mac_resource_handle_t handle; /* per ring cookie */ - xgell_rx_buffer_pool_t bf_pool; +typedef uint_t (*intr_func_t)(caddr_t, caddr_t); + +typedef struct xgell_intr { + uint_t index; + ddi_intr_handle_t *handle; /* DDI interrupt handle */ + intr_func_t *function; /* interrupt function */ + caddr_t arg; /* interrupt source */ +} xgell_intr_t; + +struct xgell_rx_ring { + int index; + boolean_t live; /* ring active status */ + xge_hal_channel_h channelh; /* hardware channel */ + xgelldev_t *lldev; /* driver device */ + mac_ring_handle_t ring_handle; /* call back ring handle */ + mac_group_handle_t group_handle; /* call back group handle */ + uint64_t ring_gen_num; + + xgell_multi_mac_t mmac; /* per group multiple addrs */ + xgell_rx_buffer_pool_t bf_pool; /* per ring buffer pool */ + int received_bytes; /* total received bytes */ + int intr_bytes; /* interrupt received bytes */ + int poll_bytes; /* bytes to be polled up */ + int polled_bytes; /* total polled bytes */ + mblk_t *poll_mp; /* polled messages */ + + spinlock_t ring_lock; /* per ring lock */ }; -struct xgell_fifo { - xge_hal_channel_h channelh; - xgelldev_t *lldev; - int level_low; +struct xgell_tx_ring { + int index; + boolean_t live; /* ring active status */ + xge_hal_channel_h channelh; /* hardware channel */ + xgelldev_t *lldev; /* driver device */ + mac_ring_handle_t ring_handle; /* call back ring handle */ + int sent_bytes; /* bytes sent 
though the ring */ + + boolean_t need_resched; }; struct xgelldev { - caddr_t ndp; + volatile int is_initialized; + volatile int in_reset; + kmutex_t genlock; mac_handle_t mh; int instance; dev_info_t *dev_info; xge_hal_device_h devh; - xgell_ring_t rings[XGE_HAL_MAX_RING_NUM]; - xgell_fifo_t fifos[XGE_HAL_MAX_FIFO_NUM]; - int resched_avail; - int resched_send; - int resched_retry; - int tx_copied_max; - volatile int is_initialized; - xgell_config_t config; - volatile int in_reset; + caddr_t ndp; timeout_id_t timeout_id; - kmutex_t genlock; + + int init_rx_rings; + int init_tx_rings; + int init_rx_groups; + + int live_rx_rings; + int live_tx_rings; + xgell_rx_ring_t rx_ring[XGELL_RX_RING_NUM_DEFAULT]; + xgell_tx_ring_t tx_ring[XGELL_TX_RING_NUM_DEFAULT]; + + int tx_copied_max; + + xgell_intr_t intrs[XGELL_MINTR_NUM_DEFAULT]; + ddi_intr_handle_t *intr_table; uint_t intr_table_size; int intr_type; int intr_cnt; uint_t intr_pri; int intr_cap; + + xgell_config_t config; }; typedef struct { diff --git a/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h b/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h index 5852bb9e9a..5275da409a 100644 --- a/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h +++ b/usr/src/uts/common/io/xge/hal/include/xgehal-channel.h @@ -21,6 +21,11 @@ * Copyright (c) 2002-2006 Neterion, Inc. */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + #ifndef XGE_HAL_CHANNEL_H #define XGE_HAL_CHANNEL_H @@ -69,7 +74,8 @@ typedef enum xge_hal_channel_type_e { typedef enum xge_hal_channel_flag_e { XGE_HAL_CHANNEL_FLAG_NONE = 0x0, XGE_HAL_CHANNEL_FLAG_USE_TX_LOCK = 0x1, - XGE_HAL_CHANNEL_FLAG_FREE_RXD = 0x2 + XGE_HAL_CHANNEL_FLAG_FREE_RXD = 0x2, + XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING = 0x4 } xge_hal_channel_flag_e; /** diff --git a/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h b/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h index e79774e329..f0b0a3520d 100644 --- a/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h +++ b/usr/src/uts/common/io/xge/hal/include/xgehal-regs.h @@ -21,6 +21,11 @@ * Copyright (c) 2002-2006 Neterion, Inc. */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + #ifndef XGE_HAL_REGS_H #define XGE_HAL_REGS_H @@ -814,8 +819,8 @@ typedef struct { u64 rmac_cfg_key; #define XGE_HAL_RMAC_CFG_KEY(val) vBIT(val,0,16) -#define XGE_HAL_MAX_MAC_ADDRESSES 64 -#define XGE_HAL_MAC_MC_ALL_MC_ADDR_OFFSET 63 +#define XGE_HAL_MAX_MAC_ADDRESSES 256 +#define XGE_HAL_MAC_MC_ALL_MC_ADDR_OFFSET 255 #define XGE_HAL_MAX_MAC_ADDRESSES_HERC 256 #define XGE_HAL_MAC_MC_ALL_MC_ADDR_OFFSET_HERC 255 diff --git a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c index 5b70ea1378..d08c1d58bf 100644 --- a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c +++ b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device-fp.c @@ -21,6 +21,11 @@ * Copyright (c) 2002-2006 Neterion, Inc. */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
 #ifdef XGE_DEBUG_FP
 #include "xgehal-device.h"
 #endif
@@ -444,7 +449,9 @@ xge_hal_device_poll_rx_channels(xge_hal_device_t *hldev, int *got_rx)
 		if (hldev->terminating)
 			return XGE_HAL_OK;
 		channel = xge_container_of(item, xge_hal_channel_t, item);
-		(void) xge_hal_device_poll_rx_channel(channel, got_rx);
+		if (!(channel->flags & XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING)) {
+			(void) xge_hal_device_poll_rx_channel(channel, got_rx);
+		}
 	}
 
 	return XGE_HAL_OK;
@@ -483,6 +490,21 @@ xge_hal_device_poll_tx_channels(xge_hal_device_t *hldev, int *got_tx)
 }
 
 /**
+ * xge_hal_device_rx_channel_enable_polling - Exclude an RX channel from
+ * the common poll loop so that its owner polls it explicitly.
+ * @channel: HAL channel handle.
+ */
+__HAL_STATIC_DEVICE __HAL_INLINE_DEVICE void
+xge_hal_device_rx_channel_enable_polling(xge_hal_channel_t *channel)
+{
+	channel->flags |= XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING;
+}
+
+__HAL_STATIC_DEVICE __HAL_INLINE_DEVICE void
+xge_hal_device_rx_channel_disable_polling(xge_hal_channel_t *channel)
+{
+	channel->flags &= ~XGE_HAL_CHANNEL_FLAG_USE_RX_POLLING;
+}
+
+/**
  * xge_hal_device_mask_tx - Mask Tx interrupts.
  * @hldev: HAL device handle.
  *
diff --git a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c
index 346f10b8bc..4cf18c2621 100644
--- a/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c
+++ b/usr/src/uts/common/io/xge/hal/xgehal/xgehal-device.c
@@ -5044,7 +5044,7 @@ xge_hal_device_macaddr_find(xge_hal_device_t *hldev, macaddr_t wanted)
 		return XGE_HAL_ERR_INVALID_DEVICE;
 	}
 
-	for (i=1; i<XGE_HAL_MAX_MAC_ADDRESSES; i++) {
+	for (i=0; i<XGE_HAL_MAX_MAC_ADDRESSES; i++) {
 		(void) xge_hal_device_macaddr_get(hldev, i, &macaddr);
 		if (!xge_os_memcmp(macaddr, wanted, sizeof(macaddr_t))) {
 			return i;
diff --git a/usr/src/uts/common/os/exacct.c b/usr/src/uts/common/os/exacct.c
index cb8ced5239..43a7298c7b 100644
--- a/usr/src/uts/common/os/exacct.c
+++ b/usr/src/uts/common/os/exacct.c
@@ -23,8 +23,6 @@
 * Use is subject to license terms.
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/exacct.h> #include <sys/exacct_catalog.h> #include <sys/disp.h> @@ -43,6 +41,7 @@ #include <sys/sysmacros.h> #include <sys/bitmap.h> #include <sys/msacct.h> +#include <sys/mac.h> /* * exacct usage and recording routines @@ -1163,6 +1162,271 @@ exacct_commit_proc(proc_t *p, int wstat) } static int +exacct_attach_netstat_item(net_stat_t *ns, ea_object_t *record, int res) +{ + int attached = 1; + + switch (res) { + case AC_NET_NAME: + (void) ea_attach_item(record, ns->ns_name, + strlen(ns->ns_name) + 1, EXT_STRING | EXD_NET_STATS_NAME); + break; + case AC_NET_CURTIME: + { + uint64_t now; + timestruc_t ts; + + gethrestime(&ts); + now = (uint64_t)(ulong_t)ts.tv_sec; + (void) ea_attach_item(record, &now, sizeof (uint64_t), + EXT_UINT64 | EXD_NET_STATS_CURTIME); + } + break; + case AC_NET_IBYTES: + (void) ea_attach_item(record, &ns->ns_ibytes, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IBYTES); + break; + case AC_NET_OBYTES: + (void) ea_attach_item(record, &ns->ns_obytes, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OBYTES); + break; + case AC_NET_IPKTS: + (void) ea_attach_item(record, &ns->ns_ipackets, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IPKTS); + break; + case AC_NET_OPKTS: + (void) ea_attach_item(record, &ns->ns_opackets, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OPKTS); + break; + case AC_NET_IERRPKTS: + (void) ea_attach_item(record, &ns->ns_ierrors, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_IERRPKTS); + break; + case AC_NET_OERRPKTS: + (void) ea_attach_item(record, &ns->ns_oerrors, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_STATS_OERRPKTS); + break; + default: + attached = 0; + } + return (attached); +} + +static int +exacct_attach_netdesc_item(net_desc_t *nd, ea_object_t *record, int res) +{ + int attached = 1; + + switch (res) { + case AC_NET_NAME: + (void) ea_attach_item(record, nd->nd_name, + strlen(nd->nd_name) + 1, EXT_STRING | EXD_NET_DESC_NAME); + break; + case AC_NET_DEVNAME: + (void) ea_attach_item(record, nd->nd_devname, + strlen(nd->nd_devname) + 1, EXT_STRING | + EXD_NET_DESC_DEVNAME); + break; + case AC_NET_EHOST: + (void) ea_attach_item(record, &nd->nd_ehost, + sizeof (nd->nd_ehost), EXT_RAW | EXD_NET_DESC_EHOST); + break; + case AC_NET_EDEST: + (void) ea_attach_item(record, &nd->nd_edest, + sizeof (nd->nd_edest), EXT_RAW | EXD_NET_DESC_EDEST); + break; + case AC_NET_VLAN_TPID: + (void) ea_attach_item(record, &nd->nd_vlan_tpid, + sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TPID); + break; + case AC_NET_VLAN_TCI: + (void) ea_attach_item(record, &nd->nd_vlan_tci, + sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_VLAN_TCI); + break; + case AC_NET_SAP: + (void) ea_attach_item(record, &nd->nd_sap, + sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_SAP); + break; + case AC_NET_PRIORITY: + (void) ea_attach_item(record, &nd->nd_priority, + sizeof (ushort_t), EXT_UINT16 | EXD_NET_DESC_PRIORITY); + break; + case AC_NET_BWLIMIT: + (void) ea_attach_item(record, &nd->nd_bw_limit, + sizeof (uint64_t), EXT_UINT64 | EXD_NET_DESC_BWLIMIT); + break; + case AC_NET_SADDR: + if (nd->nd_isv4) { + (void) ea_attach_item(record, &nd->nd_saddr[3], + sizeof (uint32_t), EXT_UINT32 | + EXD_NET_DESC_V4SADDR); + } else { + (void) ea_attach_item(record, &nd->nd_saddr, + sizeof (nd->nd_saddr), EXT_RAW | + EXD_NET_DESC_V6SADDR); + } + break; + case AC_NET_DADDR: + if (nd->nd_isv4) { + (void) ea_attach_item(record, &nd->nd_daddr[3], + sizeof (uint32_t), EXT_UINT32 | + EXD_NET_DESC_V4DADDR); + } else { + (void) 
ea_attach_item(record, &nd->nd_daddr, + sizeof (nd->nd_daddr), EXT_RAW | + EXD_NET_DESC_V6DADDR); + } + break; + case AC_NET_SPORT: + (void) ea_attach_item(record, &nd->nd_sport, + sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_SPORT); + break; + case AC_NET_DPORT: + (void) ea_attach_item(record, &nd->nd_dport, + sizeof (uint16_t), EXT_UINT16 | EXD_NET_DESC_DPORT); + break; + case AC_NET_PROTOCOL: + (void) ea_attach_item(record, &nd->nd_protocol, + sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_PROTOCOL); + break; + case AC_NET_DSFIELD: + (void) ea_attach_item(record, &nd->nd_dsfield, + sizeof (uint8_t), EXT_UINT8 | EXD_NET_DESC_DSFIELD); + break; + default: + attached = 0; + } + return (attached); +} + +static ea_object_t * +exacct_assemble_net_record(void *ninfo, ulong_t *mask, ea_catalog_t record_type, + int what) +{ + int res; + int count; + ea_object_t *record; + + /* + * Assemble usage values into group. + */ + record = ea_alloc_group(EXT_GROUP | EXC_DEFAULT | record_type); + for (res = 1, count = 0; res <= AC_NET_MAX_RES; res++) + if (BT_TEST(mask, res)) { + if (what == EX_NET_LNDESC_REC || + what == EX_NET_FLDESC_REC) { + count += exacct_attach_netdesc_item( + (net_desc_t *)ninfo, record, res); + } else { + count += exacct_attach_netstat_item( + (net_stat_t *)ninfo, record, res); + } + } + if (count == 0) { + ea_free_object(record, EUP_ALLOC); + record = NULL; + } + return (record); +} + +int +exacct_assemble_net_usage(ac_info_t *ac_net, void *ninfo, + int (*callback)(ac_info_t *, void *, size_t, void *, size_t, size_t *), + void *ubuf, size_t ubufsize, size_t *actual, int what) +{ + ulong_t mask[AC_MASK_SZ]; + ea_object_t *net_desc; + ea_catalog_t record_type; + void *buf; + size_t bufsize; + int ret; + + mutex_enter(&ac_net->ac_lock); + if (ac_net->ac_state == AC_OFF) { + mutex_exit(&ac_net->ac_lock); + return (ENOTACTIVE); + } + bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ); + mutex_exit(&ac_net->ac_lock); + + switch (what) { + case EX_NET_LNDESC_REC: + record_type = EXD_GROUP_NET_LINK_DESC; + break; + case EX_NET_LNSTAT_REC: + record_type = EXD_GROUP_NET_LINK_STATS; + break; + case EX_NET_FLDESC_REC: + record_type = EXD_GROUP_NET_FLOW_DESC; + break; + case EX_NET_FLSTAT_REC: + record_type = EXD_GROUP_NET_FLOW_STATS; + break; + } + + net_desc = exacct_assemble_net_record(ninfo, mask, record_type, what); + if (net_desc == NULL) + return (0); + + /* + * Pack object into buffer and pass to callback. + */ + bufsize = ea_pack_object(net_desc, NULL, 0); + buf = kmem_alloc(bufsize, KM_NOSLEEP); + if (buf == NULL) + return (ENOMEM); + + (void) ea_pack_object(net_desc, buf, bufsize); + + ret = callback(ac_net, ubuf, ubufsize, buf, bufsize, actual); + + /* + * Free all previously allocations. + */ + kmem_free(buf, bufsize); + ea_free_object(net_desc, EUP_ALLOC); + return (ret); +} + +int +exacct_commit_netinfo(void *arg, int what) +{ + size_t size; + ulong_t mask[AC_MASK_SZ]; + struct exacct_globals *acg; + ac_info_t *ac_net; + + if (exacct_zone_key == ZONE_KEY_UNINITIALIZED) { + /* + * acctctl module not loaded. Nothing to do. + */ + return (ENOTACTIVE); + } + + /* + * Even though each zone nominally has its own flow accounting settings + * (ac_flow), these are only maintained by and for the global zone. + * + * If this were to change in the future, this function should grow a + * second zoneid (or zone) argument, and use the corresponding zone's + * settings rather than always using those of the global zone. 
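+ *
+ * An illustrative call, roughly as the mac usage logger issues it for
+ * one link's statistics record (ns being a filled-in net_stat_t):
+ *
+ *	(void) exacct_commit_netinfo((void *)&ns, EX_NET_LNSTAT_REC);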
+ */ + acg = zone_getspecific(exacct_zone_key, global_zone); + ac_net = &acg->ac_net; + + mutex_enter(&ac_net->ac_lock); + if (ac_net->ac_state == AC_OFF) { + mutex_exit(&ac_net->ac_lock); + return (ENOTACTIVE); + } + bt_copy(&ac_net->ac_mask[0], mask, AC_MASK_SZ); + mutex_exit(&ac_net->ac_lock); + + return (exacct_assemble_net_usage(ac_net, arg, exacct_commit_callback, + NULL, 0, &size, what)); +} + +static int exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res) { int attached = 1; diff --git a/usr/src/uts/common/inet/ip/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c index 3b5b3435d9..722c793b79 100644 --- a/usr/src/uts/common/inet/ip/ip_cksum.c +++ b/usr/src/uts/common/os/ip_cksum.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,13 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/inttypes.h> #include <sys/systm.h> @@ -36,6 +33,7 @@ #include <sys/ddi.h> #include <sys/vtrace.h> #include <inet/sctp_crc32.h> +#include <inet/ip.h> #include <sys/multidata.h> #include <sys/multidata_impl.h> @@ -154,14 +152,14 @@ ip_cksum(mblk_t *mp, int offset, uint_t sum) */ if (mlen) mlen += dp->db_cksumend - - dp->db_cksumstuff; + - dp->db_cksumstuff; else { w = (ushort_t *)(mp->b_rptr + dp->db_cksumstuff); if (is_odd(w)) goto slow; mlen = dp->db_cksumend - - dp->db_cksumstuff; + - dp->db_cksumstuff; } } else if (mlen == 0) return (psum); @@ -239,7 +237,7 @@ slow1: int odd; douio: odd = is_odd(dp->db_cksumstuff - - dp->db_cksumstart); + dp->db_cksumstart); if (pmlen == -1) { /* * Previous mlen was odd, so swap @@ -262,7 +260,7 @@ slow1: */ if (mlen) mlen += dp->db_cksumend - - dp->db_cksumstuff; + - dp->db_cksumstuff; else { w = (ushort_t *)(mp->b_rptr + dp->db_cksumstuff); @@ -385,7 +383,7 @@ done: sum = (sum & 0xFFFF) + (sum >> 16); sum = (sum & 0xFFFF) + (sum >> 16); TRACE_3(TR_FAC_IP, TR_IP_CKSUM_END, - "ip_cksum_end:(%S) type %d (%X)", "ip_cksum", 1, sum); + "ip_cksum_end:(%S) type %d (%X)", "ip_cksum", 1, sum); return (sum); } @@ -537,3 +535,30 @@ ip_md_cksum(pdesc_t *pd, int offset, uint_t sum) return (sum); } + +/* Return the IP checksum for the IP header at "iph". 
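+ * The fixed 20-byte header is summed as ten 16-bit words, options (if
+ * any) are folded in two words per iteration, the carries are added
+ * back, and the one's complement is taken, normalizing 0xffff to 0.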
*/ +uint16_t +ip_csum_hdr(ipha_t *ipha) +{ + uint16_t *uph; + uint32_t sum; + int opt_len; + + opt_len = (ipha->ipha_version_and_hdr_length & 0xF) - + IP_SIMPLE_HDR_LENGTH_IN_WORDS; + uph = (uint16_t *)ipha; + sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + + uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; + if (opt_len > 0) { + do { + sum += uph[10]; + sum += uph[11]; + uph += 2; + } while (--opt_len); + } + sum = (sum & 0xFFFF) + (sum >> 16); + sum = ~(sum + (sum >> 16)) & 0xFFFF; + if (sum == 0xffff) + sum = 0; + return ((uint16_t)sum); +} diff --git a/usr/src/uts/common/os/modhash.c b/usr/src/uts/common/os/modhash.c index 3c63231253..4d52a9eb66 100644 --- a/usr/src/uts/common/os/modhash.c +++ b/usr/src/uts/common/os/modhash.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * mod_hash: flexible hash table implementation. * @@ -816,6 +814,22 @@ mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, return (res); } +int +mod_hash_find_cb_rval(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, + int (*find_cb)(mod_hash_key_t, mod_hash_val_t), int *cb_rval) +{ + int res; + + rw_enter(&hash->mh_contents, RW_READER); + res = i_mod_hash_find_nosync(hash, key, val); + if (res == 0) { + *cb_rval = find_cb(key, *val); + } + rw_exit(&hash->mh_contents); + + return (res); +} + void i_mod_hash_walk_nosync(mod_hash_t *hash, uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg) diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 23c5e91475..b71b956f8a 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -54,6 +54,7 @@ #include <sys/vfs.h> #include <sys/mntent.h> #include <sys/contract_impl.h> +#include <sys/dld_ioc.h> /* * There are two possible layers of privilege routines and two possible @@ -2267,3 +2268,23 @@ secpolicy_xvm_control(const cred_t *cr) return (EPERM); return (0); } + +/* + * secpolicy_dld_ioctl + * + * Determine if the subject has permission to use certain dld ioctls. + * Each ioctl should require a limited number of privileges. A large + * number indicates a poor design. + */ +int +secpolicy_dld_ioctl(const cred_t *cr, const char *dld_priv, const char *msg) +{ + int rv; + + if ((rv = priv_getbyname(dld_priv, 0)) >= 0) { + return (PRIV_POLICY(cr, rv, B_FALSE, EPERM, msg)); + } + /* priv_getbyname() returns -ve errno */ + return (-rv); + +} diff --git a/usr/src/uts/common/inet/sctp_crc32.c b/usr/src/uts/common/os/sctp_crc32.c index 21dcaf18fd..38e049e440 100644 --- a/usr/src/uts/common/inet/sctp_crc32.c +++ b/usr/src/uts/common/os/sctp_crc32.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
 */
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 
 /*
@@ -68,7 +65,7 @@ static uint32_t
 flip32(uint32_t w)
 {
 	return (((w >> 24) | ((w >> 8) & 0xff00) | ((w << 8) & 0xff0000) |
-		(w << 24)));
+	    (w << 24)));
 }
 #endif
 
diff --git a/usr/src/uts/common/os/space.c b/usr/src/uts/common/os/space.c
index 6edebecdfe..6ed5e749f1 100644
--- a/usr/src/uts/common/os/space.c
+++ b/usr/src/uts/common/os/space.c
@@ -359,23 +359,14 @@ space_free(char *key)
 const uint32_t crc32_table[256] = { CRC32_TABLE };
 
 /*
- * We need to fanout load from NIC which can overwhelm a single
- * CPU. A 10Gb NIC interrupting a single CPU is a good example.
- * Instead of fanning out to random CPUs, it a big performance
- * win if you can fanout to the threads on the same core (niagara)
- * that is taking interrupts.
- *
- * We need a better mechanism to figure out the other threads on
- * the same core or cores on the same chip which share caches etc.
- * but for time being, this will suffice.
+ * We need to fan out load from a NIC that can overwhelm a single CPU.
+ * This is especially important on systems with slower CPUs (the sun4v
+ * architecture). mac_soft_ring_enable is false on all systems except
+ * sun4v, where soft rings are enabled by default (see
+ * sun4v/os/mach_startup.c).
 */
-#define NUMBER_OF_THREADS_PER_CPU 4
-uint_t ip_threads_per_cpu = NUMBER_OF_THREADS_PER_CPU;
-
-/* Global flag to enable/disable soft ring facility */
-boolean_t ip_squeue_soft_ring = B_FALSE;
+boolean_t mac_soft_ring_enable = B_FALSE;
 
 /*
 * Global iscsi boot prop
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index cd8a0a2a62..442ced2b51 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -27,8 +27,6 @@
 * Use is subject to license terms.
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/param.h> @@ -8450,18 +8448,25 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA); if (mp->b_datap->db_type == M_DATA) { if (flags != NULL) { - *flags = DB_CKSUMFLAGS(mp); - if (*flags & HCK_PARTIALCKSUM) { - if (start != NULL) - *start = (uint32_t)DB_CKSUMSTART(mp); - if (stuff != NULL) - *stuff = (uint32_t)DB_CKSUMSTUFF(mp); - if (end != NULL) - *end = (uint32_t)DB_CKSUMEND(mp); + *flags = DB_CKSUMFLAGS(mp) & (HCK_IPV4_HDRCKSUM | + HCK_PARTIALCKSUM | HCK_FULLCKSUM | + HCK_FULLCKSUM_OK); + if ((*flags & (HCK_PARTIALCKSUM | + HCK_FULLCKSUM)) != 0) { if (value != NULL) *value = (uint32_t)DB_CKSUM16(mp); - } else if ((*flags & HW_LSO) && (value != NULL)) - *value = (uint32_t)DB_LSOMSS(mp); + if ((*flags & HCK_PARTIALCKSUM) != 0) { + if (start != NULL) + *start = + (uint32_t)DB_CKSUMSTART(mp); + if (stuff != NULL) + *stuff = + (uint32_t)DB_CKSUMSTUFF(mp); + if (end != NULL) + *end = + (uint32_t)DB_CKSUMEND(mp); + } + } } } else { pattrinfo_t hck_attr = {PATTR_HCKSUM}; @@ -8488,6 +8493,28 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, } } +void +lso_info_set(mblk_t *mp, uint32_t mss, uint32_t flags) +{ + ASSERT(DB_TYPE(mp) == M_DATA); + + /* Set the flags */ + DB_LSOFLAGS(mp) |= flags; + DB_LSOMSS(mp) = mss; +} + +void +lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags) +{ + ASSERT(DB_TYPE(mp) == M_DATA); + + if (flags != NULL) { + *flags = DB_CKSUMFLAGS(mp) & HW_LSO; + if ((*flags != 0) && (mss != NULL)) + *mss = (uint32_t)DB_LSOMSS(mp); + } +} + /* * Checksum buffer *bp for len bytes with psum partial checksum, * or 0 if none, and return the 16 bit partial checksum. diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 5fe7393f56..cecccf50ab 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -191,6 +191,7 @@ CHKHDRS= \ dld_impl.h \ dld_ioc.h \ dls.h \ + dls_mgmt.h \ dls_impl.h \ dma_i8237A.h \ dnlc.h \ @@ -353,7 +354,13 @@ CHKHDRS= \ lwp_upimutex_impl.h \ lpif.h \ mac.h \ + mac_client.h \ + mac_client_impl.h \ + mac_flow.h \ + mac_flow_impl.h \ mac_impl.h \ + mac_provider.h \ + mac_soft_ring.h \ machelf.h \ map.h \ md4.h \ @@ -418,6 +425,7 @@ CHKHDRS= \ pci.h \ pcie.h \ pci_impl.h \ + pci_tools.h \ pcmcia.h \ pctypes.h \ pem.h \ diff --git a/usr/src/uts/common/sys/acctctl.h b/usr/src/uts/common/sys/acctctl.h index 5019d36c4c..1dfa8e8577 100644 --- a/usr/src/uts/common/sys/acctctl.h +++ b/usr/src/uts/common/sys/acctctl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_ACCTCTL_H #define _SYS_ACCTCTL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/bitmap.h> #include <sys/sysmacros.h> @@ -44,10 +41,11 @@ extern "C" { /* * modes */ -#define AC_PROC (0x1 << 28) /* change process accounting settings */ -#define AC_TASK (0x2 << 28) /* change task accounting settings */ -#define AC_FLOW (0x4 << 28) /* change flow accounting settings */ -#define AC_MODE(x) ((x) & 0xf0000000) +#define AC_PROC (0x1 << 24) /* change process accounting settings */ +#define AC_TASK (0x2 << 24) /* change task accounting settings */ +#define AC_FLOW (0x4 << 24) /* change flow accounting settings */ +#define AC_NET (0x8 << 24) /* change network accounting settings */ +#define AC_MODE(x) ((x) & 0xff000000) /* * options @@ -58,7 +56,7 @@ extern "C" { #define AC_RES_GET (0x08) /* get a list of enabled resources */ #define AC_STATE_SET (0x10) /* set accounting mode state (on/off) */ #define AC_STATE_GET (0x20) /* get accounting mode state */ -#define AC_OPTION(x) ((x) & 0x0fffffff) +#define AC_OPTION(x) ((x) & 0x00ffffff) /* * Process accounting resource IDs @@ -113,8 +111,36 @@ extern "C" { #define AC_FLOW_ANAME 13 /* action instance name */ #define AC_FLOW_MAX_RES 13 /* must be equal to the number above */ -#define AC_MAX_RES_TMP MAX(AC_PROC_MAX_RES, AC_TASK_MAX_RES) -#define AC_MAX_RES MAX(AC_MAX_RES_TMP, AC_FLOW_MAX_RES) +/* + * Network accounting resource IDs + */ +#define AC_NET_NAME 1 /* flow name */ +#define AC_NET_EHOST 2 /* ethernet source address */ +#define AC_NET_EDEST 3 /* ethernet destination address */ +#define AC_NET_VLAN_TPID 4 /* VLAN protocol ID */ +#define AC_NET_VLAN_TCI 5 /* VLAN tag control info. */ +#define AC_NET_SAP 6 /* SAP */ +#define AC_NET_PRIORITY 7 /* Priority */ +#define AC_NET_BWLIMIT 8 /* Bandwidth limit */ +#define AC_NET_DEVNAME 9 /* Device name */ +#define AC_NET_SADDR 10 /* Source IP address */ +#define AC_NET_DADDR 11 /* Dest IP address */ +#define AC_NET_SPORT 12 /* Source Port */ +#define AC_NET_DPORT 13 /* Dest Port */ +#define AC_NET_PROTOCOL 14 /* Protocol */ +#define AC_NET_DSFIELD 15 /* DiffServ field */ +#define AC_NET_CURTIME 16 /* Current Time */ +#define AC_NET_IBYTES 17 /* Inbound Bytes */ +#define AC_NET_OBYTES 18 /* Outbound Bytes */ +#define AC_NET_IPKTS 19 /* Inbound Packets */ +#define AC_NET_OPKTS 20 /* Outbound Packets */ +#define AC_NET_IERRPKTS 21 /* Inbound Error Packets */ +#define AC_NET_OERRPKTS 22 /* Outbound Error Packets */ +#define AC_NET_MAX_RES 22 /* must be equal to the number above */ + +#define AC_MAX_RES \ + MAX(MAX(MAX(AC_PROC_MAX_RES, AC_TASK_MAX_RES), AC_FLOW_MAX_RES), \ + AC_NET_MAX_RES) #define AC_MASK_SZ BT_BITOUL(AC_MAX_RES + 1) /* @@ -150,7 +176,7 @@ extern zone_key_t exacct_zone_key; /* * Per-zone exacct settings. Each zone may have its own settings for - * process, task, and flow accounting. + * process, task, flow, and network accounting. 
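 *
 * Network accounting is requested through the same acctctl(2) command
 * word layout as the others, e.g. AC_NET | AC_STATE_SET to switch it on
 * and AC_NET | AC_RES_SET to choose which AC_NET_* resources to record.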
* * Per-zone flow accounting has not yet been implemented, so this * provides zones with the view that flow accounting in the zone hasn't @@ -164,6 +190,7 @@ struct exacct_globals { ac_info_t ac_task; ac_info_t ac_proc; ac_info_t ac_flow; + ac_info_t ac_net; list_node_t ac_link; }; diff --git a/usr/src/uts/common/sys/aggr.h b/usr/src/uts/common/sys/aggr.h index 740ac7f6f9..c63cc9e99f 100644 --- a/usr/src/uts/common/sys/aggr.h +++ b/usr/src/uts/common/sys/aggr.h @@ -28,9 +28,8 @@ #include <sys/types.h> #include <sys/ethernet.h> -#include <sys/mac.h> -#include <sys/dls.h> #include <sys/param.h> +#include <sys/mac.h> #include <sys/dld_ioc.h> #ifdef __cplusplus @@ -38,7 +37,7 @@ extern "C" { #endif /* - * Note that the datastructures defined here define an ioctl interface + * Note that the data structures defined here define an ioctl interface * that is shared betwen user and kernel space. The aggr driver thus * assumes that the structures have identical layout and size when * compiled in either IPL32 or LP64. diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 62fe0de59b..a1f7e82849 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -27,8 +27,10 @@ #define _SYS_AGGR_IMPL_H #include <sys/types.h> -#include <sys/mac.h> #include <sys/mac_ether.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> #include <sys/aggr_lacp.h> #ifdef __cplusplus @@ -46,6 +48,33 @@ extern "C" { #define AGGR_MODIFY_LACP_TIMER 0x08 /* + * Possible value of aggr_rseudo_rx_ring_t.arr_flags. Set when the ring entry + * in the pseudo RX group is used. + */ +#define MAC_PSEUDO_RING_INUSE 0x01 + +typedef struct aggr_unicst_addr_s { + uint8_t aua_addr[ETHERADDRL]; + struct aggr_unicst_addr_s *aua_next; +} aggr_unicst_addr_t; + +typedef struct aggr_pseudo_rx_ring_s { + mac_ring_handle_t arr_rh; /* filled in by aggr_fill_ring() */ + struct aggr_port_s *arr_port; + mac_ring_handle_t arr_hw_rh; + uint_t arr_flags; + uint64_t arr_gen; +} aggr_pseudo_rx_ring_t; + +typedef struct aggr_pseudo_rx_group_s { + struct aggr_grp_s *arg_grp; /* filled in by aggr_fill_group() */ + mac_group_handle_t arg_gh; /* filled in by aggr_fill_group() */ + aggr_unicst_addr_t *arg_macaddr; + aggr_pseudo_rx_ring_t arg_rings[MAX_RINGS_PER_GROUP]; + uint_t arg_ring_cnt; +} aggr_pseudo_rx_group_t; + +/* * A link aggregation MAC port. * Note that lp_next is protected by the lg_lock of the group the * port is part of. 
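 * In this patch the port also carries the handles required by the new
 * MAC client interfaces (lp_mch, lp_mphp, lp_mah) and, where the NIC
 * provides one, the underlying hardware RX group (lp_hwgh) whose rings
 * back the pseudo RX rings declared above.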
@@ -63,13 +92,13 @@ typedef struct aggr_port_s { lp_collector_enabled : 1, lp_promisc_on : 1, lp_no_link_update : 1, - lp_pad_bits : 27; - uint32_t lp_closing; + lp_grp_added : 1, + lp_closing : 1, + lp_pad_bits : 25; mac_handle_t lp_mh; + mac_client_handle_t lp_mch; const mac_info_t *lp_mip; mac_notify_handle_t lp_mnh; - mac_rx_handle_t lp_mrh; - krwlock_t lp_lock; uint_t lp_tx_idx; /* idx in group's tx array */ uint64_t lp_ifspeed; link_state_t lp_link_state; @@ -78,15 +107,15 @@ typedef struct aggr_port_s { uint64_t lp_ether_stat[ETHER_NSTAT]; aggr_lacp_port_t lp_lacp; /* LACP state */ lacp_stats_t lp_lacp_stats; - const mac_txinfo_t *lp_txinfo; uint32_t lp_margin; -} aggr_port_t; + mac_promisc_handle_t lp_mphp; + mac_unicast_handle_t lp_mah; -typedef struct lg_mcst_addr_s lg_mcst_addr_t; -struct lg_mcst_addr_s { - lg_mcst_addr_t *lg_mcst_nextp; - uint8_t lg_mcst_addr[MAXMACADDRLEN]; -}; + /* List of non-primary addresses that requires promiscous mode set */ + aggr_unicst_addr_t *lp_prom_addr; + /* handle of the underlying HW RX group */ + mac_group_handle_t lp_hwgh; +} aggr_port_t; /* * A link aggregation group. @@ -105,7 +134,6 @@ struct lg_mcst_addr_s { * */ typedef struct aggr_grp_s { - krwlock_t lg_lock; datalink_id_t lg_linkid; uint16_t lg_key; /* key (group port number) */ uint32_t lg_refs; /* refcount */ @@ -116,16 +144,15 @@ typedef struct aggr_grp_s { lg_addr_fixed : 1, /* fixed MAC address? */ lg_started : 1, /* group started? */ lg_promisc : 1, /* in promiscuous mode? */ - lg_gldv3_polling : 1, lg_zcopy : 1, lg_vlan : 1, lg_force : 1, - lg_pad_bits : 8; + lg_pad_bits : 9; aggr_port_t *lg_ports; /* list of configured ports */ aggr_port_t *lg_mac_addr_port; mac_handle_t lg_mh; - uint_t lg_rx_resources; uint_t lg_nattached_ports; + krwlock_t lg_tx_lock; uint_t lg_ntx_ports; aggr_port_t **lg_tx_ports; /* array of tx ports */ uint_t lg_tx_ports_size; /* size of lg_tx_ports */ @@ -140,14 +167,32 @@ typedef struct aggr_grp_s { uint32_t lg_hcksum_txflags; uint_t lg_max_sdu; uint32_t lg_margin; - lg_mcst_addr_t *lg_mcst_list; /* A list of multicast addresses */ -} aggr_grp_t; -#define AGGR_LACP_LOCK_WRITER(grp) rw_enter(&(grp)->aggr.gl_lock, RW_WRITER); -#define AGGR_LACP_UNLOCK(grp) rw_exit(&(grp)->aggr.gl_lock); -#define AGGR_LACP_LOCK_HELD_WRITER(grp) RW_WRITE_HELD(&(grp)->aggr.gl_lock) -#define AGGR_LACP_LOCK_READER(grp) rw_enter(&(grp)->aggr.gl_lock, RW_READER); -#define AGGR_LACP_LOCK_HELD_READER(grp) RW_READ_HELD(&(grp)->aggr.gl_lock) + /* + * The following fields are used by the LACP packets processing. + * Specifically, as the LACP packets processing is not performance + * critical, all LACP packets will be handled by a dedicated thread + * instead of in the mac_rx() call. This is to avoid the dead lock + * with mac_unicast_remove(), which holding the mac perimeter of the + * aggr, and wait for the mr_refcnt of the RX ring to drop to zero. + */ + kmutex_t lg_lacp_lock; + kcondvar_t lg_lacp_cv; + mblk_t *lg_lacp_head; + mblk_t *lg_lacp_tail; + kthread_t *lg_lacp_rx_thread; + boolean_t lg_lacp_done; + aggr_pseudo_rx_group_t lg_rx_group; + + /* + * The following fields are used by aggr to wait for all the + * aggr_port_notify_cb() and aggr_port_timer_thread() to finish + * before it calls mac_unregister() when the aggr is deleted. 
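+ *
+ * A sketch of the intended pattern, based on the declarations of
+ * aggr_grp_port_hold()/_rele()/_wait() later in this file:
+ *
+ *	aggr_grp_port_hold(port);	taken before the callback runs
+ *	... callback or timer body ...
+ *	aggr_grp_port_rele(port);	drops lg_port_ref
+ *
+ *	aggr_grp_port_wait(grp);	blocks until lg_port_ref reaches 0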
+ */ + kmutex_t lg_port_lock; + kcondvar_t lg_port_cv; + int lg_port_ref; +} aggr_grp_t; #define AGGR_GRP_REFHOLD(grp) { \ atomic_add_32(&(grp)->lg_refs, 1); \ @@ -195,33 +240,34 @@ extern int aggr_grp_info(datalink_id_t, void *, aggr_grp_info_new_grp_fn_t, aggr_grp_info_new_port_fn_t); extern void aggr_grp_notify(aggr_grp_t *, uint32_t); extern boolean_t aggr_grp_attach_port(aggr_grp_t *, aggr_port_t *); -extern boolean_t aggr_grp_detach_port(aggr_grp_t *, aggr_port_t *, boolean_t); +extern boolean_t aggr_grp_detach_port(aggr_grp_t *, aggr_port_t *); extern void aggr_grp_port_mac_changed(aggr_grp_t *, aggr_port_t *, boolean_t *, boolean_t *); extern int aggr_grp_add_ports(datalink_id_t, uint_t, boolean_t, laioc_port_t *); extern int aggr_grp_rem_ports(datalink_id_t, uint_t, laioc_port_t *); extern boolean_t aggr_grp_update_ports_mac(aggr_grp_t *); -extern int aggr_grp_modify(datalink_id_t, aggr_grp_t *, uint8_t, uint32_t, - boolean_t, const uchar_t *, aggr_lacp_mode_t, aggr_lacp_timer_t); +extern int aggr_grp_modify(datalink_id_t, uint8_t, uint32_t, boolean_t, + const uchar_t *, aggr_lacp_mode_t, aggr_lacp_timer_t); extern void aggr_grp_multicst_port(aggr_port_t *, boolean_t); extern uint_t aggr_grp_count(void); extern void aggr_port_init(void); extern void aggr_port_fini(void); -extern int aggr_port_create(const datalink_id_t, boolean_t, aggr_port_t **); +extern int aggr_port_create(aggr_grp_t *, const datalink_id_t, boolean_t, + aggr_port_t **); extern void aggr_port_delete(aggr_port_t *); extern void aggr_port_free(aggr_port_t *); extern int aggr_port_start(aggr_port_t *); extern void aggr_port_stop(aggr_port_t *); extern int aggr_port_promisc(aggr_port_t *, boolean_t); -extern int aggr_port_unicst(aggr_port_t *, uint8_t *); +extern int aggr_port_unicst(aggr_port_t *); extern int aggr_port_multicst(void *, boolean_t, const uint8_t *); extern uint64_t aggr_port_stat(aggr_port_t *, uint_t); -extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *, boolean_t); +extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *); extern void aggr_port_init_callbacks(aggr_port_t *); -extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *); +extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t); extern mblk_t *aggr_m_tx(void *, mblk_t *); extern void aggr_send_port_enable(aggr_port_t *); @@ -236,10 +282,20 @@ extern void aggr_lacp_set_mode(aggr_grp_t *, aggr_lacp_mode_t, aggr_lacp_timer_t); extern void aggr_lacp_update_mode(aggr_grp_t *, aggr_lacp_mode_t); extern void aggr_lacp_update_timer(aggr_grp_t *, aggr_lacp_timer_t); -extern void aggr_lacp_rx(aggr_port_t *, mblk_t *); +extern void aggr_lacp_rx_enqueue(aggr_port_t *, mblk_t *); extern void aggr_lacp_port_attached(aggr_port_t *); extern void aggr_lacp_port_detached(aggr_port_t *); -extern void aggr_lacp_policy_changed(aggr_grp_t *); +extern void aggr_port_lacp_set_mode(aggr_grp_t *, aggr_port_t *); + +extern void aggr_lacp_rx_thread(void *); +extern void aggr_recv_lacp(aggr_port_t *, mac_resource_handle_t, mblk_t *); + +extern void aggr_grp_port_hold(aggr_port_t *); +extern void aggr_grp_port_rele(aggr_port_t *); +extern void aggr_grp_port_wait(aggr_grp_t *); + +extern int aggr_port_addmac(aggr_port_t *, const uint8_t *); +extern void aggr_port_remmac(aggr_port_t *, const uint8_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/aggr_lacp.h b/usr/src/uts/common/sys/aggr_lacp.h index ebcc07cb12..ef8c7408ac 100644 --- a/usr/src/uts/common/sys/aggr_lacp.h +++ 
b/usr/src/uts/common/sys/aggr_lacp.h @@ -157,8 +157,6 @@ typedef struct Agg { aggr_lacp_timer_t PeriodicTimer; /* AGGR_LACP_{LONG,SHORT} */ uint64_t TimeOfLastOperChange; /* Time in state */ boolean_t ready; /* Ready_N for all ports TRUE */ - - krwlock_t gl_lock; } Agg_t; /* @@ -192,6 +190,19 @@ typedef struct state_machine { } state_machine_t; /* + * The following three flags are set when specific timer is timed out; used + * by the LACP timer handler thread. + */ +#define LACP_PERIODIC_TIMEOUT 0x01 +#define LACP_WAIT_WHILE_TIMEOUT 0x02 +#define LACP_CURRENT_WHILE_TIMEOUT 0x04 +/* + * Set when the port is being deleted; used to inform the LACP timer handler + * thread to exit. + */ +#define LACP_THREAD_EXIT 0x08 + +/* * 802.3ad Variables associated with each port (section 43.4.7) */ typedef struct aggr_lacp_port { @@ -228,6 +239,10 @@ typedef struct aggr_lacp_port { lacp_timer_t current_while_timer; lacp_timer_t periodic_timer; lacp_timer_t wait_while_timer; + uint32_t lacp_timer_bits; + kthread_t *lacp_timer_thread; + kmutex_t lacp_timer_lock; + kcondvar_t lacp_timer_cv; hrtime_t time; } aggr_lacp_port_t; diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index d3663f464f..1510b46123 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -38,6 +38,7 @@ #include <sys/types.h> #include <sys/stream.h> #include <sys/dld_ioc.h> +#include <sys/mac_flow.h> #include <sys/conf.h> #include <sys/sad.h> #include <net/if.h> @@ -84,14 +85,18 @@ extern "C" { */ #define DLD_DRIVER_NAME "dld" +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + /* * IOCTL codes and data structures. */ #define DLDIOC_ATTR DLDIOC(0x03) typedef struct dld_ioc_attr { - datalink_id_t dia_linkid; - uint_t dia_max_sdu; + datalink_id_t dia_linkid; + uint_t dia_max_sdu; } dld_ioc_attr_t; #define DLDIOC_VLAN_ATTR DLDIOC(0x04) @@ -100,7 +105,6 @@ typedef struct dld_ioc_vlan_attr { uint16_t div_vid; datalink_id_t div_linkid; boolean_t div_force; - boolean_t div_implicit; } dld_ioc_vlan_attr_t; #define DLDIOC_PHYS_ATTR DLDIOC(0x05) @@ -203,15 +207,8 @@ typedef struct dld_ioc_rename { typedef struct dld_ioc_zid { zoneid_t diz_zid; char diz_link[MAXLINKNAMELEN]; - boolean_t diz_is_ppa_hack; } dld_ioc_zid_t; -#define DLDIOC_GETZID DLDIOC(0x13) -typedef struct dld_ioc_getzid { - datalink_id_t dig_linkid; - zoneid_t dig_zid; -} dld_ioc_getzid_t; - /* * data-link autopush configuration. */ @@ -221,8 +218,72 @@ struct dlautopush { char dap_aplist[MAXAPUSH][FMNAMESZ+1]; }; -#define DLDIOC_SETMACPROP DLDIOC(0x14) -#define DLDIOC_GETMACPROP DLDIOC(0x15) +#define DLDIOC_MACADDRGET DLDIOC(0x15) +typedef struct dld_ioc_macaddrget { + datalink_id_t dig_linkid; + uint_t dig_count; + uint_t dig_size; +} dld_ioc_macaddrget_t; + +/* possible flags for dmi_flags below */ +#define DLDIOCMACADDR_USED 0x1 /* address slot used */ + +typedef struct dld_macaddrinfo { + uint_t dmi_slot; + uint_t dmi_flags; + uint_t dmi_addrlen; + uchar_t dmi_addr[MAXMACADDRLEN]; + char dmi_client_name[MAXNAMELEN]; + datalink_id_t dma_client_linkid; +} dld_macaddrinfo_t; + +/* + * IOCTL codes and data structures for flowadm. 
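+ *
+ * A sketch of how user space is expected to drive these ioctls on the
+ * dld control node (illustrative only; the real consumer is flowadm(1M)
+ * by way of libdladm), using the structures defined below:
+ *
+ *	dld_ioc_addflow_t af;
+ *
+ *	af.af_linkid = linkid;
+ *	af.af_flow_desc = fdesc;
+ *	af.af_resource_props = mrp;
+ *	(void) strlcpy(af.af_name, "webflow", MAXNAMELEN);
+ *	(void) ioctl(fd, DLDIOC_ADDFLOW, &af);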
+ */ +#define DLDIOC_ADDFLOW DLDIOC(0x16) +typedef struct dld_ioc_addflow { + datalink_id_t af_linkid; + flow_desc_t af_flow_desc; + mac_resource_props_t af_resource_props; + char af_name[MAXNAMELEN]; +} dld_ioc_addflow_t; + +#define DLDIOC_REMOVEFLOW DLDIOC(0x17) +typedef struct dld_ioc_removeflow { + char rf_name[MAXNAMELEN]; +} dld_ioc_removeflow_t; + +#define DLDIOC_MODIFYFLOW DLDIOC(0x18) +typedef struct dld_ioc_modifyflow { + char mf_name[MAXNAMELEN]; + mac_resource_props_t mf_resource_props; +} dld_ioc_modifyflow_t; + +#define DLDIOC_WALKFLOW DLDIOC(0x19) +typedef struct dld_ioc_walkflow { + datalink_id_t wf_linkid; + char wf_name[MAXNAMELEN]; + uint32_t wf_nflows; + uint_t wf_len; +} dld_ioc_walkflow_t; + +typedef struct dld_flowinfo { + datalink_id_t fi_linkid; + flow_desc_t fi_flow_desc; + mac_resource_props_t fi_resource_props; + char fi_flowname[MAXNAMELEN]; + uint32_t fi_pad; +} dld_flowinfo_t; + +#define DLDIOC_USAGELOG DLDIOC(0x1a) +typedef struct dld_ioc_usagelog { + mac_logtype_t ul_type; + boolean_t ul_onoff; + uint_t ul_interval; +} dld_ioc_usagelog_t; + +#define DLDIOC_SETMACPROP DLDIOC(0x1b) +#define DLDIOC_GETMACPROP DLDIOC(0x1c) #define MAC_PROP_VERSION 1 typedef struct dld_ioc_macprop_s { @@ -236,7 +297,111 @@ typedef struct dld_ioc_macprop_s { char pr_val[1]; } dld_ioc_macprop_t; +#define DLDIOC_GETHWGRP DLDIOC(0x1d) + +typedef struct dld_ioc_hwgrpget { + datalink_id_t dih_linkid; + uint_t dih_n_groups; /* number of groups included in ioc */ + uint_t dih_size; +} dld_ioc_hwgrpget_t; + +#define MAXCLIENTNAMELEN 1024 +typedef struct dld_hwgrpinfo { + char dhi_link_name[MAXLINKNAMELEN]; + uint_t dhi_grp_num; + uint_t dhi_grp_type; + uint_t dhi_n_rings; + uint_t dhi_n_clnts; + /* XXXX later we should use dhi_n_clnts * MAXNAMELEN for dhi_clnts */ + char dhi_clnts[MAXCLIENTNAMELEN]; +} dld_hwgrpinfo_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + #ifdef _KERNEL + +#define DLD_CAPAB_DIRECT 0x00000001 +#define DLD_CAPAB_POLL 0x00000002 +#define DLD_CAPAB_PERIM 0x00000003 +#define DLD_CAPAB_LSO 0x00000004 + +#define DLD_ENABLE 0x00000001 +#define DLD_DISABLE 0x00000002 +#define DLD_QUERY 0x00000003 + +/* + * GLDv3 entry point for negotiating capabilities. + * This is exposed to IP after negotiation of DL_CAPAB_DLD. + * + * This function takes the following arguments: + * handle: used for identifying the interface to operate on (provided by dld). + * type: capability type. + * arg: points to a capability-specific structure. + * flags: used for indicating whether to enable or disable a capability. + * + * With this function, capability negotiation is reduced from a multi-step + * process to just one single function call. + * e.g. the following code would pass 'x' from IP to dld and obtain + * arg.output_arg from dld: + * + * arg.input_arg = x; + * rc = (*dld_capab)(handle, DLD_CAPAB_XXX, &arg, DLD_ENABLE); + * ill->info1 = arg.output_arg; + */ +typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t); + +/* + * Direct Tx/Rx capability. + */ +typedef struct dld_capab_direct_s { + /* + * Rx entry point and handle, owned by IP. + */ + uintptr_t di_rx_cf; + void *di_rx_ch; + + /* + * Tx entry points and handle, owned by DLD. + */ + /* Entry point for transmitting packets */ + uintptr_t di_tx_df; + void *di_tx_dh; + + /* flow control notification callback */ + uintptr_t di_tx_cb_df; /* callback registration/de-registration */ + void *di_tx_cb_dh; +} dld_capab_direct_t; + +/* + * Polling/softring capability. 
+ */ +#define POLL_SOFTRING 0x00000001 +typedef struct dld_capab_poll_s { + uintptr_t poll_ring_add_cf; + uintptr_t poll_ring_remove_cf; + uintptr_t poll_ring_quiesce_cf; + uintptr_t poll_ring_restart_cf; + uintptr_t poll_ring_bind_cf; + void *poll_ring_ch; + uintptr_t poll_mac_accept_df; + void *poll_mac_dh; +} dld_capab_poll_t; + +/* + * LSO capability + */ +/* + * Currently supported flags for LSO. + */ +#define DLD_LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */ + +typedef struct dld_capab_lso_s { + uint_t lso_flags; /* capability flags */ + uint_t lso_max; /* maximum payload */ +} dld_capab_lso_t; + int dld_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); int dld_open(queue_t *, dev_t *, int, int, cred_t *); int dld_close(queue_t *); @@ -245,6 +410,13 @@ void dld_wsrv(queue_t *); void dld_init_ops(struct dev_ops *, const char *); void dld_fini_ops(struct dev_ops *); int dld_autopush(dev_t *, struct dlautopush *); + +int dld_add_flow(datalink_id_t, char *, flow_desc_t *, + mac_resource_props_t *); +int dld_remove_flow(char *); +int dld_modify_flow(char *, mac_resource_props_t *); +int dld_walk_flow(dld_ioc_walkflow_t *, intptr_t); + #endif #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index 8d2138cc52..906fd6fe15 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -27,13 +27,12 @@ #define _SYS_DLD_IMPL_H #include <sys/types.h> -#include <sys/conf.h> +#include <sys/list.h> #include <sys/ethernet.h> #include <sys/stream.h> #include <sys/dlpi.h> -#include <sys/mac.h> -#include <sys/dls.h> #include <sys/dld.h> +#include <sys/dls_impl.h> #ifdef __cplusplus extern "C" { @@ -57,39 +56,50 @@ typedef enum { DLD_ACTIVE } dld_passivestate_t; -typedef struct dld_str dld_str_t; -typedef void (*dld_tx_t)(struct dld_str *, mblk_t *); - /* - * dld_str_t object definition. + * The dld_str_t object definition and protection scheme for each member + * is described below. The framework locking mechanism details are described in + * mac_impl.h and mac.c + * + * Write Once Only (WO): Typically these are initialized when the end point + * is created or initialized and don't change subsequently + * + * Serializer (SL): Protected by the Serializer. All modify operations on an + * end point go through the serializer. Readers don't care about reading + * these fields atomically, or readers also use the serializer to see the + * values atomically. + * + * Lock: kmutex_t or kwrlock_t lock. Modify operations still go through the + * serializer, the lock helps synchronize readers with writers. */ -struct dld_str { + +struct dld_str_s { /* Protected by */ /* * Major number of the device */ - major_t ds_major; + major_t ds_major; /* WO */ /* * Ephemeral minor number for the object. */ - minor_t ds_minor; + minor_t ds_minor; /* WO */ /* - * Read/write queues for the stream which the object represents. + * PPA number this stream is attached to. */ - queue_t *ds_rq; - queue_t *ds_wq; + t_uscalar_t ds_ppa; /* SL */ /* - * Lock to protect this structure. + * Read/write queues for the stream which the object represents. */ - krwlock_t ds_lock; + queue_t *ds_rq; /* WO */ + queue_t *ds_wq; /* WO */ /* * Stream is open to DLD_CONTROL (control node) or * DLD_DLPI (DLS provider) node. */ - uint_t ds_type; + uint_t ds_type; /* WO */ /* * The following fields are only used for DLD_DLPI type objects. @@ -98,158 +108,123 @@ struct dld_str { /* * Current DLPI state. 
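+	 * (one of the DL_* states defined in <sys/dlpi.h>, e.g.
+	 * DL_UNATTACHED, DL_UNBOUND or DL_IDLE)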
*/ - t_uscalar_t ds_dlstate; + t_uscalar_t ds_dlstate; /* ds_lock */ /* * DLPI style */ - t_uscalar_t ds_style; + t_uscalar_t ds_style; /* WO */ /* * Currently bound DLSAP. */ - uint16_t ds_sap; - - /* - * Handle of the data-link channel that is used by this object. - */ - dls_channel_t ds_dc; + uint16_t ds_sap; /* SL */ /* * Handle of the MAC that is used by the data-link interface. */ - mac_handle_t ds_mh; - - /* - * VLAN identifier of the data-link interface. - */ - uint16_t ds_vid; + mac_handle_t ds_mh; /* SL */ + mac_client_handle_t ds_mch; /* SL */ /* * Promiscuity level information. */ - uint32_t ds_promisc; + uint32_t ds_promisc; /* SL */ + mac_promisc_handle_t ds_mph; + mac_promisc_handle_t ds_vlan_mph; /* * Immutable information of the MAC which the channel is using. */ - const mac_info_t *ds_mip; + const mac_info_t *ds_mip; /* SL */ /* * Current packet priority. */ - uint_t ds_pri; + uint_t ds_pri; /* SL */ /* * Handle of our MAC notification callback. */ - mac_notify_handle_t ds_mnh; + mac_notify_handle_t ds_mnh; /* SL */ /* * Set of enabled DL_NOTE... notifications. (See dlpi.h). */ - uint32_t ds_notifications; - - /* - * Cached MAC unicast addresses. - */ - uint8_t ds_fact_addr[MAXMACADDRLEN]; - uint8_t ds_curr_addr[MAXMACADDRLEN]; + uint32_t ds_notifications; /* SL */ /* * Mode: unitdata, fast-path or raw. */ - dld_str_mode_t ds_mode; + dld_str_mode_t ds_mode; /* SL */ /* * Native mode state. */ - boolean_t ds_native; + boolean_t ds_native; /* SL */ /* * IP polling is operational if this flag is set. */ - boolean_t ds_polling; - boolean_t ds_soft_ring; + boolean_t ds_polling; /* SL */ + boolean_t ds_direct; /* SL */ /* * LSO is enabled if ds_lso is set. */ - boolean_t ds_lso; - uint64_t ds_lso_max; + boolean_t ds_lso; /* SL */ + uint64_t ds_lso_max; /* SL */ /* * State of DLPI user: may be active (regular network layer), * passive (snoop-like monitoring), or unknown (not yet * determined). */ - dld_passivestate_t ds_passivestate; + dld_passivestate_t ds_passivestate; /* SL */ /* * Dummy mblk used for flow-control. */ - mblk_t *ds_tx_flow_mp; - - /* - * Internal transmit queue and its parameters. - */ - kmutex_t ds_tx_list_lock; - mblk_t *ds_tx_list_head; - mblk_t *ds_tx_list_tail; - uint_t ds_tx_cnt; - uint_t ds_tx_msgcnt; - timeout_id_t ds_tx_qdepth_tid; - boolean_t ds_tx_qbusy; - - dld_tx_t ds_tx; - dld_tx_t ds_unitdata_tx; - kmutex_t ds_tx_lock; - kcondvar_t ds_tx_cv; - uint32_t ds_intx_cnt; - boolean_t ds_detaching; - - /* - * Pending control messages to be processed. - */ - mblk_t *ds_pending_head; - mblk_t *ds_pending_tail; - - taskqid_t ds_tid; - kmutex_t ds_disp_lock; - kcondvar_t ds_disp_cv; - boolean_t ds_closing; + mblk_t *ds_tx_flow_mp; /* ds_lock */ /* - * Used to process ioctl message for control node. See comments - * above dld_ioctl(). + * List of queued DLPI requests. These will be processed + * by a taskq thread. 
This block is protected by ds_lock */ - void (*ds_ioctl)(queue_t *, mblk_t *); + kmutex_t ds_lock; + krwlock_t ds_rw_lock; + kcondvar_t ds_datathr_cv; /* ds_lock */ + uint_t ds_datathr_cnt; /* ds_lock */ + mblk_t *ds_pending_head; /* ds_lock */ + mblk_t *ds_pending_tail; /* ds_lock */ + kcondvar_t ds_dlpi_pending_cv; /* ds_lock */ + uint32_t + ds_dlpi_pending : 1, /* ds_lock */ + ds_local : 1, + ds_pad : 30; /* ds_lock */ + + dls_link_t *ds_dlp; /* SL */ + dls_multicst_addr_t *ds_dmap; /* ds_rw_lock */ + dls_rx_t ds_rx; /* ds_lock */ + void *ds_rx_arg; /* ds_lock */ + boolean_t ds_active; /* SL */ + dld_str_t *ds_next; /* SL */ + dls_head_t *ds_head; + dls_dl_handle_t ds_ddh; + list_node_t ds_tqlist; }; -#define DLD_TX_ENTER(dsp) { \ - mutex_enter(&(dsp)->ds_tx_lock); \ - (dsp)->ds_intx_cnt++; \ - mutex_exit(&(dsp)->ds_tx_lock); \ -} - -#define DLD_TX_EXIT(dsp) { \ - mutex_enter(&(dsp)->ds_tx_lock); \ - if ((--(dsp)->ds_intx_cnt == 0) && (dsp)->ds_detaching) \ - cv_signal(&(dsp)->ds_tx_cv); \ - mutex_exit(&(dsp)->ds_tx_lock); \ +#define DLD_DATATHR_INC(dsp) { \ + ASSERT(MUTEX_HELD(&(dsp)->ds_lock)); \ + dsp->ds_datathr_cnt++; \ } -/* - * Quiesce the traffic. - */ -#define DLD_TX_QUIESCE(dsp) { \ - mutex_enter(&(dsp)->ds_tx_lock); \ - (dsp)->ds_tx = (dsp)->ds_unitdata_tx = NULL; \ - (dsp)->ds_detaching = B_TRUE; \ - while ((dsp)->ds_intx_cnt != 0) \ - cv_wait(&(dsp)->ds_tx_cv, &(dsp)->ds_tx_lock); \ - (dsp)->ds_detaching = B_FALSE; \ - mutex_exit(&(dsp)->ds_tx_lock); \ +#define DLD_DATATHR_DCR(dsp) { \ + mutex_enter(&(dsp)->ds_lock); \ + (dsp)->ds_datathr_cnt--; \ + if ((dsp)->ds_datathr_cnt == 0) \ + cv_broadcast(&(dsp)->ds_datathr_cv); \ + mutex_exit(&(dsp)->ds_lock); \ } /* @@ -269,26 +244,34 @@ extern void dld_str_rx_fastpath(void *, mac_resource_handle_t, mblk_t *, mac_header_info_t *); extern void dld_str_rx_unitdata(void *, mac_resource_handle_t, mblk_t *, mac_header_info_t *); - -extern void dld_tx_flush(dld_str_t *); extern void dld_str_notify_ind(dld_str_t *); -extern void dld_tx_single(dld_str_t *, mblk_t *); -extern void str_mdata_fastpath_put(dld_str_t *, mblk_t *); -extern void str_mdata_raw_put(dld_str_t *, mblk_t *); - -extern void dld_ioctl(queue_t *, mblk_t *); -extern void dld_finish_pending_task(dld_str_t *); +extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *, + uintptr_t, uint16_t); +extern int dld_flow_ctl_callb(dld_str_t *, uint64_t, + int (*func)(), void *); /* * dld_proto.c */ -extern void dld_wput_proto_nondata(dld_str_t *, mblk_t *); -extern void dld_wput_proto_data(dld_str_t *, mblk_t *); +extern void dld_proto(dld_str_t *, mblk_t *); +extern void dld_proto_unitdata_req(dld_str_t *, mblk_t *); extern void dld_capabilities_disable(dld_str_t *); +extern void proto_unitdata_req(dld_str_t *, mblk_t *); + +/* + * dld_flow.c + */ +extern void flow_rx_pkt_chain(void *, void *, mblk_t *); + +/* + * dld_drv.c + */ +extern mac_handle_t dld_mac_open(char *dev_name, int *err); +#define dld_mac_close(mh) mac_close(mh) /* * Options: there should be a separate bit defined here for each - * DLD_PROP... defined in dld.h. + * DLD_PROP... defined in dld.h. 
*/ #define DLD_OPT_NO_FASTPATH 0x00000001 #define DLD_OPT_NO_POLL 0x00000002 @@ -316,6 +299,33 @@ typedef struct dld_ap { #define IMPLY(p, c) (!(p) || (c)) +#define DLD_SETQFULL(dsp) { \ + queue_t *q = (dsp)->ds_wq; \ + \ + mutex_enter(&(dsp)->ds_lock); \ + if ((dsp)->ds_tx_flow_mp != NULL) { \ + (void) putq(q, (dsp)->ds_tx_flow_mp); \ + (dsp)->ds_tx_flow_mp = NULL; \ + qenable((dsp)->ds_wq); \ + } \ + mutex_exit(&(dsp)->ds_lock); \ +} + +#define DLD_CLRQFULL(dsp) { \ + queue_t *q = (dsp)->ds_wq; \ + \ + mutex_enter(&(dsp)->ds_lock); \ + if (!mac_tx_is_flow_blocked((dsp)->ds_mch, NULL)) { \ + if ((dsp)->ds_tx_flow_mp == NULL) \ + (dsp)->ds_tx_flow_mp = getq(q); \ + ASSERT((dsp)->ds_tx_flow_mp != NULL); \ + } \ + mutex_exit(&(dsp)->ds_lock); \ +} + +#define DLD_TX(dsp, mp, f_hint, flag) \ + mac_tx(dsp->ds_mch, mp, f_hint, flag, NULL) + #ifdef DEBUG #define DLD_DBG cmn_err #else diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h index cb8f5bf225..86406cab4f 100644 --- a/usr/src/uts/common/sys/dld_ioc.h +++ b/usr/src/uts/common/sys/dld_ioc.h @@ -77,18 +77,22 @@ extern "C" { * DLDCOPYIN or DLDCOPYOUT flags are set so that every di_func() * callback function does not need to copyin/out its own data. */ -typedef int (dld_ioc_func_t)(void *, intptr_t, int, cred_t *); + +/* Maximum number of Privileges */ +#define DLD_MAX_PRIV 16 + +typedef int (dld_ioc_func_t)(void *, intptr_t, int, cred_t *, int *); typedef struct dld_ioc_info { uint_t di_cmd; uint_t di_flags; size_t di_argsize; dld_ioc_func_t *di_func; + const char *di_priv[DLD_MAX_PRIV]; } dld_ioc_info_t; /* Values for di_flags */ #define DLDCOPYIN 0x00000001 /* copyin di_argsize amount of data */ #define DLDCOPYOUT 0x00000002 /* copyout di_argsize amount of data */ -#define DLDDLCONFIG 0x00000004 /* ioctl requires PRIV_SYS_DL_CONFIG */ #define DLDCOPYINOUT (DLDCOPYIN | DLDCOPYOUT) #define DLDIOCCNT(l) (sizeof (l) / sizeof (dld_ioc_info_t)) diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 3af7b7bca7..aa01ddeed6 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -586,12 +586,8 @@ union DL_qos_types { /* dl_data is dl_capab_mdt_t */ #define DL_CAPAB_ZEROCOPY 0x05 /* Zero-copy capability */ /* dl_data is dl_capab_zerocopy_t */ -#define DL_CAPAB_POLL 0x06 /* Polling capability */ - /* dl_data is dl_capab_dls_t */ -#define DL_CAPAB_SOFT_RING 0x07 /* Soft ring capable */ - /* dl_data is dl_capab_dls_t */ -#define DL_CAPAB_LSO 0x08 /* Large Send Offload capability */ - /* dl_data is dl_capab_lso_t */ +#define DL_CAPAB_DLD 0x06 /* dld capability */ + /* dl_data is dl_capab_dld_t */ typedef struct { t_uscalar_t dl_cap; /* capability type */ @@ -710,55 +706,22 @@ typedef struct { #ifdef _KERNEL /* - * This structure is used by DL_CAPAB_POLL and DL_CAPAB_SOFT_RING - * capabilities. It provides a mechanism for IP to exchange function - * pointers with a gldv3-based driver to enable it to bypass streams- - * data-paths. DL_CAPAB_POLL mechanism provides a way to blank - * interrupts. Note: True polling support will be added in the future. - * DL_CAPAB_SOFT_RING provides a mechanism to create soft ring at the - * dls layer. + * The DL_CAPAB_DLD capability enables the capabilities of gldv3-based drivers + * to be negotiated using a function call (dld_capab) instead of using streams. 
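+ *
+ * E.g. once DL_CAPAB_DLD has been acknowledged, IP might extract the
+ * entry point and handle from the dl_capab_dld_t below and use them
+ * for all subsequent capability negotiation. In this sketch, capab
+ * points at the structure received in the acknowledgement and direct
+ * is a dld_capab_direct_t filled in by the caller (see <sys/dld.h>):
+ *
+ *	dld_capab_func_t func;
+ *	void *handle;
+ *
+ *	func = (dld_capab_func_t)capab->dld_capab;
+ *	handle = (void *)capab->dld_capab_handle;
+ *	rc = func(handle, DLD_CAPAB_DIRECT, &direct, DLD_ENABLE);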
*/ -typedef struct dl_capab_dls_s { - t_uscalar_t dls_version; - t_uscalar_t dls_flags; +typedef struct dl_capab_dld_s { + t_uscalar_t dld_version; + t_uscalar_t dld_flags; /* DLD provided information */ - uintptr_t dls_tx_handle; - uintptr_t dls_tx; - uintptr_t dls_ring_change_status; - uintptr_t dls_ring_bind; - uintptr_t dls_ring_unbind; + uintptr_t dld_capab; + uintptr_t dld_capab_handle; + dl_mid_t dld_mid; /* module ID */ +} dl_capab_dld_t; - /* IP provided information */ - uintptr_t dls_rx_handle; - uintptr_t dls_ring_assign; - uintptr_t dls_rx; - uintptr_t dls_ring_add; - t_uscalar_t dls_ring_cnt; - - dl_mid_t dls_mid; /* module ID */ -} dl_capab_dls_t; - -#define POLL_CURRENT_VERSION 0x01 -#define POLL_VERSION_1 0x01 - -#define SOFT_RING_VERSION_1 0x01 - -/* Values for poll_flags */ -#define POLL_ENABLE 0x01 /* Set to enable polling */ - /* capability */ -#define POLL_CAPABLE 0x02 /* Polling ability exists */ -#define POLL_DISABLE 0x03 /* Disable Polling */ - -/* Values for soft_ring_flags */ -#define SOFT_RING_ENABLE 0x04 /* Set to enable soft_ring */ - /* capability */ -#define SOFT_RING_CAPABLE 0x05 /* Soft_Ring ability exists */ -#define SOFT_RING_DISABLE 0x06 /* Disable Soft_Ring */ - -/* Soft_Ring fanout types (used by soft_ring_change_status) */ -#define SOFT_RING_NONE 0x00 -#define SOFT_RING_FANOUT 0x01 +#define DL_CAPAB_DLD_ENABLE 0x00000001 +#define DLD_VERSION_1 1 +#define DLD_CURRENT_VERSION DLD_VERSION_1 #endif /* _KERNEL */ @@ -786,29 +749,6 @@ typedef struct { /* transmit */ /* - * Large Send Offload sub-capability (follows dl_capability_sub_t) - */ -typedef struct { - t_uscalar_t lso_version; /* interface version */ - t_uscalar_t lso_flags; /* capability flags */ - t_uscalar_t lso_max; /* maximum payload */ - t_uscalar_t reserved[1]; /* reserved fields */ - dl_mid_t lso_mid; /* module ID */ -} dl_capab_lso_t; - -/* - * Large Send Offload revision definition history - */ -#define LSO_CURRENT_VERSION 0x01 -#define LSO_VERSION_1 0x01 - -/* - * Currently supported values of lso_flags - */ -#define LSO_TX_ENABLE 0x01 /* to enable LSO */ -#define LSO_TX_BASIC_TCP_IPV4 0x02 /* TCP LSO capability */ - -/* * DLPI interface primitive definitions. * * Each primitive is sent as a stream message. It is possible that diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index 3bfe25ecf0..c96c6f1b85 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -28,8 +28,8 @@ #include <sys/types.h> #include <sys/stream.h> -#include <net/if.h> -#include <sys/mac.h> +#include <sys/mac_client.h> +#include <sys/dls_mgmt.h> /* * Data-Link Services Module @@ -53,233 +53,56 @@ extern "C" { * Macros for converting ppas to instance #s, Vlan ID, or minor. */ #define DLS_PPA2INST(ppa) ((int)((ppa) % 1000)) -#define DLS_PPA2VID(ppa) ((ppa) / 1000) +#define DLS_PPA2VID(ppa) ((uint16_t)((ppa) / 1000)) +#define DLS_PPA2MINOR(ppa) ((minor_t)((DLS_PPA2INST(ppa)) + 1)) /* - * Converts a minor to an instance#; makes sense only when minor <= 1000. - */ -#define DLS_MINOR2INST(minor) ((int)((minor) - 1)) - -typedef enum { - DATALINK_CLASS_PHYS = 0x01, - DATALINK_CLASS_VLAN = 0x02, - DATALINK_CLASS_AGGR = 0x04, - DATALINK_CLASS_VNIC = 0x08 -} datalink_class_t; - -#define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \ - DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC) - -/* - * A combination of flags and media. 
- * flags is the higher 32 bits, and if it is 0x01, it indicates all media - * types can be accepted; otherwise, only the given media type (specified - * in the lower 32 bits) is accepted. + * Maps a (VID, INST) pair to ppa */ -typedef uint64_t datalink_media_t; - -#define DATALINK_ANY_MEDIATYPE \ - ((datalink_media_t)(((datalink_media_t)0x01) << 32)) - -#define DATALINK_MEDIA_ACCEPTED(dmedia, media) \ - (((uint32_t)(((dmedia) >> 32) & 0xfffffffful) & 0x01) ? \ - B_TRUE : ((uint32_t)((dmedia) & 0xfffffffful) == (media))) - -#define MAXLINKATTRLEN 32 -#define MAXLINKATTRVALLEN 1024 +#define DLS_VIDINST2PPA(vid, inst) ((minor_t)((vid) * 1000 + (inst))) /* - * Link attributes used by the kernel. - */ -/* - * The major number and instance number of the underlying physical device - * are kept as FPHYMAJ and FPHYINST (major, instance + 1). - * - * Set for physical links only. - */ -#define FPHYMAJ "phymaj" /* uint64_t */ -#define FPHYINST "phyinst" /* uint64_t */ - -/* - * The devname of the physical link. For example, bge0, ce1. Set for physical - * links only. - */ -#define FDEVNAME "devname" /* string */ - -/* - * The door file for the dlmgmtd (data-link management) daemon. - */ -#define DLMGMT_DOOR "/etc/svc/volatile/dladm/dlmgmt_door" - -/* - * Door upcall commands. - */ -#define DLMGMT_CMD_DLS_CREATE 1 -#define DLMGMT_CMD_DLS_GETATTR 2 -#define DLMGMT_CMD_DLS_DESTROY 3 -#define DLMGMT_CMD_GETNAME 4 -#define DLMGMT_CMD_GETLINKID 5 -#define DLMGMT_CMD_GETNEXT 6 -#define DLMGMT_CMD_DLS_UPDATE 7 -#define DLMGMT_CMD_LINKPROP_INIT 8 -#define DLMGMT_CMD_BASE 128 - -/* - * Indicate the link mapping is active or persistent - */ -#define DLMGMT_ACTIVE 0x01 -#define DLMGMT_PERSIST 0x02 - -/* upcall argument */ -typedef struct dlmgmt_door_arg { - uint_t ld_cmd; -} dlmgmt_door_arg_t; - -typedef struct dlmgmt_upcall_arg_create { - int ld_cmd; - datalink_class_t ld_class; - uint32_t ld_media; - boolean_t ld_persist; - uint64_t ld_phymaj; - uint64_t ld_phyinst; - char ld_devname[MAXNAMELEN]; -} dlmgmt_upcall_arg_create_t; - -/* - * Note: ld_padding is necessary to keep the size of the structure the - * same on amd64 and i386. The same note applies to other ld_padding - * and lr_paddding fields in structures throughout this file. + * Converts a minor to an instance#; makes sense only when minor <= 1000. 
*/ -typedef struct dlmgmt_upcall_arg_destroy { - int ld_cmd; - datalink_id_t ld_linkid; - boolean_t ld_persist; - int ld_padding; -} dlmgmt_upcall_arg_destroy_t; - -typedef struct dlmgmt_upcall_arg_update { - int ld_cmd; - boolean_t ld_novanity; - uint32_t ld_media; - uint32_t ld_padding; - char ld_devname[MAXNAMELEN]; -} dlmgmt_upcall_arg_update_t; - -typedef struct dlmgmt_upcall_arg_getattr { - int ld_cmd; - datalink_id_t ld_linkid; - char ld_attr[MAXLINKATTRLEN]; -} dlmgmt_upcall_arg_getattr_t; - -typedef struct dlmgmt_door_getname { - int ld_cmd; - datalink_id_t ld_linkid; -} dlmgmt_door_getname_t; - -typedef struct dlmgmt_door_getlinkid { - int ld_cmd; - char ld_link[MAXLINKNAMELEN]; -} dlmgmt_door_getlinkid_t; - -typedef struct dlmgmt_door_getnext_s { - int ld_cmd; - datalink_id_t ld_linkid; - datalink_class_t ld_class; - uint32_t ld_flags; - datalink_media_t ld_dmedia; -} dlmgmt_door_getnext_t; - -typedef struct dlmgmt_door_linkprop_init { - int ld_cmd; - datalink_id_t ld_linkid; -} dlmgmt_door_linkprop_init_t; - -/* upcall return value */ -typedef struct dlmgmt_retval_s { - uint_t lr_err; /* return error code */ -} dlmgmt_retval_t; - -typedef dlmgmt_retval_t dlmgmt_destroy_retval_t, - dlmgmt_linkprop_init_retval_t; - -struct dlmgmt_linkid_retval_s { - uint_t lr_err; - datalink_id_t lr_linkid; - uint32_t lr_flags; - datalink_class_t lr_class; - uint32_t lr_media; - uint32_t lr_padding; -}; - -typedef struct dlmgmt_linkid_retval_s dlmgmt_create_retval_t, - dlmgmt_update_retval_t, - dlmgmt_getlinkid_retval_t, - dlmgmt_getnext_retval_t; - -typedef struct dlmgmt_getname_retval_s { - uint_t lr_err; - char lr_link[MAXLINKNAMELEN]; - datalink_class_t lr_class; - uint32_t lr_media; - uint32_t lr_flags; -} dlmgmt_getname_retval_t; - -typedef struct dlmgmt_getattr_retval_s { - uint_t lr_err; - uint_t lr_type; - uint_t lr_attrsz; - uint_t lr_padding; - char lr_attrval[MAXLINKATTRVALLEN]; -} dlmgmt_getattr_retval_t; +#define DLS_MINOR2INST(minor) ((int)((minor) - 1)) #ifdef _KERNEL #define DLS_MAX_PPA 999 #define DLS_MAX_MINOR (DLS_MAX_PPA + 1) -typedef struct dls_t *dls_channel_t; +typedef void (*dls_rx_t)(void *, mac_resource_handle_t, mblk_t *, + mac_header_info_t *); -extern int dls_open_style2_vlan(major_t, uint_t, dls_channel_t *); -extern int dls_open_by_dev(dev_t, dls_channel_t *); -extern void dls_close(dls_channel_t); - -extern mac_handle_t dls_mac(dls_channel_t); -extern uint16_t dls_vid(dls_channel_t); +typedef struct dld_str_s dld_str_t; +typedef struct dls_devnet_s *dls_dl_handle_t; +typedef struct dls_dev_t *dls_dev_handle_t; +typedef struct dls_link_s dls_link_t; #define DLS_SAP_LLC 0 #define DLS_SAP_PROMISC (1 << 16) -extern int dls_bind(dls_channel_t, uint32_t); -extern void dls_unbind(dls_channel_t); - #define DLS_PROMISC_SAP 0x00000001 #define DLS_PROMISC_MULTI 0x00000002 #define DLS_PROMISC_PHYS 0x00000004 -extern int dls_promisc(dls_channel_t, uint32_t); - -extern int dls_multicst_add(dls_channel_t, const uint8_t *); -extern int dls_multicst_remove(dls_channel_t, const uint8_t *); - -extern mblk_t *dls_header(dls_channel_t, const uint8_t *, - uint16_t, uint_t, mblk_t **); -extern int dls_header_info(dls_channel_t, mblk_t *, - mac_header_info_t *); +extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *); +extern void dls_close(dld_str_t *); +extern int dls_bind(dld_str_t *, uint32_t); +extern int dls_unbind(dld_str_t *); -typedef void (*dls_rx_t)(void *, mac_resource_handle_t, mblk_t *, - mac_header_info_t *); +extern int dls_promisc(dld_str_t *, uint32_t); -extern 
void dls_rx_set(dls_channel_t, dls_rx_t, void *); +extern int dls_multicst_add(dld_str_t *, const uint8_t *); +extern int dls_multicst_remove(dld_str_t *, const uint8_t *); -extern mblk_t *dls_tx(dls_channel_t, mblk_t *); +extern mblk_t *dls_header(dld_str_t *, const uint8_t *, + uint16_t, uint_t, mblk_t **); -extern boolean_t dls_active_set(dls_channel_t); -extern void dls_active_clear(dls_channel_t); +extern void dls_rx_set(dld_str_t *, dls_rx_t, void *); +extern dld_str_t *dls_rx_get(char *, flow_desc_t *, size_t *); -extern dev_info_t *dls_finddevinfo(dev_t); - -typedef struct dls_devnet_s *dls_dl_handle_t; -typedef struct dls_dev_t *dls_dev_handle_t; +extern void str_notify(void *, mac_notify_type_t); extern int dls_devnet_open(const char *, dls_dl_handle_t *, dev_t *); @@ -289,19 +112,18 @@ extern boolean_t dls_devnet_rebuild(); extern int dls_devnet_rename(datalink_id_t, datalink_id_t, const char *); extern int dls_devnet_create(mac_handle_t, datalink_id_t); -extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *); +extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *, + boolean_t); extern int dls_devnet_recreate(mac_handle_t, datalink_id_t); -extern int dls_devnet_create_vlan(datalink_id_t, - datalink_id_t, uint16_t, boolean_t); -extern int dls_devnet_destroy_vlan(datalink_id_t); extern int dls_devnet_hold_tmp(datalink_id_t, dls_dl_handle_t *); extern void dls_devnet_rele_tmp(dls_dl_handle_t); +extern int dls_devnet_hold_by_dev(dev_t, dls_dl_handle_t *); +extern void dls_devnet_rele(dls_dl_handle_t); extern void dls_devnet_prop_task_wait(dls_dl_handle_t); extern const char *dls_devnet_mac(dls_dl_handle_t); extern uint16_t dls_devnet_vid(dls_dl_handle_t); extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t); -extern boolean_t dls_devnet_is_explicit(dls_dl_handle_t); extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *); extern int dls_devnet_phydev(datalink_id_t, dev_t *); extern int dls_devnet_setzid(const char *, zoneid_t); @@ -318,6 +140,8 @@ extern int dls_mgmt_get_linkinfo(datalink_id_t, char *, extern int dls_mgmt_get_linkid(const char *, datalink_id_t *); extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t, datalink_media_t, uint32_t); +extern int dls_devnet_macname2linkid(const char *, + datalink_id_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 83bccd20bb..71f79a611a 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -26,174 +26,97 @@ #ifndef _SYS_DLS_IMPL_H #define _SYS_DLS_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/stream.h> #include <sys/dls.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> #include <sys/modhash.h> #include <sys/kstat.h> #include <net/if.h> #include <sys/dlpi.h> -#include <sys/dls_soft_ring.h> #ifdef __cplusplus extern "C" { #endif -typedef struct dls_multicst_addr_s dls_multicst_addr_t; - -struct dls_multicst_addr_s { - dls_multicst_addr_t *dma_nextp; - uint8_t dma_addr[MAXMACADDRLEN]; -}; - -typedef struct dls_link_s dls_link_t; - -struct dls_link_s { - char dl_name[MAXNAMELEN]; - mac_handle_t dl_mh; - const mac_info_t *dl_mip; - mac_rx_handle_t dl_mrh; - mac_txloop_handle_t dl_mth; - uint_t dl_ref; - uint_t dl_macref; - mod_hash_t *dl_impl_hash; - krwlock_t dl_impl_lock; - uint_t dl_impl_count; - kmutex_t dl_promisc_lock; - uint_t dl_npromisc; - uint_t dl_nactive; - uint32_t dl_unknowns; - kmutex_t dl_lock; +typedef struct 
dls_multicst_addr_s { + struct dls_multicst_addr_s *dma_nextp; /* ds_rw_lock */ + uint8_t dma_addr[MAXMACADDRLEN]; +} dls_multicst_addr_t; + +struct dls_link_s { /* Protected by */ + char dl_name[MAXNAMELEN]; /* SL */ + uint_t dl_ddi_instance; /* SL */ + mac_handle_t dl_mh; /* SL */ + mac_client_handle_t dl_mch; /* SL */ + mac_unicast_handle_t dl_mah; /* SL */ + const mac_info_t *dl_mip; /* SL */ + uint_t dl_ref; /* SL */ + mod_hash_t *dl_str_hash; /* SL, modhash lock */ + uint_t dl_impl_count; /* SL */ + uint_t dl_nactive; /* SL */ + uint32_t dl_unknowns; /* atomic */ + zoneid_t dl_zid; + uint_t dl_zone_ref; }; -typedef struct dls_impl_s dls_impl_t; -typedef struct dls_head_s dls_head_t; - -/* - * The maximum length of an SPA (subnetwork point of attachment). It is of - * the form <macname/vid>. - */ -#define MAXSPALEN (MAXNAMELEN + 5) - -typedef struct dls_vlan_s { - /* - * The following fields will not change after dls_vlan_t creation. - */ - dls_link_t *dv_dlp; - uint16_t dv_id; - - /* - * Unique SPA (of the form <macname/vid>) identifying a data-link; - * is needed to avoid name collisions between an explicitly and - * implicitly created VLANs. - */ - char dv_spa[MAXSPALEN]; - - /* - * The ppa value of the associated device. Used to derive this link's - * devfs node name. - */ - uint_t dv_ppa; - - /* - * The dev_t used to access this dls_vlan_t. - */ - dev_t dv_dev; - - dev_info_t *dv_dip; - kstat_t *dv_ksp; - uint32_t dv_force : 1; - - /* - * The following fields are protected by dv_lock. - */ - kmutex_t dv_lock; - - /* - * Reference count of dls_impl_t plus explicit creation of the link - */ - uint_t dv_ref; - - /* - * The reference count of this vlan is opened in its own zone. - */ - uint_t dv_zone_ref; - zoneid_t dv_zid; -} dls_vlan_t; - -struct dls_impl_s { - dls_impl_t *di_nextp; - dls_head_t *di_headp; - dls_vlan_t *di_dvp; - mac_handle_t di_mh; - mac_notify_handle_t di_mnh; - const mac_info_t *di_mip; - krwlock_t di_lock; - uint16_t di_sap; - uint_t di_promisc; - dls_multicst_addr_t *di_dmap; - dls_rx_t di_rx; - void *di_rx_arg; - mac_resource_add_t di_ring_add; - const mac_txinfo_t *di_txinfo; - uint_t di_bound : 1, - di_removing : 1, - di_active : 1, - di_local : 1; - - uint8_t di_unicst_addr[MAXMACADDRLEN]; - soft_ring_t **di_soft_ring_list; - uint_t di_soft_ring_size; - dls_dl_handle_t di_ddh; -}; - -struct dls_head_s { - dls_impl_t *dh_list; - uint_t dh_ref; - mod_hash_key_t dh_key; -}; +typedef struct dls_head_s { + kmutex_t dh_lock; + struct dld_str_s *dh_list; /* dh_ref */ + uint_t dh_ref; /* dh_lock */ + mod_hash_key_t dh_key; /* SL */ + kcondvar_t dh_cv; /* dh_lock */ + uint_t dh_removing; /* dh_lock */ +} dls_head_t; extern void dls_link_init(void); extern int dls_link_fini(void); extern int dls_link_hold(const char *, dls_link_t **); +extern int dls_link_hold_create(const char *, dls_link_t **); +extern int dls_link_hold_by_dev(dev_t, dls_link_t **); extern void dls_link_rele(dls_link_t *); -extern void dls_link_add(dls_link_t *, uint32_t, dls_impl_t *); -extern void dls_link_remove(dls_link_t *, dls_impl_t *); +extern int dls_link_rele_by_name(const char *); +extern void dls_link_add(dls_link_t *, uint32_t, dld_str_t *); +extern void dls_link_remove(dls_link_t *, dld_str_t *); extern int dls_link_header_info(dls_link_t *, mblk_t *, mac_header_info_t *); -extern int dls_mac_hold(dls_link_t *); -extern void dls_mac_rele(dls_link_t *); -extern boolean_t dls_mac_active_set(dls_link_t *); -extern void dls_mac_active_clear(dls_link_t *); +extern int 
dls_link_setzid(const char *, zoneid_t); +extern dev_info_t *dls_link_devinfo(dev_t); +extern dev_t dls_link_dev(dls_link_t *); -extern void dls_mac_stat_create(dls_vlan_t *); -extern void dls_mac_stat_destroy(dls_vlan_t *); +extern void i_dls_head_rele(dls_head_t *); +extern int dls_mac_active_set(dls_link_t *i); +extern void dls_mac_active_clear(dls_link_t *); -extern void dls_vlan_init(void); -extern int dls_vlan_fini(void); -extern int dls_vlan_hold(const char *, uint16_t, dls_vlan_t **, - boolean_t, boolean_t); -extern int dls_vlan_hold_by_dev(dev_t, dls_vlan_t **); -extern void dls_vlan_rele(dls_vlan_t *); -extern int dls_vlan_destroy(const char *, uint16_t); -extern int dls_vlan_create(const char *, uint16_t, boolean_t); -extern int dls_vlan_setzid(const char *, uint16_t, zoneid_t); -extern int dls_stat_update(kstat_t *, dls_vlan_t *, int); +extern void dls_create_str_kstats(dld_str_t *); +extern int dls_stat_update(kstat_t *, dls_link_t *, int); extern int dls_stat_create(const char *, int, const char *, int (*)(struct kstat *, int), void *, kstat_t **); -extern int dls_devnet_open_by_dev(dev_t, dls_vlan_t **, +extern int dls_devnet_open_by_dev(dev_t, dls_link_t **, dls_dl_handle_t *); +extern int dls_devnet_hold_link(datalink_id_t, dls_dl_handle_t *, + dls_link_t **); +extern void dls_devnet_rele_link(dls_dl_handle_t, dls_link_t *); extern void dls_init(void); extern int dls_fini(void); extern void dls_link_txloop(void *, mblk_t *); -extern boolean_t dls_accept(dls_impl_t *, mac_header_info_t *, +extern boolean_t dls_accept(dld_str_t *, mac_header_info_t *, dls_rx_t *, void **); -extern boolean_t dls_accept_loopback(dls_impl_t *, mac_header_info_t *, +extern boolean_t dls_accept_loopback(dld_str_t *, mac_header_info_t *, dls_rx_t *, void **); +extern boolean_t dls_accept_promisc(dld_str_t *, mac_header_info_t *, + dls_rx_t *, void **, boolean_t); +extern void i_dls_link_rx(void *, mac_resource_handle_t, mblk_t *, + boolean_t); +extern void dls_rx_promisc(void *, mac_resource_handle_t, mblk_t *, + boolean_t); +extern void dls_rx_vlan_promisc(void *, mac_resource_handle_t, + mblk_t *, boolean_t); +extern int dls_active_set(dld_str_t *); +extern void dls_active_clear(dld_str_t *); extern void dls_mgmt_init(void); extern void dls_mgmt_fini(void); diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h new file mode 100644 index 0000000000..5177de09b9 --- /dev/null +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -0,0 +1,218 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _DLS_MGMT_H +#define _DLS_MGMT_H + +#include <sys/types.h> +#include <sys/dld.h> + +/* + * Data-Link Services Module + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + DATALINK_CLASS_PHYS = 0x01, + DATALINK_CLASS_VLAN = 0x02, + DATALINK_CLASS_AGGR = 0x04, + DATALINK_CLASS_VNIC = 0x08, + DATALINK_CLASS_ETHERSTUB = 0x10 +} datalink_class_t; + +#define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \ + DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \ + DATALINK_CLASS_ETHERSTUB) + +/* + * A combination of flags and media. + * flags is the higher 32 bits, and if it is 0x01, it indicates all media + * types can be accepted; otherwise, only the given media type (specified + * in the lower 32 bits) is accepted. + */ +typedef uint64_t datalink_media_t; + +#define DATALINK_ANY_MEDIATYPE \ + ((datalink_media_t)(((datalink_media_t)0x01) << 32)) + +#define DATALINK_MEDIA_ACCEPTED(dmedia, media) \ + (((uint32_t)(((dmedia) >> 32) & 0xfffffffful) & 0x01) ? \ + B_TRUE : ((uint32_t)((dmedia) & 0xfffffffful) == (media))) + +#define MAXLINKATTRLEN 32 +#define MAXLINKATTRVALLEN 1024 + +/* + * Link attributes used by the kernel. + */ +/* + * The major number and instance number of the underlying physical device + * are kept as FPHYMAJ and FPHYINST (major, instance + 1). + * + * Set for physical links only. + */ +#define FPHYMAJ "phymaj" /* uint64_t */ +#define FPHYINST "phyinst" /* uint64_t */ + +/* + * The devname of the physical link. For example, bge0, ce1. Set for physical + * links only. + */ +#define FDEVNAME "devname" /* string */ + +/* + * The door file for the dlmgmtd (data-link management) daemon. + */ +#define DLMGMT_DOOR "/etc/svc/volatile/dladm/dlmgmt_door" + +/* + * Door upcall commands. + */ +#define DLMGMT_CMD_DLS_CREATE 1 +#define DLMGMT_CMD_DLS_GETATTR 2 +#define DLMGMT_CMD_DLS_DESTROY 3 +#define DLMGMT_CMD_GETNAME 4 +#define DLMGMT_CMD_GETLINKID 5 +#define DLMGMT_CMD_GETNEXT 6 +#define DLMGMT_CMD_DLS_UPDATE 7 +#define DLMGMT_CMD_LINKPROP_INIT 8 +#define DLMGMT_CMD_BASE 128 + +/* + * Indicate the link mapping is active or persistent + */ +#define DLMGMT_ACTIVE 0x01 +#define DLMGMT_PERSIST 0x02 + +/* upcall argument */ +typedef struct dlmgmt_door_arg { + uint_t ld_cmd; +} dlmgmt_door_arg_t; + +typedef struct dlmgmt_upcall_arg_create { + int ld_cmd; + datalink_class_t ld_class; + uint32_t ld_media; + boolean_t ld_persist; + uint64_t ld_phymaj; + uint64_t ld_phyinst; + char ld_devname[MAXNAMELEN]; +} dlmgmt_upcall_arg_create_t; + +/* + * Note: ld_padding is necessary to keep the size of the structure the + * same on amd64 and i386. The same note applies to other ld_padding + * and lr_paddding fields in structures throughout this file. 
+ */ +typedef struct dlmgmt_upcall_arg_destroy { + int ld_cmd; + datalink_id_t ld_linkid; + boolean_t ld_persist; + int ld_padding; +} dlmgmt_upcall_arg_destroy_t; + +typedef struct dlmgmt_upcall_arg_update { + int ld_cmd; + boolean_t ld_novanity; + uint32_t ld_media; + uint32_t ld_padding; + char ld_devname[MAXNAMELEN]; +} dlmgmt_upcall_arg_update_t; + +typedef struct dlmgmt_upcall_arg_getattr { + int ld_cmd; + datalink_id_t ld_linkid; + char ld_attr[MAXLINKATTRLEN]; +} dlmgmt_upcall_arg_getattr_t; + +typedef struct dlmgmt_door_getname { + int ld_cmd; + datalink_id_t ld_linkid; +} dlmgmt_door_getname_t; + +typedef struct dlmgmt_door_getlinkid { + int ld_cmd; + char ld_link[MAXLINKNAMELEN]; +} dlmgmt_door_getlinkid_t; + +typedef struct dlmgmt_door_getnext_s { + int ld_cmd; + datalink_id_t ld_linkid; + datalink_class_t ld_class; + uint32_t ld_flags; + datalink_media_t ld_dmedia; +} dlmgmt_door_getnext_t; + +typedef struct dlmgmt_door_linkprop_init { + int ld_cmd; + datalink_id_t ld_linkid; +} dlmgmt_door_linkprop_init_t; + +/* upcall return value */ +typedef struct dlmgmt_retval_s { + uint_t lr_err; /* return error code */ +} dlmgmt_retval_t; + +typedef dlmgmt_retval_t dlmgmt_destroy_retval_t, + dlmgmt_linkprop_init_retval_t; + +struct dlmgmt_linkid_retval_s { + uint_t lr_err; + datalink_id_t lr_linkid; + uint32_t lr_flags; + datalink_class_t lr_class; + uint32_t lr_media; + uint32_t lr_padding; +}; + +typedef struct dlmgmt_linkid_retval_s dlmgmt_create_retval_t, + dlmgmt_update_retval_t, + dlmgmt_getlinkid_retval_t, + dlmgmt_getnext_retval_t; + +typedef struct dlmgmt_getname_retval_s { + uint_t lr_err; + char lr_link[MAXLINKNAMELEN]; + datalink_class_t lr_class; + uint32_t lr_media; + uint32_t lr_flags; +} dlmgmt_getname_retval_t; + +typedef struct dlmgmt_getattr_retval_s { + uint_t lr_err; + uint_t lr_type; + uint_t lr_attrsz; + uint_t lr_padding; + char lr_attrval[MAXLINKATTRVALLEN]; +} dlmgmt_getattr_retval_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _DLS_MGMT_H */ diff --git a/usr/src/uts/common/sys/dls_soft_ring.h b/usr/src/uts/common/sys/dls_soft_ring.h deleted file mode 100644 index 403623853a..0000000000 --- a/usr/src/uts/common/sys/dls_soft_ring.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _SYS_DLS_SOFT_RING_H -#define _SYS_DLS_SOFT_RING_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#include <sys/types.h> -#include <sys/processor.h> -#include <sys/stream.h> -#include <sys/squeue.h> -#include <sys/mac.h> - -#define S_RING_NAMELEN 64 - -typedef void (*s_ring_proc_t)(void *, void *, mblk_t *, mac_header_info_t *); - -typedef struct soft_ring_s { - /* Keep the most used members 64bytes cache aligned */ - kmutex_t s_ring_lock; /* lock before using any member */ - uint16_t s_ring_type; /* processing model of the sq */ - uint16_t s_ring_state; /* state flags and message count */ - int s_ring_count; /* # of mblocks in soft_ring */ - mblk_t *s_ring_first; /* first mblk chain or NULL */ - mblk_t *s_ring_last; /* last mblk chain or NULL */ - s_ring_proc_t s_ring_upcall; /* Upcall func pointer */ - void *s_ring_upcall_arg1; /* upcall argument 1 */ - void *s_ring_upcall_arg2; /* upcall argument 2 */ - clock_t s_ring_awaken; /* time async thread was awakened */ - - kthread_t *s_ring_run; /* Current thread processing sq */ - processorid_t s_ring_bind; /* processor to bind to */ - kcondvar_t s_ring_async; /* async thread blocks on */ - clock_t s_ring_wait; /* lbolts to wait after a fill() */ - timeout_id_t s_ring_tid; /* timer id of pending timeout() */ - kthread_t *s_ring_worker; /* kernel thread id */ - char s_ring_name[S_RING_NAMELEN + 1]; - uint32_t s_ring_total_inpkt; -} soft_ring_t; - - -/* - * type flags - combination allowed to process and drain the queue - */ -#define S_RING_WORKER_ONLY 0x0001 /* Worker thread only */ -#define S_RING_ANY 0x0002 /* Any thread can process the queue */ - -/* - * State flags. - */ -#define S_RING_PROC 0x0001 /* being processed */ -#define S_RING_WORKER 0x0002 /* worker thread */ -#define S_RING_BOUND 0x0004 /* Worker thread is bound */ -#define S_RING_DESTROY 0x0008 /* Ring is being destroyed */ -#define S_RING_DEAD 0x0010 /* Worker thread is no more */ - -/* - * arguments for processors to bind to - */ -#define S_RING_BIND_NONE -1 - -/* - * Structure for dls statistics - */ -struct dls_kstats { - kstat_named_t dlss_soft_ring_pkt_drop; -}; - -extern struct dls_kstats dls_kstat; - -#define DLS_BUMP_STAT(x, y) (dls_kstat.x.value.ui32 += y) - -extern void soft_ring_init(void); -extern soft_ring_t *soft_ring_create(char *, processorid_t, clock_t, - uint_t, pri_t); -extern soft_ring_t **soft_ring_set_create(char *, processorid_t, clock_t, - uint_t, pri_t, int); -extern void soft_ring_set_destroy(soft_ring_t **, int); -extern void soft_ring_bind(void *, processorid_t); -extern void soft_ring_unbind(void *); -extern void dls_soft_ring_fanout(void *, void *, mblk_t *, mac_header_info_t *); -extern boolean_t dls_soft_ring_enable(dls_channel_t, dl_capab_dls_t *); -extern void dls_soft_ring_disable(dls_channel_t); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_DLS_SOFT_RING_H */ diff --git a/usr/src/uts/common/sys/exacct.h b/usr/src/uts/common/sys/exacct.h index b30362bb05..a9c394bb4f 100644 --- a/usr/src/uts/common/sys/exacct.h +++ b/usr/src/uts/common/sys/exacct.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_EXACCT_H #define _SYS_EXACCT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/task.h> #include <sys/proc.h> @@ -175,6 +173,7 @@ extern int exacct_tag_task(ac_info_t *, task_t *, void *, size_t, int); extern int exacct_tag_proc(ac_info_t *, pid_t, taskid_t, void *, size_t, int, const char *); extern void exacct_commit_flow(void *); +extern int exacct_commit_netinfo(void *, int); extern void exacct_init(void); extern void *exacct_create_header(size_t *); extern int exacct_write_header(ac_info_t *, void *, size_t); @@ -192,6 +191,9 @@ extern int exacct_assemble_flow_usage(ac_info_t *, flow_usage_t *, int (*)(ac_info_t *, void *, size_t, void *, size_t, size_t *), void *, size_t, size_t *); extern void exacct_move_mstate(proc_t *, task_t *, task_t *); +extern int exacct_assemble_net_usage(ac_info_t *, void *, + int (*)(ac_info_t *, void *, size_t, void *, size_t, size_t *), + void *, size_t, size_t *, int); extern taskq_t *exacct_queue; extern kmem_cache_t *exacct_object_cache; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/exacct_catalog.h b/usr/src/uts/common/sys/exacct_catalog.h index 0911344382..f6d9c09e7a 100644 --- a/usr/src/uts/common/sys/exacct_catalog.h +++ b/usr/src/uts/common/sys/exacct_catalog.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_EXACCT_CATALOG_H #define _SYS_EXACCT_CATALOG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -104,6 +101,10 @@ extern "C" { #define EXD_GROUP_FLOW 0x000109 #define EXD_GROUP_RFMA 0x00010a #define EXD_GROUP_FMA 0x00010b +#define EXD_GROUP_NET_LINK_DESC 0X00010c +#define EXD_GROUP_NET_FLOW_DESC 0X00010d +#define EXD_GROUP_NET_LINK_STATS 0X00010e +#define EXD_GROUP_NET_FLOW_STATS 0X00010f #define EXD_PROC_PID 0x001000 #define EXD_PROC_UID 0x001001 @@ -204,6 +205,36 @@ extern "C" { #define EXD_FMA_OFFSET 0x00400B #define EXD_FMA_UUID 0x00400C +/* For EXD_GROUP_FLDESC and EXD_GROUP_LNDESC */ +#define EXD_NET_DESC_NAME 0x005001 +#define EXD_NET_DESC_EHOST 0x005002 +#define EXD_NET_DESC_EDEST 0x005003 +#define EXD_NET_DESC_VLAN_TPID 0x005004 +#define EXD_NET_DESC_VLAN_TCI 0x005005 +#define EXD_NET_DESC_SAP 0x005006 +#define EXD_NET_DESC_PRIORITY 0x005007 +#define EXD_NET_DESC_BWLIMIT 0x005008 +/* For EXD_GROUP_FLDESC only */ +#define EXD_NET_DESC_DEVNAME 0x005009 +#define EXD_NET_DESC_V4SADDR 0x00500a +#define EXD_NET_DESC_V4DADDR 0x00500b +#define EXD_NET_DESC_V6SADDR 0x00500c +#define EXD_NET_DESC_V6DADDR 0x00500d +#define EXD_NET_DESC_SPORT 0x00500e +#define EXD_NET_DESC_DPORT 0x00500f +#define EXD_NET_DESC_PROTOCOL 0x005010 +#define EXD_NET_DESC_DSFIELD 0x005011 + +/* For EXD_NET_STATS */ +#define EXD_NET_STATS_NAME 0x006000 +#define EXD_NET_STATS_CURTIME 0x006001 +#define EXD_NET_STATS_IBYTES 0x006002 +#define EXD_NET_STATS_OBYTES 0x006003 +#define EXD_NET_STATS_IPKTS 0x006004 +#define EXD_NET_STATS_OPKTS 0x006005 +#define EXD_NET_STATS_IERRPKTS 0x006006 +#define EXD_NET_STATS_OERRPKTS 0x006007 + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/exacct_impl.h b/usr/src/uts/common/sys/exacct_impl.h index 14cee43d5f..6f25f02e7e 100644 --- a/usr/src/uts/common/sys/exacct_impl.h +++ b/usr/src/uts/common/sys/exacct_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_EXACCT_IMPL_H #define _SYS_EXACCT_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -129,6 +126,42 @@ typedef struct flow_usage { char *fu_aname; /* action instance name */ } flow_usage_t; +#define EX_NET_LNDESC_REC 1 +#define EX_NET_FLDESC_REC 2 +#define EX_NET_LNSTAT_REC 3 +#define EX_NET_FLSTAT_REC 4 + +typedef struct net_stat_s { + char *ns_name; + uint64_t ns_ibytes; + uint64_t ns_obytes; + uint64_t ns_ipackets; + uint64_t ns_opackets; + uint64_t ns_ierrors; + uint64_t ns_oerrors; + boolean_t ns_isref; +} net_stat_t; + +typedef struct net_desc_s { + char *nd_name; + char *nd_devname; + uchar_t nd_ehost[6]; + uchar_t nd_edest[6]; + ushort_t nd_vlan_tpid; + ushort_t nd_vlan_tci; + ushort_t nd_sap; + ushort_t nd_priority; + uint64_t nd_bw_limit; + uint32_t nd_saddr[4]; + uint32_t nd_daddr[4]; + boolean_t nd_isv4; + uint16_t nd_sport; + uint16_t nd_dport; + uint8_t nd_protocol; + uint8_t nd_dsfield; + int nd_type; +} net_desc_t; + extern void exacct_order16(uint16_t *); extern void exacct_order32(uint32_t *); extern void exacct_order64(uint64_t *); diff --git a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h index 8cdf2cf96a..73419866a9 100644 --- a/usr/src/uts/common/sys/ib/clients/ibd/ibd.h +++ b/usr/src/uts/common/sys/ib/clients/ibd/ibd.h @@ -26,8 +26,6 @@ #ifndef _SYS_IB_CLIENTS_IBD_H #define _SYS_IB_CLIENTS_IBD_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -96,7 +94,7 @@ typedef struct ipoib_pgrh { #include <sys/ib/ibtl/ibti.h> #include <sys/ib/ib_pkt_hdrs.h> #include <sys/list.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ib.h> #include <sys/modhash.h> diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 9011423727..d4608f3729 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -30,6 +31,7 @@ #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/stream.h> +#include <sys/mac_flow.h> /* * MAC Services Module @@ -42,13 +44,7 @@ extern "C" { /* * MAC Information (text emitted by modinfo(1m)) */ -#define MAC_INFO "MAC Services" - -/* - * MAC version identifier. This is used by mac_alloc() mac_register() to - * verify that incompatible drivers don't register. - */ -#define MAC_VERSION 0x1 +#define MAC_INFO "MAC Services v1.20" /* * MAC-Type version identifier. 
This is used by mactype_alloc() and @@ -58,17 +54,23 @@ extern "C" { #define MACTYPE_VERSION 0x1 /* - * Statistics + * Opaque handle types */ +typedef struct __mac_handle *mac_handle_t; +typedef struct __mac_resource_handle *mac_resource_handle_t; +typedef struct __mac_notify_handle *mac_notify_handle_t; +typedef struct __mac_tx_notify_handle *mac_tx_notify_handle_t; +typedef struct __mac_intr_handle *mac_intr_handle_t; +typedef struct __mac_ring_handle *mac_ring_handle_t; +typedef struct __mac_group_handle *mac_group_handle_t; -#define XCVR_UNDEFINED 0 -#define XCVR_NONE 1 -#define XCVR_10 2 -#define XCVR_100T4 3 -#define XCVR_100X 4 -#define XCVR_100T2 5 -#define XCVR_1000X 6 -#define XCVR_1000T 7 +#define DATALINK_INVALID_LINKID 0 +#define DATALINK_ALL_LINKID 0 +#define DATALINK_MAX_LINKID 0xffffffff + +#define MAC_MAX_MINOR 1000 + +typedef uint32_t datalink_id_t; typedef enum { LINK_STATE_UNKNOWN = -1, @@ -82,10 +84,6 @@ typedef enum { LINK_DUPLEX_FULL } link_duplex_t; -#define DATALINK_INVALID_LINKID 0 -#define DATALINK_ALL_LINKID 0 -#define DATALINK_MAX_LINKID 0xffffffff - typedef enum { LINK_FLOWCTRL_NONE = 0, LINK_FLOWCTRL_RX, @@ -93,7 +91,15 @@ typedef enum { LINK_FLOWCTRL_BI } link_flowctrl_t; -typedef uint32_t datalink_id_t; +/* + * Maximum MAC address length + */ +#define MAXMACADDRLEN 20 + +typedef enum { + MAC_LOGTYPE_LINK = 1, + MAC_LOGTYPE_FLOW +} mac_logtype_t; /* * Encodings for public properties. @@ -153,15 +159,13 @@ typedef enum { MAC_PROP_WL_DELKEY, MAC_PROP_WL_KEY, MAC_PROP_WL_MLME, + MAC_PROP_MAXBW, + MAC_PROP_PRIO, + MAC_PROP_BIND_CPU, MAC_PROP_PRIVATE = -1 } mac_prop_id_t; /* - * Maximum MAC address length - */ -#define MAXMACADDRLEN 20 - -/* * Flags to figure out r/w status of legacy ndd props. */ #define MAC_PROP_PERM_READ 0x0001 @@ -172,13 +176,6 @@ typedef enum { #ifdef _KERNEL -typedef struct mac_stat_info_s { - uint_t msi_stat; - char *msi_name; - uint_t msi_type; /* as defined in kstat_named_init(9F) */ - uint64_t msi_default; -} mac_stat_info_t; - /* * There are three ranges of statistics values. 0 to 1 - MAC_STAT_MIN are * interface statistics maintained by the mac module. MAC_STAT_MIN to 1 - @@ -259,27 +256,6 @@ typedef struct mac_info_s { } mac_info_t; /* - * LSO capability - */ -typedef struct lso_basic_tcp_ipv4_s { - t_uscalar_t lso_max; /* maximum payload */ -} lso_basic_tcp_ipv4_t; - -/* - * Future LSO capabilities can be added at the end of the mac_capab_lso_t. - * When such capability is added to the GLDv3 framework, the size of the - * mac_capab_lso_t it allocates and passes to the drivers increases. Older - * drivers wil access only the (upper) sections of that structure, that is the - * sections carrying the capabilities they understand. This ensures the - * interface can be safely extended in a binary compatible way. - */ -typedef struct mac_capab_lso_s { - t_uscalar_t lso_flags; - lso_basic_tcp_ipv4_t lso_basic_tcp_ipv4; - /* Add future lso capabilities here */ -} mac_capab_lso_t; - -/* * Information for legacy devices. */ typedef struct mac_capab_legacy_s { @@ -294,307 +270,32 @@ typedef struct mac_capab_legacy_s { } mac_capab_legacy_t; /* - * MAC layer capabilities. These capabilities are handled by the drivers' - * mc_capab_get() callbacks. Some capabilities require the driver to fill - * in a given data structure, and others are simply boolean capabilities. 
- * Note that capability values must be powers of 2 so that consumers and - * providers of this interface can keep track of which capabilities they - * care about by keeping a bitfield of these things around somewhere. - */ -typedef enum { - MAC_CAPAB_HCKSUM = 0x01, /* data is a uint32_t for the txflags */ - MAC_CAPAB_POLL = 0x02, /* boolean only, no data */ - MAC_CAPAB_MULTIADDRESS = 0x04, /* data is multiaddress_capab_t */ - MAC_CAPAB_LSO = 0x08, /* data is mac_capab_lso_t */ - MAC_CAPAB_NO_NATIVEVLAN = 0x10, /* boolean only, no data */ - MAC_CAPAB_NO_ZCOPY = 0x20, /* boolean only, no data */ - /* add new capabilities here */ - MAC_CAPAB_RINGS = 0x100, /* data is mac_capab_rings_t */ - MAC_CAPAB_SHARES = 0x200, /* data is mac_capab_share_t */ - - /* The following capabilities are specific to softmac. */ - MAC_CAPAB_LEGACY = 0x8000 /* data is mac_capab_legacy_t */ -} mac_capab_t; - -typedef int mac_addr_slot_t; - -/* mma_flags values */ -#define MMAC_SLOT_USED 0x1 /* address slot used */ -#define MMAC_SLOT_UNUSED 0x2 /* free address slot */ -#define MMAC_VENDOR_ADDR 0x4 /* address returned is vendor supplied */ - -typedef struct mac_multi_address_s { - mac_addr_slot_t mma_slot; /* slot for add/remove/get/set */ - uint_t mma_addrlen; - uint8_t mma_addr[MAXMACADDRLEN]; - uint_t mma_flags; -} mac_multi_addr_t; - -typedef int (*maddr_reserve_t)(void *, mac_multi_addr_t *); -typedef int (*maddr_add_t)(void *, mac_multi_addr_t *); -typedef int (*maddr_remove_t)(void *, mac_addr_slot_t); -typedef int (*maddr_modify_t)(void *, mac_multi_addr_t *); -typedef int (*maddr_get_t)(void *, mac_multi_addr_t *); - -/* maddr_flag values */ -#define MADDR_VENDOR_ADDR 0x01 /* addr returned is vendor supplied */ - -/* multiple mac address: add/remove/set/get mac address */ -typedef struct multiaddress_capab_s { - int maddr_naddr; /* total addresses */ - int maddr_naddrfree; /* free address slots */ - uint_t maddr_flag; /* MADDR_VENDOR_ADDR bit can be set */ - /* driver entry points */ - void *maddr_handle; /* cookie to be used for the calls */ - maddr_reserve_t maddr_reserve; /* reserve a factory address */ - maddr_add_t maddr_add; /* add a new unicst address */ - maddr_remove_t maddr_remove; /* remove an added address */ - maddr_modify_t maddr_modify; /* modify an added address */ - maddr_get_t maddr_get; /* get address from specified slot */ -} multiaddress_capab_t; - -/* - * MAC driver entry point types. - */ -typedef int (*mac_getstat_t)(void *, uint_t, uint64_t *); -typedef int (*mac_start_t)(void *); -typedef void (*mac_stop_t)(void *); -typedef int (*mac_setpromisc_t)(void *, boolean_t); -typedef int (*mac_multicst_t)(void *, boolean_t, const uint8_t *); -typedef int (*mac_unicst_t)(void *, const uint8_t *); -typedef void (*mac_ioctl_t)(void *, queue_t *, mblk_t *); -typedef void (*mac_resources_t)(void *); -typedef mblk_t *(*mac_tx_t)(void *, mblk_t *); -typedef boolean_t (*mac_getcapab_t)(void *, mac_capab_t, void *); -typedef int (*mac_open_t)(void *); -typedef void (*mac_close_t)(void *); -typedef int (*mac_set_prop_t)(void *, const char *, mac_prop_id_t, - uint_t, const void *); -typedef int (*mac_get_prop_t)(void *, const char *, mac_prop_id_t, - uint_t, uint_t, void *, uint_t *); - -/* - * Drivers must set all of these callbacks except for mc_resources, - * mc_ioctl, and mc_getcapab, which are optional. If any of these optional - * callbacks are set, their appropriate flags must be set in mc_callbacks. 
- * Any future additions to this list must also be accompanied by an - * associated mc_callbacks flag so that the framework can grow without - * affecting the binary compatibility of the interface. - */ -typedef struct mac_callbacks_s { - uint_t mc_callbacks; /* Denotes which callbacks are set */ - mac_getstat_t mc_getstat; /* Get the value of a statistic */ - mac_start_t mc_start; /* Start the device */ - mac_stop_t mc_stop; /* Stop the device */ - mac_setpromisc_t mc_setpromisc; /* Enable or disable promiscuous mode */ - mac_multicst_t mc_multicst; /* Enable or disable a multicast addr */ - mac_unicst_t mc_unicst; /* Set the unicast MAC address */ - mac_tx_t mc_tx; /* Transmit a packet */ - mac_resources_t mc_resources; /* Get the device resources */ - mac_ioctl_t mc_ioctl; /* Process an unknown ioctl */ - mac_getcapab_t mc_getcapab; /* Get capability information */ - mac_open_t mc_open; /* Open the device */ - mac_close_t mc_close; /* Close the device */ - mac_set_prop_t mc_setprop; - mac_get_prop_t mc_getprop; -} mac_callbacks_t; - -typedef struct mac_priv_prop_s { - char mpp_name[MAXLINKPROPNAME]; - uint_t mpp_flags; -} mac_priv_prop_t; - -/* - * Multiple Rings capability - */ -typedef enum { - MAC_RING_TYPE_RX = 1, /* Receive ring */ - MAC_RING_TYPE_TX = 2 /* Transmit ring */ -} mac_ring_type_t; - -/* - * Grouping type of a ring group + * When VNICs are created on top of the NIC, there are two levels + * of MAC layer, a lower MAC, which is the MAC layer at the level of the + * physical NIC, and an upper MAC, which is the MAC layer at the level + * of the VNIC. Each VNIC maps to a MAC client at the lower MAC, and + * the SRS and classification is done at the lower MAC level. The upper + * MAC is therefore for the most part pass-through, and therefore + * special processing needs to be done at the upper MAC layer when + * dealing with a VNIC. * - * MAC_GROUP_TYPE_STATIC: The ring group can not be re-grouped. - * MAC_GROUP_TYPE_DYNAMIC: The ring group support dynamic re-grouping - */ -typedef enum { - MAC_GROUP_TYPE_STATIC = 1, /* Static ring group */ - MAC_GROUP_TYPE_DYNAMIC = 2 /* Dynamic ring group */ -} mac_group_type_t; - -typedef struct __mac_ring_driver *mac_ring_driver_t; -typedef struct __mac_ring_handle *mac_ring_handle_t; -typedef struct __mac_group_driver *mac_group_driver_t; -typedef struct __mac_group_handle *mac_group_handle_t; -typedef struct __mac_intr_handle *mac_intr_handle_t; - -typedef struct mac_ring_info_s mac_ring_info_t; -typedef struct mac_group_info_s mac_group_info_t; - -typedef int (*mac_intr_enable_t)(mac_intr_handle_t); -typedef int (*mac_intr_disable_t)(mac_intr_handle_t); - -typedef struct mac_intr_s { - mac_intr_handle_t mi_handle; - mac_intr_enable_t mi_enable; - mac_intr_disable_t mi_disable; -} mac_intr_t; - -typedef void (*mac_get_ring_t)(void *, mac_ring_type_t, const int, const int, - mac_ring_info_t *, mac_ring_handle_t); -typedef void (*mac_get_group_t)(void *, mac_ring_type_t, const int, - mac_group_info_t *, mac_group_handle_t); - -typedef void (*mac_group_add_ring_t)(mac_group_driver_t, - mac_ring_driver_t, mac_ring_type_t); -typedef void (*mac_group_rem_ring_t)(mac_group_driver_t, - mac_ring_driver_t, mac_ring_type_t); - -/* - * Multiple Rings Capability - */ -typedef struct mac_capab_rings_s { - mac_ring_type_t mr_type; /* Ring type */ - mac_group_type_t mr_group_type; /* Grouping type */ - void *mr_handle; /* Group Driver Handle. 
*/
- uint_t mr_rnum; /* Number of rings */
- uint_t mr_gnum; /* Number of ring groups */
- mac_get_ring_t mr_rget; /* Get ring from driver */
- mac_get_group_t mr_gget; /* Get ring group from driver */
- mac_group_add_ring_t mr_gadd_ring; /* Add ring into a group */
- mac_group_rem_ring_t mr_grem_ring; /* Remove ring from a group */
-} mac_capab_rings_t;
-
-/*
- * Common ring functions and driver interfaces
+ * This capability allows the MAC layer to detect when a VNIC is being
+ * accessed, and implement the required shortcuts.
 */
-typedef int (*mac_ring_start_t)(mac_ring_driver_t);
-typedef void (*mac_ring_stop_t)(mac_ring_driver_t);
-typedef mblk_t *(*mac_ring_send_t)(void *, mblk_t *);
-typedef mblk_t *(*mac_ring_poll_t)(void *, int);
+typedef void *(*mac_client_handle_fn_t)(void *);
-typedef struct mac_ring_info_s {
- mac_ring_driver_t mr_driver;
- mac_ring_start_t mr_start;
- mac_ring_stop_t mr_stop;
- mac_intr_t mr_intr;
- union {
- mac_ring_send_t send;
- mac_ring_poll_t poll;
- } mrfunion;
-} mac_ring_info_s;
-
-#define mr_send mrfunion.send
-#define mr_poll mrfunion.poll
-
-typedef int (*mac_group_start_t)(mac_group_driver_t);
-typedef void (*mac_group_stop_t)(mac_group_driver_t);
-typedef int (*mac_add_mac_addr_t)(void *, const uint8_t *);
-typedef int (*mac_rem_mac_addr_t)(void *, const uint8_t *);
-
-struct mac_group_info_s {
- mac_group_driver_t mrg_driver; /* Driver reference */
- mac_group_start_t mrg_start; /* Start the group */
- mac_group_stop_t mrg_stop; /* Stop the group */
- uint_t mrg_count; /* Count of rings */
- mac_intr_t mrg_intr; /* Optional per-group intr */
-
- /* Only used for rx groups */
- mac_add_mac_addr_t mrg_addmac; /* Add a MAC address */
- mac_rem_mac_addr_t mrg_remmac; /* Remove a MAC address */
-};
-
-/*
- * Share management functions.
- */
-typedef uint64_t mac_share_handle_t;
+typedef struct mac_capab_vnic_s {
+ void *mcv_arg;
+ mac_client_handle_fn_t mcv_mac_client_handle;
+} mac_capab_vnic_t;
-/*
- * Returns a Share handle to the client calling from above.
- */
-typedef int (*mac_alloc_share_t)(void *, uint64_t cookie,
- uint64_t *rcookie, mac_share_handle_t *);
-
-/*
- * Destroys the share previously allocated and deallocates
- * all share resources (e.g. DMA's assigned to the share).
- */
-typedef void (*mac_free_share_t)(mac_share_handle_t);
-
-typedef void (*mac_share_query_t)(mac_share_handle_t shdl,
- mac_ring_type_t type, uint32_t *rmin, uint32_t *rmax,
- uint64_t *rmap, uint64_t *gnum);
-
-/*
- * Basic idea, bind previously created ring groups to shares
- * for them to be exported (or shared) by another domain.
- * These interfaces bind/unbind the ring group to a share. Doing
- * so causes the resources to be shared with the guest.
- */
-typedef int (*mac_share_add_group_t)(mac_share_handle_t,
- mac_group_handle_t);
-typedef int (*mac_share_rem_group_t)(mac_share_handle_t,
- mac_group_handle_t);
-
-typedef struct mac_capab_share_s {
- uint_t ms_snum; /* Number of shares (vr's) */
- void *ms_handle; /* Handle to driver. */
- mac_alloc_share_t ms_salloc; /* Get a share from driver. */
- mac_free_share_t ms_sfree; /* Return a share to driver. */
- mac_share_add_group_t ms_sadd; /* Add a group to the share. */
- mac_share_rem_group_t ms_sremove; /* Remove group from share.
*/
- mac_share_query_t ms_squery; /* Query share constraints */
-} mac_capab_share_t;
+typedef void (*mac_rename_fn_t)(const char *, void *);
+typedef struct mac_capab_aggr_s {
+ mac_rename_fn_t mca_rename_fn;
+ int (*mca_unicst)(void *, const uint8_t *);
+} mac_capab_aggr_t;
-/*
- * Flags for mc_callbacks. Requiring drivers to set the flags associated
- * with optional callbacks initialized in the structure allows the mac
- * module to add optional callbacks in the future without requiring drivers
- * to recompile.
- */
-#define MC_RESOURCES 0x001
-#define MC_IOCTL 0x002
-#define MC_GETCAPAB 0x004
-#define MC_OPEN 0x008
-#define MC_CLOSE 0x010
-#define MC_SETPROP 0x020
-#define MC_GETPROP 0x040
-
-#define MAC_MAX_MINOR 1000
-
-typedef struct mac_register_s {
- uint_t m_version; /* set by mac_alloc() */
- const char *m_type_ident;
- void *m_driver; /* Driver private data */
- dev_info_t *m_dip;
- uint_t m_instance;
- uint8_t *m_src_addr;
- uint8_t *m_dst_addr;
- mac_callbacks_t *m_callbacks;
- uint_t m_min_sdu;
- uint_t m_max_sdu;
- void *m_pdata;
- size_t m_pdata_size;
- uint32_t m_margin;
- mac_priv_prop_t *m_priv_props;
- size_t m_priv_prop_count;
-} mac_register_t;
-
-
-/*
- * Opaque handle types.
- */
-typedef struct mac_t *mac_handle_t;
-typedef struct __mac_notify_handle *mac_notify_handle_t;
-typedef struct __mac_rx_handle *mac_rx_handle_t;
-typedef struct __mac_txloop_handle *mac_txloop_handle_t;
-typedef struct __mac_resource_handle *mac_resource_handle_t;
-
-/*
- * MAC interface callback types.
- */
 typedef enum {
 	MAC_NOTE_LINK,
 	MAC_NOTE_PROMISC,
@@ -604,15 +305,15 @@ typedef enum {
 	MAC_NOTE_DEVPROMISC,
 	MAC_NOTE_FASTPATH_FLUSH,
 	MAC_NOTE_SDU_SIZE,
-	MAC_NOTE_VNIC,
 	MAC_NOTE_MARGIN,
+	MAC_NOTE_CAPAB_CHG,
 	MAC_NNOTE /* must be the last entry */
 } mac_notify_type_t;
 
 typedef void (*mac_notify_t)(void *, mac_notify_type_t);
-typedef void (*mac_rx_t)(void *, mac_resource_handle_t, mblk_t *);
-typedef void (*mac_txloop_t)(void *, mblk_t *);
-typedef void (*mac_blank_t)(void *, time_t, uint_t);
+typedef void (*mac_rx_t)(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+typedef mblk_t *(*mac_receive_t)(void *, int);
 
 /*
  * MAC promiscuous types
@@ -629,26 +330,38 @@ typedef enum {
 	MAC_RX_FIFO = 1
 } mac_resource_type_t;
 
+typedef int (*mac_intr_enable_t)(mac_intr_handle_t);
+typedef int (*mac_intr_disable_t)(mac_intr_handle_t);
+
+typedef struct mac_intr_s {
+ mac_intr_handle_t mi_handle;
+ mac_intr_enable_t mi_enable;
+ mac_intr_disable_t mi_disable;
+} mac_intr_t;
+
 typedef struct mac_rx_fifo_s {
 	mac_resource_type_t mrf_type; /* MAC_RX_FIFO */
- mac_blank_t mrf_blank;
- void *mrf_arg;
- time_t mrf_normal_blank_time;
- uint_t mrf_normal_pkt_count;
+ mac_intr_t mrf_intr;
+ mac_receive_t mrf_receive;
+ void *mrf_rx_arg;
+ uint32_t mrf_flow_priority;
+ /*
+ * The CPU this flow is to be processed on. With intrd and future
+ * things, we should know which CPU the flow needs to be processed on
+ * and get a squeue assigned on that CPU.
+ */ + uint_t mrf_cpu_id; } mac_rx_fifo_t; -typedef struct mac_txinfo_s { - mac_tx_t mt_fn; - void *mt_arg; -} mac_txinfo_t; +#define mrf_intr_handle mrf_intr.mi_handle +#define mrf_intr_enable mrf_intr.mi_enable +#define mrf_intr_disable mrf_intr.mi_disable typedef union mac_resource_u { mac_resource_type_t mr_type; mac_rx_fifo_t mr_fifo; } mac_resource_t; -typedef mac_resource_handle_t (*mac_resource_add_t)(void *, mac_resource_t *); - typedef enum { MAC_ADDRTYPE_UNICAST, MAC_ADDRTYPE_MULTICAST, @@ -664,11 +377,29 @@ typedef struct mac_header_info_s { uint32_t mhi_bindsap; mac_addrtype_t mhi_dsttype; uint16_t mhi_tci; - uint_t mhi_istagged:1, - mhi_prom_looped:1; + boolean_t mhi_istagged; } mac_header_info_t; /* + * Function pointer to match dls client signature. Should be same as + * dls_rx_t to allow a soft ring to bypass DLS layer and call a DLS + * client directly. + */ +typedef void (*mac_direct_rx_t)(void *, mac_resource_handle_t, + mblk_t *, mac_header_info_t *); + +typedef mac_resource_handle_t (*mac_resource_add_t)(void *, mac_resource_t *); +typedef int (*mac_resource_bind_t)(void *, + mac_resource_handle_t, processorid_t); +typedef void (*mac_resource_remove_t)(void *, void *); +typedef void (*mac_resource_quiesce_t)(void *, void *); +typedef void (*mac_resource_restart_t)(void *, void *); +typedef int (*mac_resource_modify_t)(void *, void *, + mac_resource_t *); +typedef void (*mac_change_upcall_t)(void *, mac_direct_rx_t, + void *); + +/* * MAC-Type plugin interfaces */ @@ -782,6 +513,13 @@ typedef struct mac_ndd_mapping_s { #define mp_prop_id u_mp_id.u_id #define mp_kstat u_mp_id.u_kstat +typedef struct mac_stat_info_s { + uint_t msi_stat; + char *msi_name; + uint_t msi_type; /* as defined in kstat_named_init(9F) */ + uint64_t msi_default; +} mac_stat_info_t; + typedef struct mactype_register_s { uint_t mtr_version; /* set by mactype_alloc() */ const char *mtr_ident; @@ -803,107 +541,25 @@ typedef struct mac_prop_s { } mac_prop_t; /* - * Client interface functions. + * Driver interface functions. 
*/ -extern int mac_open(const char *, mac_handle_t *); extern int mac_open_by_linkid(datalink_id_t, mac_handle_t *); extern int mac_open_by_linkname(const char *, mac_handle_t *); -extern void mac_close(mac_handle_t); -extern const mac_info_t *mac_info(mac_handle_t); -extern boolean_t mac_info_get(const char *, mac_info_t *); -extern uint64_t mac_stat_get(mac_handle_t, uint_t); -extern int mac_start(mac_handle_t); -extern void mac_stop(mac_handle_t); -extern int mac_promisc_set(mac_handle_t, boolean_t, - mac_promisc_type_t); -extern boolean_t mac_promisc_get(mac_handle_t, - mac_promisc_type_t); -extern int mac_multicst_add(mac_handle_t, const uint8_t *); -extern int mac_multicst_remove(mac_handle_t, - const uint8_t *); -extern boolean_t mac_unicst_verify(mac_handle_t, - const uint8_t *, uint_t); -extern int mac_unicst_set(mac_handle_t, const uint8_t *); -extern void mac_unicst_get(mac_handle_t, uint8_t *); -extern void mac_dest_get(mac_handle_t, uint8_t *); -extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *); -extern void mac_resources(mac_handle_t); -extern void mac_ioctl(mac_handle_t, queue_t *, mblk_t *); -extern const mac_txinfo_t *mac_tx_get(mac_handle_t); -extern const mac_txinfo_t *mac_vnic_tx_get(mac_handle_t); -extern link_state_t mac_link_get(mac_handle_t); -extern mac_notify_handle_t mac_notify_add(mac_handle_t, mac_notify_t, - void *); -extern void mac_notify_remove(mac_handle_t, - mac_notify_handle_t); -extern void mac_notify(mac_handle_t); -extern mac_rx_handle_t mac_rx_add(mac_handle_t, mac_rx_t, void *); -extern mac_rx_handle_t mac_active_rx_add(mac_handle_t, mac_rx_t, - void *); -extern void mac_rx_remove(mac_handle_t, mac_rx_handle_t, - boolean_t); -extern void mac_rx_remove_wait(mac_handle_t); -extern mblk_t *mac_txloop(void *, mblk_t *); -extern mac_txloop_handle_t mac_txloop_add(mac_handle_t, mac_txloop_t, - void *); -extern void mac_txloop_remove(mac_handle_t, - mac_txloop_handle_t); -extern boolean_t mac_active_set(mac_handle_t); -extern boolean_t mac_active_shareable_set(mac_handle_t); -extern void mac_active_clear(mac_handle_t); -extern void mac_active_rx(void *, mac_resource_handle_t, - mblk_t *); -extern boolean_t mac_vnic_set(mac_handle_t, mac_txinfo_t *, - mac_getcapab_t, void *); -extern void mac_vnic_clear(mac_handle_t); -extern void mac_resource_set(mac_handle_t, - mac_resource_add_t, void *); -extern dev_info_t *mac_devinfo_get(mac_handle_t); extern const char *mac_name(mac_handle_t); extern minor_t mac_minor(mac_handle_t); -extern boolean_t mac_capab_get(mac_handle_t, mac_capab_t, - void *); -extern boolean_t mac_vnic_capab_get(mac_handle_t, mac_capab_t, - void *); -extern boolean_t mac_sap_verify(mac_handle_t, uint32_t, - uint32_t *); -extern mblk_t *mac_header(mac_handle_t, const uint8_t *, - uint32_t, mblk_t *, size_t); -extern int mac_header_info(mac_handle_t, mblk_t *, - mac_header_info_t *); -extern mblk_t *mac_header_cook(mac_handle_t, mblk_t *); -extern mblk_t *mac_header_uncook(mac_handle_t, mblk_t *); extern minor_t mac_minor_hold(boolean_t); extern void mac_minor_rele(minor_t); +extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *); +extern int mac_maxsdu_update(mac_handle_t, uint_t); -/* - * Driver interface functions. 
- */ -extern mac_register_t *mac_alloc(uint_t); -extern void mac_free(mac_register_t *); -extern int mac_register(mac_register_t *, mac_handle_t *); -extern int mac_disable(mac_handle_t); -extern int mac_unregister(mac_handle_t); -extern void mac_rx(mac_handle_t, mac_resource_handle_t, - mblk_t *); -extern void mac_link_update(mac_handle_t, link_state_t); extern void mac_unicst_update(mac_handle_t, const uint8_t *); -extern void mac_tx_update(mac_handle_t); extern void mac_resource_update(mac_handle_t); -extern mac_resource_handle_t mac_resource_add(mac_handle_t, - mac_resource_t *); -extern int mac_maxsdu_update(mac_handle_t, uint_t); +extern void mac_capab_update(mac_handle_t); extern int mac_pdata_update(mac_handle_t, void *, size_t); -extern void mac_multicst_refresh(mac_handle_t, - mac_multicst_t, void *, boolean_t); -extern void mac_unicst_refresh(mac_handle_t, mac_unicst_t, - void *); -extern void mac_promisc_refresh(mac_handle_t, - mac_setpromisc_t, void *); extern boolean_t mac_margin_update(mac_handle_t, uint32_t); extern void mac_margin_get(mac_handle_t, uint32_t *); extern int mac_margin_remove(mac_handle_t, uint32_t); @@ -912,18 +568,17 @@ extern int mac_margin_add(mac_handle_t, uint32_t *, extern void mac_init_ops(struct dev_ops *, const char *); extern void mac_fini_ops(struct dev_ops *); extern uint32_t mac_no_notification(mac_handle_t); -extern boolean_t mac_is_legacy(mac_handle_t); -extern int mac_hold_exclusive(mac_handle_t); -extern void mac_rele_exclusive(mac_handle_t); extern mactype_register_t *mactype_alloc(uint_t); extern void mactype_free(mactype_register_t *); extern int mactype_register(mactype_register_t *); extern int mactype_unregister(const char *); -extern int mac_set_prop(mac_handle_t, mac_prop_t *, - void *, uint_t); -extern int mac_get_prop(mac_handle_t, mac_prop_t *, - void *, uint_t, uint_t *); +extern void mac_set_ring(void *, void *); + +extern void mac_start_logusage(mac_logtype_t, uint_t); +extern void mac_stop_logusage(mac_logtype_t); + +extern mac_handle_t mac_get_lower_mac_handle(mac_handle_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h new file mode 100644 index 0000000000..f1743577ef --- /dev/null +++ b/usr/src/uts/common/sys/mac_client.h @@ -0,0 +1,184 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * This file captures the MAC client API definitions. It can be + * included from any MAC clients. 
+ */ + +#ifndef _SYS_MAC_CLIENT_H +#define _SYS_MAC_CLIENT_H + +#include <sys/mac.h> +#include <sys/mac_flow.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* + * MAC client interface. + */ + +typedef struct __mac_client_handle *mac_client_handle_t; +typedef struct __mac_unicast_handle *mac_unicast_handle_t; +typedef struct __mac_promisc_handle *mac_promisc_handle_t; +typedef struct __mac_perim_handle *mac_perim_handle_t; +typedef uintptr_t mac_tx_cookie_t; + +typedef void (*mac_tx_notify_t)(void *, mac_tx_cookie_t); + +typedef enum { + MAC_DIAG_NONE, + MAC_DIAG_MACADDR_NIC, + MAC_DIAG_MACADDR_INUSE, + MAC_DIAG_MACADDR_INVALID, + MAC_DIAG_MACADDRLEN_INVALID, + MAC_DIAG_MACFACTORYSLOTINVALID, + MAC_DIAG_MACFACTORYSLOTUSED, + MAC_DIAG_MACFACTORYSLOTALLUSED, + MAC_DIAG_MACFACTORYNOTSUP, + MAC_DIAG_MACPREFIX_INVALID, + MAC_DIAG_MACPREFIXLEN_INVALID, + MAC_DIAG_MACNO_HWRINGS +} mac_diag_t; + +typedef enum { + MAC_CLIENT_PROMISC_ALL, + MAC_CLIENT_PROMISC_FILTERED, + MAC_CLIENT_PROMISC_MULTI +} mac_client_promisc_type_t; + +/* flags passed to mac_unicast_add() */ +#define MAC_UNICAST_NODUPCHECK 0x0001 +#define MAC_UNICAST_PRIMARY 0x0002 +#define MAC_UNICAST_HW 0x0004 +#define MAC_UNICAST_VNIC_PRIMARY 0x0008 + +/* flags passed to mac_client_open */ +#define MAC_OPEN_FLAGS_IS_VNIC 0x0001 +#define MAC_OPEN_FLAGS_EXCLUSIVE 0x0002 +#define MAC_OPEN_FLAGS_TAG_DISABLE 0x0004 +#define MAC_OPEN_FLAGS_IS_AGGR_PORT 0x0008 +#define MAC_OPEN_FLAGS_STRIP_DISABLE 0x0010 +#define MAC_OPEN_FLAGS_NO_HWRINGS 0x0020 +#define MAC_OPEN_FLAGS_SHARES_DESIRED 0x0040 +#define MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK 0x0080 +#define MAC_OPEN_FLAGS_USE_DATALINK_NAME 0x0100 +#define MAC_OPEN_FLAGS_REQ_HWRINGS 0x0200 + +/* flags passed to mac_client_close */ +#define MAC_CLOSE_FLAGS_IS_VNIC 0x0001 +#define MAC_CLOSE_FLAGS_EXCLUSIVE 0x0002 +#define MAC_CLOSE_FLAGS_IS_AGGR_PORT 0x0004 + +/* flags passed to mac_promisc_add() */ +#define MAC_PROMISC_FLAGS_NO_TX_LOOP 0x0001 +#define MAC_PROMISC_FLAGS_NO_PHYS 0x0002 + +/* flags passed to mac_tx() */ +#define MAC_DROP_ON_NO_DESC 0x01 /* freemsg() if no tx descs */ +#define MAC_TX_NO_ENQUEUE 0x02 /* don't enqueue mblks if not xmit'ed */ +#define MAC_TX_NO_HOLD 0x04 /* don't bump the active Tx count */ + +extern int mac_client_open(mac_handle_t, mac_client_handle_t *, char *, + uint16_t); +extern void mac_client_close(mac_client_handle_t, uint16_t); + +extern int mac_unicast_add(mac_client_handle_t, uint8_t *, uint16_t, + mac_unicast_handle_t *, uint16_t, mac_diag_t *); +extern int mac_unicast_primary_add(mac_client_handle_t, mac_unicast_handle_t *, + mac_diag_t *); +extern int mac_unicast_remove(mac_client_handle_t, mac_unicast_handle_t); + +extern int mac_multicast_add(mac_client_handle_t, const uint8_t *); +extern void mac_multicast_remove(mac_client_handle_t, const uint8_t *); + +extern void mac_rx_set(mac_client_handle_t, mac_rx_t, void *); +extern void mac_rx_clear(mac_client_handle_t); +extern mac_tx_cookie_t mac_tx(mac_client_handle_t, mblk_t *, + uintptr_t, uint16_t, mblk_t **); +extern boolean_t mac_tx_is_flow_blocked(mac_client_handle_t, mac_tx_cookie_t); +extern uint64_t mac_client_stat_get(mac_client_handle_t, uint_t); + +extern int mac_promisc_add(mac_client_handle_t, mac_client_promisc_type_t, + mac_rx_t, void *, mac_promisc_handle_t *, uint16_t); +extern int mac_promisc_remove(mac_promisc_handle_t); + +extern mac_notify_handle_t mac_notify_add(mac_handle_t, mac_notify_t, void *); +extern int mac_notify_remove(mac_notify_handle_t, boolean_t); +extern 
void mac_notify_remove_wait(mac_handle_t); +extern int mac_rename_primary(mac_handle_t, const char *); +extern char *mac_client_name(mac_client_handle_t); + +extern int mac_open(const char *, mac_handle_t *); +extern void mac_close(mac_handle_t); +extern uint64_t mac_stat_get(mac_handle_t, uint_t); + +extern int mac_unicast_primary_set(mac_handle_t, const uint8_t *); +extern void mac_unicast_primary_get(mac_handle_t, uint8_t *); +extern void mac_unicast_primary_info(mac_handle_t, char *, boolean_t *); + +extern int mac_addr_random(mac_client_handle_t, uint_t, uint8_t *, + mac_diag_t *); + +extern int mac_addr_factory_reserve(mac_client_handle_t, int *); +extern void mac_addr_factory_release(mac_client_handle_t, uint_t); +extern void mac_addr_factory_value(mac_handle_t, int, uchar_t *, uint_t *, + char *, boolean_t *); +extern uint_t mac_addr_factory_num(mac_handle_t); + +extern uint_t mac_addr_len(mac_handle_t); + +extern mac_tx_notify_handle_t mac_client_tx_notify(mac_client_handle_t, + mac_tx_notify_t, void *); + +extern int mac_set_resources(mac_handle_t, mac_resource_props_t *); +extern void mac_get_resources(mac_handle_t, mac_resource_props_t *); +extern int mac_client_set_resources(mac_client_handle_t, + mac_resource_props_t *); +extern void mac_client_get_resources(mac_client_handle_t, + mac_resource_props_t *); + +extern int mac_share_capable(mac_handle_t); +extern int mac_share_bind(mac_client_handle_t, uint64_t, uint64_t *); +extern void mac_share_unbind(mac_client_handle_t); + +extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *); + +extern uint_t mac_hwgrp_num(mac_handle_t); +extern void mac_get_hwgrp_info(mac_handle_t, int, uint_t *, uint_t *, + uint_t *, uint_t *, char *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_CLIENT_H */ diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h new file mode 100644 index 0000000000..29d2a40ff1 --- /dev/null +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -0,0 +1,318 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_MAC_CLIENT_IMPL_H +#define _SYS_MAC_CLIENT_IMPL_H + +#include <sys/modhash.h> +#include <sys/mac_client.h> +#include <sys/mac_provider.h> +#include <sys/mac.h> +#include <sys/mac_impl.h> +#include <net/if.h> +#include <sys/mac_flow_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern kmem_cache_t *mac_client_impl_cache; +extern kmem_cache_t *mac_unicast_impl_cache; +extern kmem_cache_t *mac_promisc_impl_cache; + +/* + * Need a list to chain all VIDs assigned to a client. Normally, one + * MAC client only has one VID. 
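A minimal sketch of how a kernel consumer might drive the client API declared above; the xx_ names and the "e1000g0" instance are hypothetical, most error paths are elided, and the flags a real client passes to mac_client_open()/mac_unicast_add() depend on its type:

static void
xx_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t is_loopback)
{
	/* a real client would hand the chain up its own stack */
	freemsgchain(mp);
}

static int
xx_client_attach(mac_handle_t *mhp, mac_client_handle_t *mchp,
    mac_unicast_handle_t *muhp)
{
	mac_diag_t diag;
	int err;

	if ((err = mac_open("e1000g0", mhp)) != 0)
		return (err);
	if ((err = mac_client_open(*mhp, mchp, "xx_client", 0)) != 0) {
		mac_close(*mhp);
		return (err);
	}
	/* use the NIC's primary address; factory/random addrs also exist */
	if ((err = mac_unicast_primary_add(*mchp, muhp, &diag)) != 0) {
		mac_client_close(*mchp, 0);
		mac_close(*mhp);
		return (err);
	}
	/* start receiving; the handles are kept for the client's lifetime */
	mac_rx_set(*mchp, xx_rx, NULL);
	return (0);
}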
But vsw might need multiple VIDs.
+ */
+typedef struct mac_unicast_impl_s { /* Protected by */
+ struct mac_unicast_impl_s *mui_next; /* SL */
+ mac_address_t *mui_map; /* SL */
+ uint16_t mui_vid; /* SL */
+} mac_unicast_impl_t;
+
+#define MAC_CLIENT_FLAGS_PRIMARY 0x0001
+#define MAC_CLIENT_FLAGS_VNIC_PRIMARY 0x0002
+
+/*
+ * One of these is instantiated per MAC client promiscuous callback.
+ *
+ * Each element of this structure belongs to two linked lists. One
+ * for the mac_client_impl_t (mci_promisc_list) which allocated
+ * the callback, the other for the mac_impl_t (mi_promisc_list) corresponding
+ * to the MAC client.
+ * The former allows us to do bookkeeping, the latter allows us
+ * to more efficiently dispatch packets to the promiscuous callbacks.
+ */
+typedef struct mac_promisc_impl_s { /* Protected by */
+ mac_cb_t mpi_mci_link; /* mi_promisc_lock */
+ mac_cb_t mpi_mi_link; /* mi_promisc_lock */
+ mac_client_promisc_type_t mpi_type; /* WO */
+ mac_rx_t mpi_fn; /* WO */
+ void *mpi_arg; /* WO */
+ struct mac_client_impl_s *mpi_mcip; /* WO */
+ boolean_t mpi_no_tx_loop; /* WO */
+ boolean_t mpi_no_phys; /* WO */
+} mac_promisc_impl_t;
+
+typedef union mac_tx_percpu_s {
+ struct {
+ kmutex_t _pcpu_tx_lock;
+ uint_t _pcpu_tx_refcnt;
+ } pcpu_lr;
+ uchar_t pcpu_pad[64];
+} mac_tx_percpu_t;
+
+#define pcpu_tx_lock pcpu_lr._pcpu_tx_lock
+#define pcpu_tx_refcnt pcpu_lr._pcpu_tx_refcnt
+
+/*
+ * One of these is instantiated for each MAC client.
+ */
+struct mac_client_impl_s { /* Protected by */
+ struct mac_client_impl_s *mci_client_next; /* mi_rw_lock */
+ char mci_name[MAXNAMELEN]; /* mi_rw_lock */
+ /*
+ * This flow entry will contain all the internal constructs
+ * such as SRS etc. for this MAC client. The MAC client may
+ * have more than one flow corresponding to each upper client
+ * sharing this mac_client_impl_t.
+ */
+ flow_entry_t *mci_flent; /* mi_rw_lock */
+ struct mac_impl_s *mci_mip; /* WO */
+ /*
+ * If this is a client that has a pass thru MAC (e.g. a VNIC),
+ * then we also keep the handle for the client's upper MAC.
+ */
+ struct mac_impl_s *mci_upper_mip; /* WO */
+
+ uint32_t mci_state_flags; /* WO */
+ mac_rx_t mci_rx_fn; /* Rx Quiescence */
+ void *mci_rx_arg; /* Rx Quiescence */
+ mac_direct_rx_t mci_direct_rx_fn; /* SL */
+ void *mci_direct_rx_arg; /* SL */
+
+ mac_cb_t *mci_promisc_list; /* mi_promisc_lock */
+
+ mac_address_t *mci_unicast;
+ uint32_t mci_flags; /* SL */
+ krwlock_t mci_rw_lock;
+ mac_unicast_impl_t *mci_unicast_list; /* mci_rw_lock */
+ /*
+ * The mac_client_impl_t may be shared by multiple clients, i.e.
+ * multiple VLANs sharing the same MAC client. In this case the
+ * address/vid tuples differ and are each associated with their
+ * own flow entry, but the rest of the underlying components (SRS, etc.)
+ * are common.
+ */
+ flow_entry_t *mci_flent_list; /* mci_rw_lock */
+ uint_t mci_nflents; /* mci_rw_lock */
+ uint_t mci_nvids; /* mci_rw_lock */
+
+ /* Resource Management Functions */
+ mac_resource_add_t mci_resource_add; /* SL */
+ mac_resource_remove_t mci_resource_remove; /* SL */
+ mac_resource_quiesce_t mci_resource_quiesce; /* SL */
+ mac_resource_restart_t mci_resource_restart; /* SL */
+ mac_resource_bind_t mci_resource_bind; /* SL */
+ void *mci_resource_arg; /* SL */
+
+
+ /* Tx notify callback */
+ kmutex_t mci_tx_cb_lock;
+ mac_cb_info_t mci_tx_notify_cb_info; /* cb list info */
+ mac_cb_t *mci_tx_notify_cb_list; /* The cb list */
+ uintptr_t mci_tx_notify_id;
+
+ /* per MAC client stats */ /* None */
+ uint64_t mci_stat_multircv;
+ uint64_t mci_stat_brdcstrcv;
+ uint64_t mci_stat_multixmt;
+ uint64_t mci_stat_brdcstxmt;
+ uint64_t mci_stat_obytes;
+ uint64_t mci_stat_opackets;
+ uint64_t mci_stat_oerrors;
+ uint64_t mci_stat_ibytes;
+ uint64_t mci_stat_ipackets;
+ uint64_t mci_stat_ierrors;
+
+ flow_tab_t *mci_subflow_tab; /* Rx quiescence */
+
+ /*
+ * Priority range for this MAC client. This is the range
+ * corresponding to the priority configured (nr_flow_priority).
+ */
+ pri_t mci_min_pri;
+ pri_t mci_max_pri;
+
+ /*
+ * Hybrid I/O related definitions.
+ */
+ mac_share_handle_t mci_share;
+ boolean_t mci_share_bound;
+ boolean_t mci_no_hwrings;
+
+ /* The client requests a hardware group */
+ boolean_t mci_req_hwrings;
+
+ /* for multicast support */
+ struct mac_mcast_addrs_s *mci_mcast_addrs; /* mi_rw_lock */
+
+ /*
+ * Protected by mci_tx_pcpu[0].pcpu_tx_lock
+ */
+ uint_t mci_tx_flag;
+ kcondvar_t mci_tx_cv;
+
+ /* Must be last in the structure for dynamic sizing */
+ mac_tx_percpu_t mci_tx_pcpu[1]; /* SL */
+};
+
+#define MAC_CLIENT_IMPL_SIZE \
+ (sizeof (mac_client_impl_t) + \
+ (mac_tx_percpu_cnt * sizeof (mac_tx_percpu_t)))
+
+extern int mac_tx_percpu_cnt;
+
+#define MCIP_TX_SRS(mcip) \
+ ((mcip)->mci_flent == NULL ? NULL : (mcip)->mci_flent->fe_tx_srs)
+
+/* Defensive coding, non-null mci_flent could be an assert */
+
+#define MCIP_DATAPATH_SETUP(mcip) \
+ ((mcip)->mci_flent == NULL ? B_FALSE : \
+ !((mcip)->mci_flent->fe_flags & FE_MC_NO_DATAPATH))
+
+#define MCIP_RESOURCE_PROPS(mcip) \
+ ((mcip)->mci_flent == NULL ? NULL : \
+ &(mcip)->mci_flent->fe_resource_props)
+
+#define MCIP_EFFECTIVE_PROPS(mcip) \
+ (mcip->mci_flent == NULL ? NULL : \
+ &(mcip)->mci_flent->fe_effective_props)
+
+#define MCIP_RESOURCE_PROPS_MASK(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_mask)
+
+#define MCIP_RESOURCE_PROPS_MAXBW(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_maxbw)
+
+#define MCIP_RESOURCE_PROPS_PRIORITY(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_priority)
+
+#define MCIP_RESOURCE_PROPS_CPUS(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ &(mcip)->mci_flent->fe_resource_props.mrp_cpus)
+
+#define MCIP_RESOURCE_PROPS_NCPUS(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_ncpus)
+
+#define MCIP_RESOURCE_PROPS_CPU(mcip) \
+ ((mcip)->mci_flent == NULL ? 0 : \
+ (mcip)->mci_flent->fe_resource_props.mrp_ncpu)
+
+/*
+ * We validate the VLAN id of the packet w.r.t the client's vid,
+ * if required (i.e. !MCIS_DISABLE_TX_VID_CHECK). DLS clients
+ * will have MCIS_DISABLE_TX_VID_CHECK set.
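Written out as a plain function, the transmit-side VID validation described here (and encoded by the MAC_VID_CHECK macro below) looks roughly as follows; a sketch only, assuming <sys/ethernet.h> and <sys/vlan.h> for the frame layout and VLAN_ID():

static int
xx_vid_check(mac_client_impl_t *mcip, mblk_t *mp)
{
	struct ether_header *ehp = (void *)mp->b_rptr;

	if (ntohs(ehp->ether_type) != ETHERTYPE_VLAN)
		return (0); /* untagged frame: nothing to validate */

	if ((mcip->mci_state_flags & MCIS_TAG_DISABLE) != 0) {
		struct ether_vlan_header *evhp = (void *)mp->b_rptr;
		uint16_t vid = VLAN_ID(ntohs(evhp->ether_tci));

		/* the tag is acceptable if it names one of the client VIDs */
		if (mac_client_check_flow_vid(mcip, vid))
			return (0);
	}
	return (EINVAL); /* caller is expected to freemsg() */
}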
+ * (In the case of aggr when we get back packets, due to + * the underlying driver being flow controlled, we won't + * drop the packet even if it is VLAN tagged as we + * don't set MCIS_DISABLE_TX_VID_CHECK for an aggr.) + */ +#define MAC_VID_CHECK_NEEDED(mcip) \ + (((mcip)->mci_state_flags & MCIS_DISABLE_TX_VID_CHECK) == 0 && \ + (mcip)->mci_mip->mi_info.mi_nativemedia == DL_ETHER) + +#define MAC_VID_CHECK(mcip, mp, err) { \ + if (ntohs(((struct ether_header *)(mp)->b_rptr)->ether_type) == \ + ETHERTYPE_VLAN) { \ + /* \ + * err is set to EINVAL (so the caller can take the \ + * appropriate action. e.g. freemsg()) for two cases: \ + * -client is not responsible for filling in the vid. \ + * -client is responsible for filling in the vid, but \ + * the vid doesn't match the vid of the MAC client. \ + */ \ + (err) = EINVAL; \ + if (((mcip)->mci_state_flags & MCIS_TAG_DISABLE) != 0) {\ + struct ether_vlan_header *evhp; \ + uint16_t vlanid; \ + \ + evhp = (struct ether_vlan_header *)(mp)->b_rptr;\ + vlanid = VLAN_ID(ntohs(evhp->ether_tci)); \ + if (mac_client_check_flow_vid((mcip), vlanid)) \ + (err) = 0; \ + } \ + } \ +} + +#define MAC_TAG_NEEDED(mcip) \ + (((mcip)->mci_state_flags & MCIS_TAG_DISABLE) == 0 && \ + (mcip)->mci_nvids == 1) \ + +/* MCI state flags */ +#define MCIS_IS_VNIC 0x0001 +#define MCIS_EXCLUSIVE 0x0002 +#define MCIS_TAG_DISABLE 0x0004 +#define MCIS_STRIP_DISABLE 0x0008 +#define MCIS_IS_AGGR_PORT 0x0010 +#define MCIS_CLIENT_POLL_CAPABLE 0x0020 +#define MCIS_DESC_LOGGED 0x0040 +#define MCIS_SHARE_BOUND 0x0080 +#define MCIS_NO_HWRINGS 0x0100 +#define MCIS_DISABLE_TX_VID_CHECK 0x0200 +#define MCIS_USE_DATALINK_NAME 0x0400 + +/* in mac_client.c */ +extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); +extern void mac_client_init(void); +extern void mac_client_fini(void); +extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, + mac_client_impl_t *); + +extern int mac_validate_props(mac_resource_props_t *); + +extern mac_client_impl_t *mac_vnic_lower(mac_impl_t *); +extern mac_client_impl_t *mac_primary_client_handle(mac_impl_t *); +extern uint16_t i_mac_flow_vid(flow_entry_t *); +extern boolean_t i_mac_capab_get(mac_handle_t, mac_capab_t, void *); + +extern void mac_unicast_update_clients(mac_impl_t *, mac_address_t *); +extern void mac_update_resources(mac_resource_props_t *, + mac_resource_props_t *, boolean_t); + +boolean_t mac_client_check_flow_vid(mac_client_impl_t *, uint16_t); + +extern boolean_t mac_is_primary_client(mac_client_impl_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_CLIENT_IMPL_H */ diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h new file mode 100644 index 0000000000..7e22552aeb --- /dev/null +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -0,0 +1,149 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * This file contains *private* MAC API definitions. This header file + * should only be included by kernel components which are part of the + * GLDv3 stack (dld, dls, aggr, softmac). + */ + +#ifndef _SYS_MAC_CLIENT_PRIV_H +#define _SYS_MAC_CLIENT_PRIV_H + +#include <sys/mac.h> +#include <sys/mac_flow.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#ifdef DEBUG +#define MAC_PERIM_HELD(mph) mac_perim_held(mph) +#else +#define MAC_PERIM_HELD(mph) +#endif + +extern boolean_t mac_rx_bypass_set(mac_client_handle_t, mac_direct_rx_t, + void *); + +extern const mac_info_t *mac_info(mac_handle_t); +extern boolean_t mac_info_get(const char *, mac_info_t *); +extern int mac_promisc_set(mac_handle_t, boolean_t, mac_promisc_type_t); +extern boolean_t mac_promisc_get(mac_handle_t, mac_promisc_type_t); + +extern void mac_ioctl(mac_handle_t, queue_t *, mblk_t *); +extern link_state_t mac_link_get(mac_handle_t); +extern void mac_resource_set(mac_client_handle_t, mac_resource_add_t, void *); +extern dev_info_t *mac_devinfo_get(mac_handle_t); +extern boolean_t mac_capab_get(mac_handle_t, mac_capab_t, void *); +extern boolean_t mac_sap_verify(mac_handle_t, uint32_t, uint32_t *); +extern mblk_t *mac_header(mac_handle_t, const uint8_t *, uint32_t, mblk_t *, + size_t); +extern int mac_header_info(mac_handle_t, mblk_t *, mac_header_info_t *); +extern mblk_t *mac_header_cook(mac_handle_t, mblk_t *); +extern mblk_t *mac_header_uncook(mac_handle_t, mblk_t *); + +extern void mac_resource_set_common(mac_client_handle_t, + mac_resource_add_t, mac_resource_remove_t, mac_resource_quiesce_t, + mac_resource_restart_t, mac_resource_bind_t, void *); + +extern void mac_perim_enter_by_mh(mac_handle_t, mac_perim_handle_t *); +extern int mac_perim_enter_by_macname(const char *, mac_perim_handle_t *); +extern int mac_perim_enter_by_linkid(datalink_id_t, mac_perim_handle_t *); +extern void mac_perim_exit(mac_perim_handle_t); +extern boolean_t mac_perim_held(mac_handle_t); + +extern uint16_t mac_client_vid(mac_client_handle_t); +extern int mac_vnic_unicast_set(mac_client_handle_t, const uint8_t *); + +extern void mac_client_poll_enable(mac_client_handle_t); +extern void mac_client_poll_disable(mac_client_handle_t); + +extern int mac_resource_ctl_set(mac_client_handle_t, mac_resource_props_t *); +extern void mac_resource_ctl_get(mac_client_handle_t, mac_resource_props_t *); + +/* + * Flow-related APIs for MAC clients. 
+ */ + +extern void mac_link_init_flows(mac_client_handle_t); +extern void mac_link_release_flows(mac_client_handle_t); +extern int mac_link_flow_add(datalink_id_t, char *, flow_desc_t *, + mac_resource_props_t *); +extern int mac_link_flow_remove(char *); +extern int mac_link_flow_modify(char *, mac_resource_props_t *); +extern boolean_t mac_link_has_flows(mac_client_handle_t); + +typedef struct { + char fi_flow_name[MAXNAMELEN]; + datalink_id_t fi_link_id; + flow_desc_t fi_flow_desc; + mac_resource_props_t fi_resource_props; +} mac_flowinfo_t; + +extern int mac_link_flow_walk(datalink_id_t, + int (*)(mac_flowinfo_t *, void *), void *); +extern int mac_link_flow_info(char *, mac_flowinfo_t *); + +extern void *mac_tx_hold(mac_client_handle_t); +extern void mac_tx_rele(mac_client_handle_t, void *); +extern void mac_rx_client_quiesce(mac_client_handle_t); +extern void mac_rx_client_restart(mac_client_handle_t); +extern void mac_srs_perm_quiesce(mac_client_handle_t, boolean_t); +extern int mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *, + mac_ring_handle_t *); +extern void mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t); +extern void mac_hwring_teardown(mac_ring_handle_t); +extern int mac_hwring_disable_intr(mac_ring_handle_t); +extern int mac_hwring_enable_intr(mac_ring_handle_t); +extern int mac_hwring_start(mac_ring_handle_t); +extern void mac_hwring_stop(mac_ring_handle_t); +extern mblk_t *mac_hwring_poll(mac_ring_handle_t, int); +#define MAC_HWRING_POLL(ring, bytes) \ + (((ring)->mr_info.mri_poll) \ + ((ring)->mr_info.mri_driver, (bytes))) + +extern int mac_hwgroup_addmac(mac_group_handle_t, const uint8_t *); +extern int mac_hwgroup_remmac(mac_group_handle_t, const uint8_t *); + +extern void mac_set_upper_mac(mac_client_handle_t, mac_handle_t); + +extern int mac_mark_exclusive(mac_handle_t); +extern void mac_unmark_exclusive(mac_handle_t); + +extern int32_t mac_client_intr_cpu(mac_client_handle_t); +extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t); +extern void *mac_get_devinfo(mac_handle_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_CLIENT_PRIV_H */ diff --git a/usr/src/uts/common/sys/mac_flow.h b/usr/src/uts/common/sys/mac_flow.h new file mode 100644 index 0000000000..05ed62a217 --- /dev/null +++ b/usr/src/uts/common/sys/mac_flow.h @@ -0,0 +1,210 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#ifndef _MAC_FLOW_H
+#define _MAC_FLOW_H
+
+/*
+ * Main structure describing a flow of packets, for classification use
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <netinet/in.h> /* for IPPROTO_* constants */
+#include <sys/ethernet.h>
+
+#define MAXFLOWNAME 32
+
+/* need to use MAXMACADDRLEN from dld.h instead of this one */
+#define MAXMACADDR 20
+
+/* Bit-mask for the selectors carried in the flow descriptor */
+typedef uint64_t flow_mask_t;
+
+#define FLOW_LINK_DST 0x00000001 /* Destination MAC addr */
+#define FLOW_LINK_SRC 0x00000002 /* Source MAC address */
+#define FLOW_LINK_VID 0x00000004 /* VLAN ID */
+#define FLOW_LINK_SAP 0x00000008 /* SAP value */
+
+#define FLOW_IP_VERSION 0x00000010 /* V4 or V6 */
+#define FLOW_IP_PROTOCOL 0x00000020 /* Protocol type */
+#define FLOW_IP_LOCAL 0x00000040 /* Local address */
+#define FLOW_IP_REMOTE 0x00000080 /* Remote address */
+#define FLOW_IP_DSFIELD 0x00000100 /* DSfield value */
+
+#define FLOW_ULP_PORT_LOCAL 0x00001000 /* ULP local port */
+#define FLOW_ULP_PORT_REMOTE 0x00002000 /* ULP remote port */
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct flow_desc_s {
+ flow_mask_t fd_mask;
+ uint32_t fd_mac_len;
+ uint8_t fd_dst_mac[MAXMACADDR];
+ uint8_t fd_src_mac[MAXMACADDR];
+ uint16_t fd_vid;
+ uint32_t fd_sap;
+ uint8_t fd_ipversion;
+ uint8_t fd_protocol;
+ in6_addr_t fd_local_addr;
+ in6_addr_t fd_local_netmask;
+ in6_addr_t fd_remote_addr;
+ in6_addr_t fd_remote_netmask;
+ in_port_t fd_local_port;
+ in_port_t fd_remote_port;
+ uint8_t fd_dsfield;
+ uint8_t fd_dsfield_mask;
+} flow_desc_t;
+
+#define MRP_NCPUS 128
+
+/*
+ * In MCM_CPUS mode, CPU bindings are user-specified. In MCM_FANOUT mode,
+ * the user only specifies a fanout count.
+ * mc_fanout_cnt gives the number of CPUs used for fanout soft rings.
+ * mc_fanout_cpus[] array stores the CPUs used for fanout soft rings.
+ */
+typedef enum {
+ MCM_FANOUT = 1,
+ MCM_CPUS
+} mac_cpu_mode_t;
+
+typedef struct mac_cpus_props_s {
+ uint32_t mc_ncpus; /* num of cpus */
+ uint32_t mc_cpus[MRP_NCPUS]; /* cpu list */
+ uint32_t mc_fanout_cnt; /* soft ring cpu cnt */
+ uint32_t mc_fanout_cpus[MRP_NCPUS]; /* SR cpu list */
+ uint32_t mc_pollid; /* poll thr binding */
+ uint32_t mc_workerid; /* worker thr binding */
+ /*
+ * interrupt cpu: mrp_intr_cpu less than 0 implies a platform limitation
+ * in retargeting the interrupt assignment.
+ */
+ int32_t mc_intr_cpu;
+ mac_cpu_mode_t mc_fanout_mode; /* fanout mode */
+} mac_cpus_t;
+
+/* Priority values */
+typedef enum {
+ MPL_LOW,
+ MPL_MEDIUM,
+ MPL_HIGH,
+ MPL_RESET
+} mac_priority_level_t;
+
+/* The default priority for links */
+#define MPL_LINK_DEFAULT MPL_HIGH
+
+/* The default priority for flows */
+#define MPL_SUBFLOW_DEFAULT MPL_MEDIUM
+
+#define MRP_MAXBW 0x00000001 /* Limit set */
+#define MRP_CPUS 0x00000002 /* CPU/fanout set */
+#define MRP_CPUS_USERSPEC 0x00000004 /* CPU/fanout from user */
+#define MRP_PRIORITY 0x00000008 /* Priority set */
+
+#define MRP_THROTTLE MRP_MAXBW
+
+/* 3 levels - low, medium, high */
+#define MRP_PRIORITY_LEVELS 3
+
+/* Special value denoting no bandwidth control */
+#define MRP_MAXBW_RESETVAL -1ULL
+
+/*
+ * Until a sub-megabit limit is implemented,
+ * reject values lower than 1 MTU per tick (1.2 Mbps)
+ */
+#define MRP_MAXBW_MINVAL 1200000
+
+typedef struct mac_resource_props_s {
+ /*
+ * Bit-mask for the network resource control types
+ */
+ uint32_t mrp_mask;
+ uint64_t mrp_maxbw; /* bandwidth limit in bps */
+ mac_priority_level_t mrp_priority; /* relative flow priority */
+ mac_cpus_t mrp_cpus;
+} mac_resource_props_t;
+
+#define mrp_ncpus mrp_cpus.mc_ncpus
+#define mrp_cpu mrp_cpus.mc_cpus
+#define mrp_fanout_cnt mrp_cpus.mc_fanout_cnt
+#define mrp_fanout_cpu mrp_cpus.mc_fanout_cpus
+#define mrp_pollid mrp_cpus.mc_pollid
+#define mrp_workerid mrp_cpus.mc_workerid
+#define mrp_intr_cpu mrp_cpus.mc_intr_cpu
+#define mrp_fanout_mode mrp_cpus.mc_fanout_mode
+
+#define MAC_COPY_CPUS(mrp, fmrp) { \
+ int ncpus; \
+ (fmrp)->mrp_ncpus = (mrp)->mrp_ncpus; \
+ (fmrp)->mrp_intr_cpu = (mrp)->mrp_intr_cpu; \
+ (fmrp)->mrp_fanout_mode = (mrp)->mrp_fanout_mode; \
+ if ((mrp)->mrp_ncpus == 0) { \
+ (fmrp)->mrp_mask &= ~MRP_CPUS; \
+ (fmrp)->mrp_mask &= ~MRP_CPUS_USERSPEC; \
+ } else { \
+ for (ncpus = 0; ncpus < (fmrp)->mrp_ncpus; ncpus++) \
+ (fmrp)->mrp_cpu[ncpus] = (mrp)->mrp_cpu[ncpus];\
+ (fmrp)->mrp_mask |= MRP_CPUS; \
+ if ((mrp)->mrp_mask & MRP_CPUS_USERSPEC) \
+ (fmrp)->mrp_mask |= MRP_CPUS_USERSPEC; \
+ } \
+}
+
+typedef struct flow_stats_s {
+ uint64_t fs_rbytes;
+ uint64_t fs_ipackets;
+ uint64_t fs_ierrors;
+ uint64_t fs_obytes;
+ uint64_t fs_opackets;
+ uint64_t fs_oerrors;
+} flow_stats_t;
+
+typedef enum {
+ FLOW_STAT_RBYTES,
+ FLOW_STAT_IPACKETS,
+ FLOW_STAT_IERRORS,
+ FLOW_STAT_OBYTES,
+ FLOW_STAT_OPACKETS,
+ FLOW_STAT_OERRORS
+} flow_stat_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MAC_FLOW_H */
diff --git a/usr/src/uts/common/sys/mac_flow_impl.h b/usr/src/uts/common/sys/mac_flow_impl.h
new file mode 100644
index 0000000000..6029873930
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_flow_impl.h
@@ -0,0 +1,537 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _MAC_FLOW_IMPL_H +#define _MAC_FLOW_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/param.h> +#include <sys/atomic.h> +#include <sys/ksynch.h> +#include <sys/mac_flow.h> +#include <sys/stream.h> +#include <sys/sdt.h> +#include <net/if.h> + +/* + * Macros to increment/decrement the reference count on a flow_entry_t. + */ +#define FLOW_REFHOLD(flent) { \ + DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ + mutex_enter(&(flent)->fe_lock); \ + (flent)->fe_refcnt++; \ + mutex_exit(&(flent)->fe_lock); \ +} + +/* + * Data paths must not attempt to use a flow entry if it is marked INCIPIENT + * or QUIESCE. In the former case the set up is not yet complete and the + * data path could stumble on inconsistent data structures. In the latter + * case a control operation is waiting for quiescence so that it can + * change callbacks or other structures without the use of locks. + */ +#define FLOW_TRY_REFHOLD(flent, err) { \ + DTRACE_PROBE1(flow_refhold, flow_entry_t *, (flent)); \ + (err) = 0; \ + mutex_enter(&(flent)->fe_lock); \ + if ((flent)->fe_flags & (FE_INCIPIENT | FE_QUIESCE | FE_CONDEMNED | \ + FE_UF_NO_DATAPATH | FE_MC_NO_DATAPATH)) \ + (err) = -1; \ + else \ + (flent)->fe_refcnt++; \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLOW_REFRELE(flent) { \ + DTRACE_PROBE1(flow_refrele, flow_entry_t *, (flent)); \ + mutex_enter(&(flent)->fe_lock); \ + ASSERT((flent)->fe_refcnt != 0); \ + (flent)->fe_refcnt--; \ + if ((flent)->fe_flags & FE_WAITER) { \ + ASSERT((flent)->fe_refcnt != 0); \ + cv_signal(&(flent)->fe_cv); \ + mutex_exit(&(flent)->fe_lock); \ + } else if ((flent)->fe_refcnt == 0) { \ + mac_flow_destroy(flent); \ + } else { \ + mutex_exit(&(flent)->fe_lock); \ + } \ +} + +#define FLOW_USER_REFHOLD(flent) { \ + mutex_enter(&(flent)->fe_lock); \ + (flent)->fe_user_refcnt++; \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLOW_USER_REFRELE(flent) { \ + mutex_enter(&(flent)->fe_lock); \ + ASSERT((flent)->fe_user_refcnt != 0); \ + if (--(flent)->fe_user_refcnt == 0 && \ + ((flent)->fe_flags & FE_WAITER)) \ + cv_signal(&(flent)->fe_cv); \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLOW_FINAL_REFRELE(flent) { \ + ASSERT(flent->fe_refcnt == 1 && flent->fe_user_refcnt == 0); \ + FLOW_REFRELE(flent); \ +} + +/* + * Mark or unmark the flent with a bit flag + */ +#define FLOW_MARK(flent, flag) { \ + mutex_enter(&(flent)->fe_lock); \ + (flent)->fe_flags |= flag; \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLOW_UNMARK(flent, flag) { \ + mutex_enter(&(flent)->fe_lock); \ + (flent)->fe_flags &= ~flag; \ + mutex_exit(&(flent)->fe_lock); \ +} + +#define FLENT_TO_MIP(flent) \ + (flent->fe_mbg != NULL ? mac_bcast_grp_mip(flent->fe_mbg) : \ + ((mac_client_impl_t *)flent->fe_mcip)->mci_mip) + +/* Convert a bandwidth expressed in bps to a number of bytes per tick. */ +#define FLOW_BYTES_PER_TICK(bps) (((bps) >> 3) / hz) + +/* + * Given an underlying range and a priority level, obtain the minimum for the + * new range. 
+ */
+#define FLOW_MIN_PRIORITY(min, max, pri) \
+ ((min) + ((((max) - (min)) / MRP_PRIORITY_LEVELS) * (pri)))
+
+/*
+ * Given an underlying range and a minimum level (base), obtain the maximum
+ * for the new range.
+ */
+#define FLOW_MAX_PRIORITY(min, max, base) \
+ ((base) + (((max) - (min)) / MRP_PRIORITY_LEVELS))
+
+/*
+ * Given an underlying range and a priority level, get the absolute
+ * priority value. For now there are just 3 values, high, low and
+ * medium so we can just return max, min or min + (max - min) / 2.
+ * If there are more than three we need to change this computation.
+ */
+#define FLOW_PRIORITY(min, max, pri) \
+ (pri) == MPL_HIGH ? (max) : \
+ (pri) == MPL_LOW ? (min) : \
+ ((min) + (((max) - (min)) / 2))
+
+#define MAC_FLOW_TAB_SIZE 500
+
+typedef struct flow_entry_s flow_entry_t;
+typedef struct flow_tab_s flow_tab_t;
+typedef struct flow_state_s flow_state_t;
+struct mac_impl_s;
+struct mac_client_impl_s;
+
+/*
+ * Classification flags used to lookup the flow.
+ */
+#define FLOW_INBOUND 0x01
+#define FLOW_OUTBOUND 0x02
+/* Don't compare VID when classifying the packets, see mac_rx_classify() */
+#define FLOW_IGNORE_VLAN 0x04
+
+/* Generic flow client function signature */
+typedef void (*flow_fn_t)(void *, void *, mblk_t *, boolean_t);
+
+/* Flow state */
+typedef enum {
+ FLOW_DRIVER_UPCALL,
+ FLOW_USER_REF
+} mac_flow_state_t;
+
+/* Matches a flow_entry_t using the extracted flow_state_t info */
+typedef boolean_t (*flow_match_fn_t)(flow_tab_t *, flow_entry_t *,
+ flow_state_t *);
+
+/* fe_flags */
+#define FE_QUIESCE 0x01 /* Quiesce the flow */
+#define FE_WAITER 0x02 /* Flow has a waiter */
+#define FE_FLOW_TAB 0x04 /* Flow is in the flow tab list */
+#define FE_G_FLOW_HASH 0x08 /* Flow is in the global flow hash */
+#define FE_INCIPIENT 0x10 /* Being setup */
+#define FE_CONDEMNED 0x20 /* Being deleted */
+#define FE_UF_NO_DATAPATH 0x40 /* No datapath setup for User flow */
+#define FE_MC_NO_DATAPATH 0x80 /* No datapath setup for mac client */
+
+/* fe_type */
+#define FLOW_PRIMARY_MAC 0x01 /* NIC primary MAC address */
+#define FLOW_VNIC_MAC 0x02 /* VNIC flow */
+#define FLOW_MCAST 0x04 /* Multicast (and broadcast) */
+#define FLOW_OTHER 0x08 /* Other flows configured */
+#define FLOW_USER 0x10 /* User defined flow */
+#define FLOW_VNIC FLOW_VNIC_MAC
+#define FLOW_NO_STATS 0x20 /* Don't create stats for the flow */
+
+/*
+ * Bandwidth control counters shared between the soft ring set and its
+ * associated soft rings. In case the flow associated with the NIC/VNIC
+ * has a group of Rx rings assigned to it, we have as many
+ * soft ring sets as there are Rx rings in the group,
+ * and each individual SRS (and its soft rings) decides when to
+ * poll its Rx ring independently. But if there is a B/W limit
+ * associated with the NIC/VNIC, then the B/W control counter is
+ * shared across all the SRSes in the group and their associated
+ * soft rings.
+ *
+ * There is a many-to-1 mapping between the SRS and
+ * mac_bw_ctl if the flow has a group of Rx rings associated with
+ * it.
+ */
+typedef struct mac_bw_ctl_s {
+ kmutex_t mac_bw_lock;
+ uint32_t mac_bw_state;
+ size_t mac_bw_sz; /* ??
Is it needed */
+ size_t mac_bw_limit; /* Max bytes to process per tick */
+ size_t mac_bw_used; /* Bytes processed in current tick */
+ size_t mac_bw_drop_threshold; /* Max queue length */
+ size_t mac_bw_drop_bytes;
+ size_t mac_bw_polled;
+ size_t mac_bw_intr;
+ clock_t mac_bw_curr_time;
+} mac_bw_ctl_t;
+
+struct flow_entry_s { /* Protected by */
+ struct flow_entry_s *fe_next; /* ft_lock */
+
+ datalink_id_t fe_link_id; /* WO */
+
+ /* Properties as specified for this flow */
+ mac_resource_props_t fe_resource_props; /* SL */
+
+ /* Properties actually effective at run time for this flow */
+ mac_resource_props_t fe_effective_props; /* SL */
+
+ kmutex_t fe_lock;
+ char fe_flow_name[MAXFLOWNAME]; /* fe_lock */
+ flow_desc_t fe_flow_desc; /* fe_lock */
+ kcondvar_t fe_cv; /* fe_lock */
+ /*
+ * Initial flow ref is 1 on creation. A thread that looks up the
+ * flent, typically via mac_flow_lookup(), dynamically holds a ref.
+ * If the ref is 1, it means there aren't any upcalls from the driver
+ * or downcalls from the stack using this flent. Structures pointing
+ * to the flent or flent inserted in lists don't count towards this
+ * refcnt. Instead they are tracked using fe_flags. Only a control
+ * thread doing a teardown operation deletes the flent, after waiting
+ * for upcalls to finish synchronously. The fe_refcnt tracks
+ * the number of upcall refs.
+ */
+ uint32_t fe_refcnt; /* fe_lock */
+
+ /*
+ * This tracks lookups done using the global hash list for user
+ * generated flows. This refcnt only protects the flent itself
+ * from disappearing and helps walkers to read the flent info such
+ * as flow spec. However the flent may be quiesced and the SRS could
+ * be deleted. The fe_user_refcnt tracks the number of global flow
+ * hash refs.
+ */
+ uint32_t fe_user_refcnt; /* fe_lock */
+ uint_t fe_flags; /* fe_lock */
+
+ /*
+ * Function/args to invoke for delivering matching packets.
+ * Only the function fe_cb_fn may be changed dynamically and atomically.
+ * The fe_cb_arg1 and fe_cb_arg2 are set at creation time and may not
+ * be changed.
+ */
+ flow_fn_t fe_cb_fn; /* fe_lock */
+ void *fe_cb_arg1; /* fe_lock */
+ void *fe_cb_arg2; /* fe_lock */
+
+ void *fe_client_cookie; /* WO */
+ void *fe_rx_ring_group; /* SL */
+ void *fe_rx_srs[MAX_RINGS_PER_GROUP]; /* fe_lock */
+ int fe_rx_srs_cnt; /* fe_lock */
+ void *fe_tx_srs; /* WO */
+
+ /*
+ * This is a unicast flow, and is a mac_client_impl_t
+ */
+ void *fe_mcip; /* WO */
+
+ /*
+ * Used by mci_flent_list of mac_client_impl_t to track flows sharing
+ * the same mac_client_impl_t.
+ */
+ struct flow_entry_s *fe_client_next;
+
+ /*
+ * This is a broadcast or multicast flow and is a mac_bcast_grp_t
+ */
+ void *fe_mbg; /* WO */
+ uint_t fe_type; /* WO */
+
+ /*
+ * BW control info.
+ */
+ mac_bw_ctl_t fe_tx_bw;
+ mac_bw_ctl_t fe_rx_bw;
+
+ /*
+ * Used by flow table lookup code
+ */
+ flow_match_fn_t fe_match;
+
+ /*
+ * Used by mac_flow_remove().
+ */
+ int fe_index;
+ flow_tab_t *fe_flow_tab;
+
+ kstat_t *fe_ksp;
+ flow_stats_t fe_flowstats;
+ boolean_t fe_desc_logged;
+ zoneid_t fe_zoneid;
+ uint64_t fe_nic_speed;
+};
+
+/*
+ * Various structures used by the flows framework for keeping track
+ * of packet state information.
+ */
+
+/* Layer 2 */
+typedef struct flow_l2info_s {
+ uchar_t *l2_start;
+ uint8_t *l2_daddr;
+ uint16_t l2_vid;
+ uint32_t l2_sap;
+ uint_t l2_hdrsize;
+} flow_l2info_t;
+
+/* Layer 3 */
+typedef struct flow_l3info_s {
+ uchar_t *l3_start;
+ uint8_t l3_protocol;
+ uint8_t l3_version;
+ boolean_t l3_dst_or_src;
+ uint_t l3_hdrsize;
+ boolean_t l3_fragmented;
+} flow_l3info_t;
+
+/* Layer 4 */
+typedef struct flow_l4info_s {
+ uchar_t *l4_start;
+ uint16_t l4_src_port;
+ uint16_t l4_dst_port;
+ uint16_t l4_hash_port;
+} flow_l4info_t;
+
+/*
+ * Combined state structure.
+ * Holds flow direction and an mblk_t pointer.
+ */
+struct flow_state_s {
+ uint_t fs_flags;
+ mblk_t *fs_mp;
+ flow_l2info_t fs_l2info;
+ flow_l3info_t fs_l3info;
+ flow_l4info_t fs_l4info;
+};
+
+/*
+ * Flow ops vector.
+ * There are two groups of functions. The ones ending with _fe are
+ * called when a flow is being added. The others (hash, accept) are
+ * called at flow lookup time.
+ */
+#define FLOW_MAX_ACCEPT 16
+typedef struct flow_ops_s {
+ /*
+ * fo_accept_fe():
+ * Validates the contents of the flow and checks whether
+ * it's compatible with the flow table. Sets the fe_match
+ * function of the flow.
+ */
+ int (*fo_accept_fe)(flow_tab_t *, flow_entry_t *);
+ /*
+ * fo_hash_fe():
+ * Generates a hash index to the flow table. This function
+ * must use the same algorithm as fo_hash(), which is used
+ * by the flow lookup code path.
+ */
+ uint32_t (*fo_hash_fe)(flow_tab_t *, flow_entry_t *);
+ /*
+ * fo_match_fe():
+ * This is used for finding identical flows.
+ */
+ boolean_t (*fo_match_fe)(flow_tab_t *, flow_entry_t *,
+ flow_entry_t *);
+ /*
+ * fo_insert_fe():
+ * Used for inserting a flow into a flow chain.
+ * Protocols that have special ordering requirements would
+ * need to implement this. For those that don't,
+ * flow_generic_insert_fe() may be used.
+ */
+ int (*fo_insert_fe)(flow_tab_t *, flow_entry_t **,
+ flow_entry_t *);
+
+ /*
+ * Calculates the flow hash index based on the accumulated
+ * state in flow_state_t. Must use the same algorithm as
+ * fo_hash_fe().
+ */
+ uint32_t (*fo_hash)(flow_tab_t *, flow_state_t *);
+
+ /*
+ * Array of accept functions.
+ * Each function in the array will accumulate enough state
+ * (header length, protocol) to allow the next function to
+ * proceed. We support up to FLOW_MAX_ACCEPT functions which
+ * should be sufficient for all practical purposes.
+ */
+ int (*fo_accept[FLOW_MAX_ACCEPT])(flow_tab_t *,
+ flow_state_t *);
+} flow_ops_t;
+
+/*
+ * Generic flow table.
+ */
+struct flow_tab_s {
+ krwlock_t ft_lock;
+ /*
+ * Contains a list of functions (described above)
+ * specific to this table type.
+ */
+ flow_ops_t ft_ops;
+
+ /*
+ * Indicates what types of flows are supported.
+ */
+ flow_mask_t ft_mask;
+
+ /*
+ * An array of flow_entry_t * of size ft_size.
+ * Each element is the beginning of a hash chain.
+ */
+ flow_entry_t **ft_table;
+ uint_t ft_size;
+
+ /*
+ * The number of flows inserted into ft_table.
+ */
+ uint_t ft_flow_count;
+ struct mac_impl_s *ft_mip;
+ struct mac_client_impl_s *ft_mcip;
+};
+
+/*
+ * This is used for describing what type of flow table can be created.
+ * mac_flow.c contains a list of these structures.
+ */
+typedef struct flow_tab_info_s {
+ flow_ops_t *fti_ops;
+ flow_mask_t fti_mask;
+ uint_t fti_size;
+} flow_tab_info_t;
+
+#define FLOW_TAB_EMPTY(ft) ((ft) == NULL || (ft)->ft_flow_count == 0)
+
+/*
+ * This is used by mac_tx_send.
+ */ +typedef struct mac_tx_stats_s { + uint_t ts_opackets; + uint_t ts_obytes; + uint_t ts_oerrors; +} mac_tx_stats_t; + +#define FLOW_STAT_UPDATE(f, s, c) { \ + ((flow_entry_t *)(f))->fe_flowstats.fs_##s += ((uint64_t)(c)); \ +} + +#define FLOW_TX_STATS_UPDATE(f, s) { \ + FLOW_STAT_UPDATE((f), opackets, (s)->ts_opackets); \ + FLOW_STAT_UPDATE((f), obytes, (s)->ts_obytes); \ + FLOW_STAT_UPDATE((f), oerrors, (s)->ts_oerrors); \ +} + +extern void mac_flow_init(); +extern void mac_flow_fini(); +extern int mac_flow_create(flow_desc_t *, mac_resource_props_t *, + char *, void *, uint_t, flow_entry_t **); + +extern int mac_flow_add(flow_tab_t *, flow_entry_t *); +extern int mac_flow_add_subflow(mac_client_handle_t, flow_entry_t *, + boolean_t); +extern int mac_flow_hash_add(flow_entry_t *); +extern int mac_flow_lookup_byname(char *, flow_entry_t **); +extern int mac_flow_lookup(flow_tab_t *, mblk_t *, uint_t, + flow_entry_t **); + +extern int mac_flow_walk(flow_tab_t *, int (*)(flow_entry_t *, void *), + void *); + +extern int mac_flow_walk_nolock(flow_tab_t *, + int (*)(flow_entry_t *, void *), void *); + +extern void mac_flow_modify(flow_tab_t *, flow_entry_t *, + mac_resource_props_t *); + +extern void *mac_flow_get_client_cookie(flow_entry_t *); + +extern uint32_t mac_flow_modify_props(flow_entry_t *, mac_resource_props_t *); + +extern int mac_flow_update(flow_tab_t *, flow_entry_t *, flow_desc_t *); +extern void mac_flow_get_desc(flow_entry_t *, flow_desc_t *); +extern void mac_flow_set_desc(flow_entry_t *, flow_desc_t *); + +extern void mac_flow_remove(flow_tab_t *, flow_entry_t *, boolean_t); +extern void mac_flow_hash_remove(flow_entry_t *); +extern void mac_flow_wait(flow_entry_t *, mac_flow_state_t); +extern void mac_flow_quiesce(flow_entry_t *); +extern void mac_flow_restart(flow_entry_t *); +extern void mac_flow_cleanup(flow_entry_t *); +extern void mac_flow_destroy(flow_entry_t *); + +extern void mac_flow_tab_create(flow_ops_t *, flow_mask_t, uint_t, + struct mac_impl_s *, flow_tab_t **); +extern void mac_flow_l2tab_create(struct mac_impl_s *, flow_tab_t **); +extern void mac_flow_tab_destroy(flow_tab_t *); +extern void mac_flow_drop(void *, void *, mblk_t *); +extern void flow_stat_destroy(flow_entry_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _MAC_FLOW_IMPL_H */ diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 6b36a978f0..9c8bfb7ce9 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -26,23 +26,17 @@ #ifndef _SYS_MAC_IMPL_H #define _SYS_MAC_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/mac.h> +#include <sys/modhash.h> +#include <sys/mac_client.h> +#include <sys/mac_provider.h> #include <net/if.h> +#include <sys/mac_flow_impl.h> +#include <netinet/ip6.h> #ifdef __cplusplus extern "C" { #endif -typedef struct mac_multicst_addr_s mac_multicst_addr_t; - -struct mac_multicst_addr_s { - mac_multicst_addr_t *mma_nextp; - uint_t mma_ref; - uint8_t mma_addr[MAXMACADDRLEN]; -}; - typedef struct mac_margin_req_s mac_margin_req_t; struct mac_margin_req_s { @@ -51,31 +45,85 @@ struct mac_margin_req_s { uint32_t mmr_margin; }; -typedef struct mac_notify_fn_s mac_notify_fn_t; +/* Generic linked chain type */ +typedef struct mac_chain_s { + struct mac_chain_s *next; + void *item; +} mac_chain_t; -struct mac_notify_fn_s { - mac_notify_fn_t *mnf_nextp; - mac_notify_t mnf_fn; - void *mnf_arg; -}; +/* + * Generic mac callback list manipulation structures and macros. 
The mac_cb_t + * represents a general callback list element embedded in a particular + * data structure such as a mac_notify_cb_t or a mac_promisc_impl_t. + * The mac_cb_info_t represents general information about list walkers. + * Please see the comments above mac_callback_add for more information. + */ +/* mcb_flags */ +#define MCB_CONDEMNED 0x1 /* Logically deleted */ +#define MCB_NOTIFY_CB_T 0x2 +#define MCB_TX_NOTIFY_CB_T 0x4 + +typedef struct mac_cb_s { + struct mac_cb_s *mcb_nextp; /* Linked list of callbacks */ + void *mcb_objp; /* Ptr to enclosing object */ + size_t mcb_objsize; /* Sizeof the enclosing obj */ + uint_t mcb_flags; +} mac_cb_t; + +typedef struct mac_cb_info_s { + kmutex_t *mcbi_lockp; + kcondvar_t mcbi_cv; + uint_t mcbi_del_cnt; /* Deleted callback cnt */ + uint_t mcbi_walker_cnt; /* List walker count */ +} mac_cb_info_t; + +typedef struct mac_notify_cb_s { + mac_cb_t mncb_link; /* Linked list of callbacks */ + mac_notify_t mncb_fn; /* callback function */ + void *mncb_arg; /* callback argument */ + struct mac_impl_s *mncb_mip; +} mac_notify_cb_t; -typedef struct mac_rx_fn_s mac_rx_fn_t; +/* + * mac_callback_add(listinfo, listhead, listelement) + * mac_callback_remove(listinfo, listhead, listelement) + */ +typedef boolean_t (*mcb_func_t)(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -struct mac_rx_fn_s { - mac_rx_fn_t *mrf_nextp; - mac_rx_t mrf_fn; - void *mrf_arg; - boolean_t mrf_inuse; - boolean_t mrf_active; -}; +#define MAC_CALLBACK_WALKER_INC(mcbi) { \ + mutex_enter((mcbi)->mcbi_lockp); \ + (mcbi)->mcbi_walker_cnt++; \ + mutex_exit((mcbi)->mcbi_lockp); \ +} -typedef struct mac_txloop_fn_s mac_txloop_fn_t; +#define MAC_CALLBACK_WALKER_INC_HELD(mcbi) (mcbi)->mcbi_walker_cnt++; -struct mac_txloop_fn_s { - mac_txloop_fn_t *mtf_nextp; - mac_txloop_t mtf_fn; - void *mtf_arg; -}; +#define MAC_CALLBACK_WALKER_DCR(mcbi, headp) { \ + mac_cb_t *rmlist; \ + \ + mutex_enter((mcbi)->mcbi_lockp); \ + if (--(mcbi)->mcbi_walker_cnt == 0 && (mcbi)->mcbi_del_cnt != 0) { \ + rmlist = mac_callback_walker_cleanup((mcbi), headp); \ + mac_callback_free(rmlist); \ + cv_broadcast(&(mcbi)->mcbi_cv); \ + } \ + mutex_exit((mcbi)->mcbi_lockp); \ +} + +#define MAC_PROMISC_WALKER_INC(mip) \ + MAC_CALLBACK_WALKER_INC(&(mip)->mi_promisc_cb_info) + +#define MAC_PROMISC_WALKER_DCR(mip) { \ + mac_cb_info_t *mcbi; \ + \ + mcbi = &(mip)->mi_promisc_cb_info; \ + mutex_enter(mcbi->mcbi_lockp); \ + if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) { \ + i_mac_promisc_walker_cleanup(mip); \ + cv_broadcast(&mcbi->mcbi_cv); \ + } \ + mutex_exit(mcbi->mcbi_lockp); \ +} typedef struct mactype_s { const char *mt_ident; @@ -91,118 +139,354 @@ typedef struct mactype_s { size_t mt_mappingcount; } mactype_t; +/* + * Multiple rings implementation. + */ +typedef enum { + MAC_GROUP_STATE_UNINIT = 0, /* initial state of data structure */ + MAC_GROUP_STATE_REGISTERED, /* hooked with h/w group */ + MAC_GROUP_STATE_RESERVED, /* group is reserved and opened */ + MAC_GROUP_STATE_SHARED /* default group shared among */ + /* multiple mac clients */ +} mac_group_state_t; + +typedef struct mac_ring_s mac_ring_t; +typedef struct mac_group_s mac_group_t; + +/* + * Ring data structure for ring control and management. 
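+ *
+ * A ring is reference-held while in use; see the MR_REFHOLD_LOCKED()
+ * and MR_REFRELE() macros below. The following sketch is illustrative
+ * only (the calling context is hypothetical):
+ *
+ *	mutex_enter(&ring->mr_lock);
+ *	MR_REFHOLD_LOCKED(ring);
+ *	mutex_exit(&ring->mr_lock);
+ *	... use the ring ...
+ *	MR_REFRELE(ring);	(signals a quiesce/teardown waiter on
+ *				last release if the ring is marked
+ *				MR_CONDEMNED or MR_QUIESCE)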
+ */ +typedef enum { + MR_FREE, /* Available for assignment to flows */ + MR_NEWLY_ADDED, /* Just assigned to another group */ + MR_INUSE /* Assigned to an SRS */ +} mac_ring_state_t; + +/* mr_flag values */ +#define MR_INCIPIENT 0x1 +#define MR_CONDEMNED 0x2 +#define MR_QUIESCE 0x4 + +struct mac_ring_s { + int mr_index; /* index in the original list */ + mac_ring_type_t mr_type; /* ring type */ + mac_ring_t *mr_next; /* next ring in the chain */ + mac_group_handle_t mr_gh; /* reference to group */ + + mac_classify_type_t mr_classify_type; /* HW vs SW */ + struct mac_soft_ring_set_s *mr_srs; /* associated SRS */ + uint_t mr_refcnt; /* Ring references */ + /* ring generation no. to guard against drivers using stale rings */ + uint64_t mr_gen_num; + + kmutex_t mr_lock; + kcondvar_t mr_cv; /* mr_lock */ + mac_ring_state_t mr_state; /* mr_lock */ + uint_t mr_flag; /* mr_lock */ + + mac_ring_info_t mr_info; /* driver supplied info */ +}; +#define mr_driver mr_info.mri_driver +#define mr_start mr_info.mri_start +#define mr_stop mr_info.mri_stop + +#define MAC_RING_MARK(mr, flag) \ + (mr)->mr_flag |= flag; -#define MAC_VNIC_TXINFO_REFHOLD(mvt) { \ - mutex_enter(&(mvt)->mv_lock); \ - (mvt)->mv_refs++; \ - mutex_exit(&(mvt)->mv_lock); \ +#define MAC_RING_UNMARK(mr, flag) \ + (mr)->mr_flag &= ~flag; + +/* + * Reference hold and release on mac_ring_t 'mr' + */ +#define MR_REFHOLD_LOCKED(mr) { \ + ASSERT(MUTEX_HELD(&mr->mr_lock)); \ + (mr)->mr_refcnt++; \ } -#define MAC_VNIC_TXINFO_REFRELE(mvt) { \ - mutex_enter(&(mvt)->mv_lock); \ - if (--(mvt)->mv_refs == 0 && (mvt)->mv_clearing) { \ - (mvt)->mv_clearing = B_FALSE; \ - cv_signal(&(mvt)->mv_cv); \ - } \ - mutex_exit(&(mvt)->mv_lock); \ +#define MR_REFRELE(mr) { \ + mutex_enter(&(mr)->mr_lock); \ + ASSERT((mr)->mr_refcnt != 0); \ + (mr)->mr_refcnt--; \ + if ((mr)->mr_refcnt == 0 && \ + ((mr)->mr_flag & (MR_CONDEMNED | MR_QUIESCE))) \ + cv_signal(&(mr)->mr_cv); \ + mutex_exit(&(mr)->mr_lock); \ } -typedef struct mac_vnic_tx_s { - mac_txinfo_t mv_txinfo; /* provided by VNIC */ - uint32_t mv_refs; - kmutex_t mv_lock; - kcondvar_t mv_cv; - boolean_t mv_clearing; -} mac_vnic_tx_t; +/* + * Per mac client flow information associated with a RX group. + * The entire structure is SL protected. + */ +typedef struct mac_grp_client { + struct mac_grp_client *mgc_next; + struct mac_client_impl_s *mgc_client; +} mac_grp_client_t; + +#define MAC_RX_GROUP_NO_CLIENT(g) ((g)->mrg_clients == NULL) +#define MAC_RX_GROUP_ONLY_CLIENT(g) \ + ((((g)->mrg_clients != NULL) && \ + ((g)->mrg_clients->mgc_next == NULL)) ? \ + (g)->mrg_clients->mgc_client : NULL) /* - * Each registered MAC is associated with a mac_t structure. + * Common ring group data structure for ring control and management. 
+ * The entire structure is SL protected
+ */
-typedef struct mac_impl_s {
+struct mac_group_s {
+ int mrg_index; /* index in the list */
+ mac_ring_type_t mrg_type; /* ring type */
+ mac_group_state_t mrg_state; /* state of the group */
+ mac_group_t *mrg_next; /* next ring in the chain */
+ mac_handle_t mrg_mh; /* reference to MAC */
+ mac_ring_t *mrg_rings; /* grouped rings */
+ uint_t mrg_cur_count; /* actual size of group */
+
+ mac_grp_client_t *mrg_clients; /* clients list */
+
+ struct mac_client_impl_s *mrg_tx_client; /* TX client pointer */
+ mac_group_info_t mrg_info; /* driver supplied info */
+};
+
+#define mrg_driver mrg_info.mgi_driver
+#define mrg_start mrg_info.mgi_start
+#define mrg_stop mrg_info.mgi_stop
+
+#define GROUP_INTR_HANDLE(g) (g)->mrg_info.mgi_intr.mi_handle
+#define GROUP_INTR_ENABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_enable
+#define GROUP_INTR_DISABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_disable
+
+#define MAC_DEFAULT_GROUP(mh) (((mac_impl_t *)mh)->mi_rx_groups)
+
+#define MAC_RING_TX_DEFAULT(mip, mp) \
+ ((mip->mi_default_tx_ring == NULL) ? \
+ mip->mi_tx(mip->mi_driver, mp) : \
+ mac_ring_tx(mip->mi_default_tx_ring, mp))
+
+#define MAC_TX(mip, ring, mp, mcip) { \
+ /* \
+ * If the MAC client has a bound Hybrid I/O share, \
+ * send the packet through the default tx ring, since \
+ * the tx rings of this client are now mapped in the \
+ * guest domain and not accessible from this domain. \
+ */ \
+ if (mcip->mci_share_bound || (ring == NULL)) \
+ mp = MAC_RING_TX_DEFAULT(mip, mp); \
+ else \
+ mp = mac_ring_tx(ring, mp); \
+}
+
+/* mci_tx_flag */
+#define MCI_TX_QUIESCE 0x1
+
+typedef struct mac_factory_addr_s {
+ boolean_t mfa_in_use;
+ uint8_t mfa_addr[MAXMACADDRLEN];
+ struct mac_client_impl_s *mfa_client;
+} mac_factory_addr_t;
+
+typedef struct mac_mcast_addrs_s {
+ struct mac_mcast_addrs_s *mma_next;
+ uint8_t mma_addr[MAXMACADDRLEN];
+ int mma_ref;
+} mac_mcast_addrs_t;
+
+typedef enum {
+ MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1, /* hardware steering */
+ MAC_ADDRESS_TYPE_UNICAST_PROMISC /* promiscuous mode */
+} mac_address_type_t;
+
+typedef struct mac_impl_s mac_impl_t;
+
+typedef struct mac_address_s {
+ mac_address_type_t ma_type; /* address type */
+ int ma_nusers; /* number of users */
+ /* of that address */
+ struct mac_address_s *ma_next; /* next address */
+ uint8_t ma_addr[MAXMACADDRLEN]; /* address value */
+ size_t ma_len; /* address length */
+ mac_group_t *ma_group; /* associated group */
+ mac_impl_t *ma_mip; /* MAC handle */
+} mac_address_t;
+
+extern krwlock_t i_mac_impl_lock;
+extern mod_hash_t *i_mac_impl_hash;
+extern kmem_cache_t *i_mac_impl_cachep;
+extern uint_t i_mac_impl_count;
+
+/*
+ * Each registered MAC is associated with a mac_impl_t structure. The
+ * structure represents the underlying hardware, in terms of definition,
+ * resources (transmit, receive rings etc.), callback functions etc. It
+ * also holds the table of MAC clients that are configured on the device.
+ * The table is used for classifying incoming packets in software.
+ *
+ * The protection scheme uses two elements, a coarse serialization mechanism
+ * called the perimeter and a finer traditional lock based scheme. More details
+ * can be found in the big block comment in mac.c.
+ *
+ * The protection scheme for each member of the mac_impl_t is described below.
+ *
+ * Write Once Only (WO): Typically these don't change for the lifetime of the
+ * data structure.
For example something in mac_impl_t that stays the same + * from mac_register to mac_unregister, or something in a mac_client_impl_t + * that stays the same from mac_client_open to mac_client_close. + * + * Serializer (SL): Protected by the Serializer. All SLOP operations on a + * mac endpoint go through the serializer. MTOPs don't care about reading + * these fields atomically. + * + * Lock: Traditional mutex/rw lock. Modify operations still go through the + * mac serializer, the lock helps synchronize readers with writers. + */ +struct mac_impl_s { + krwlock_t mi_rw_lock; + char mi_name[LIFNAMSIZ]; /* WO */ + uint32_t mi_state_flags; + void *mi_driver; /* Driver private, WO */ + mac_info_t mi_info; /* WO */ + mactype_t *mi_type; /* WO */ + void *mi_pdata; /* WO */ + size_t mi_pdata_size; /* WO */ + mac_callbacks_t *mi_callbacks; /* WO */ + dev_info_t *mi_dip; /* WO */ + uint32_t mi_ref; /* i_mac_impl_lock */ + uint_t mi_active; /* SL */ + link_state_t mi_linkstate; /* none */ + link_state_t mi_lastlinkstate; /* none */ + uint_t mi_promisc; /* SL */ + uint_t mi_devpromisc; /* SL */ + kmutex_t mi_lock; + uint8_t mi_addr[MAXMACADDRLEN]; /* mi_rw_lock */ + uint8_t mi_dstaddr[MAXMACADDRLEN]; /* mi_rw_lock */ + /* - * The following fields are set in mac_register() and will not be - * changed until mac_unregister(). No lock is needed to access them. + * The mac perimeter. All client initiated create/modify operations + * on a mac end point go through this. */ - char mi_name[LIFNAMSIZ]; - void *mi_driver; /* Driver private data */ - mac_info_t mi_info; - mactype_t *mi_type; - void *mi_pdata; - size_t mi_pdata_size; - mac_callbacks_t *mi_callbacks; - dev_info_t *mi_dip; - minor_t mi_minor; - dev_t mi_phy_dev; - kstat_t *mi_ksp; - uint_t mi_kstat_count; - mac_txinfo_t mi_txinfo; - mac_txinfo_t mi_txloopinfo; - - krwlock_t mi_gen_lock; - uint32_t mi_oref; - uint32_t mi_ref; - boolean_t mi_disabled; - boolean_t mi_exclusive; - - krwlock_t mi_state_lock; - uint_t mi_active; - - krwlock_t mi_data_lock; - link_state_t mi_linkstate; - link_state_t mi_lastlinkstate; - uint_t mi_promisc; - uint_t mi_devpromisc; - uint8_t mi_addr[MAXMACADDRLEN]; - uint8_t mi_dstaddr[MAXMACADDRLEN]; - uint_t mi_sdu_min; - uint_t mi_sdu_max; - mac_multicst_addr_t *mi_mmap; - - krwlock_t mi_notify_lock; - uint32_t mi_notify_bits; - kmutex_t mi_notify_bits_lock; - kthread_t *mi_notify_thread; - mac_notify_fn_t *mi_mnfp; - kcondvar_t mi_notify_cv; - - krwlock_t mi_rx_lock; - mac_rx_fn_t *mi_mrfp; - krwlock_t mi_tx_lock; - mac_txloop_fn_t *mi_mtfp; - - krwlock_t mi_resource_lock; - mac_resource_add_t mi_resource_add; - void *mi_resource_add_arg; - - kmutex_t mi_activelink_lock; - boolean_t mi_activelink; - - uint32_t mi_rx_ref; /* #threads in mac_rx() */ - uint32_t mi_rx_removed; /* #callbacks marked */ - /* for removal */ - kmutex_t mi_lock; - kcondvar_t mi_rx_cv; - boolean_t mi_shareable; - boolean_t mi_vnic_present; - mac_vnic_tx_t *mi_vnic_tx; - mac_txinfo_t mi_vnic_txinfo; - mac_txinfo_t mi_vnic_txloopinfo; - mac_getcapab_t mi_vnic_getcapab_fn; - void *mi_vnic_getcapab_arg; - - boolean_t mi_legacy; - uint32_t mi_unsup_note; - uint32_t mi_margin; + kmutex_t mi_perim_lock; + kthread_t *mi_perim_owner; /* mi_perim_lock */ + uint_t mi_perim_ocnt; /* mi_perim_lock */ + kcondvar_t mi_perim_cv; /* mi_perim_lock */ + + /* mac notification callbacks */ + kmutex_t mi_notify_lock; + mac_cb_info_t mi_notify_cb_info; /* mi_notify_lock */ + mac_cb_t *mi_notify_cb_list; /* mi_notify_lock */ + kthread_t *mi_notify_thread; /* mi_notify_lock 
*/ + uint_t mi_notify_bits; /* mi_notify_lock */ + + uint32_t mi_v12n_level; /* Virt'ion readiness */ /* + * RX groups, ring capability + * Fields of this block are SL protected. + */ + mac_group_type_t mi_rx_group_type; /* grouping type */ + uint_t mi_rx_group_count; + mac_group_t *mi_rx_groups; + + mac_capab_rings_t mi_rx_rings_cap; + + /* + * TX groups and ring capability, SL Protected. + */ + mac_group_type_t mi_tx_group_type; /* grouping type */ + uint_t mi_tx_group_count; + uint_t mi_tx_group_free; + mac_group_t *mi_tx_groups; + + mac_capab_rings_t mi_tx_rings_cap; + + mac_ring_handle_t mi_default_tx_ring; + + /* + * MAC address list. SL protected. + */ + mac_address_t *mi_addresses; + + /* + * This MAC's table of sub-flows + */ + flow_tab_t *mi_flow_tab; /* WO */ + + kstat_t *mi_ksp; /* WO */ + uint_t mi_kstat_count; /* WO */ + uint_t mi_nactiveclients; /* SL */ + + /* for broadcast and multicast support */ + struct mac_mcast_addrs_s *mi_mcast_addrs; /* mi_rw_lock */ + struct mac_bcast_grp_s *mi_bcast_grp; /* mi_rw_lock */ + uint_t mi_bcast_ngrps; /* mi_rw_lock */ + + /* list of MAC clients which opened this MAC */ + struct mac_client_impl_s *mi_clients_list; /* mi_rw_lock */ + uint_t mi_nclients; /* mi_rw_lock */ + + uint32_t mi_margin; /* mi_rw_lock */ + uint_t mi_sdu_min; /* mi_rw_lock */ + uint_t mi_sdu_max; /* mi_rw_lock */ + + /* + * Cache of factory MAC addresses provided by the driver. If + * the driver doesn't provide multiple factory MAC addresses, + * the mi_factory_addr is set to NULL, and mi_factory_addr_num + * is set to zero. + */ + mac_factory_addr_t *mi_factory_addr; /* mi_rw_lock */ + uint_t mi_factory_addr_num; /* mi_rw_lock */ + + /* for promiscuous mode support */ + kmutex_t mi_promisc_lock; + mac_cb_t *mi_promisc_list; /* mi_promisc_lock */ + mac_cb_info_t mi_promisc_cb_info; /* mi_promisc_lock */ + + /* cache of rings over this mac_impl */ + kmutex_t mi_ring_lock; + mac_ring_t *mi_ring_freelist; /* mi_ring_lock */ + + /* + * These are used for caching the properties, if any, for the + * primary MAC client. If the MAC client is not yet in place + * when the properties are set then we cache them here to be + * applied to the MAC client when it is created. + */ + mac_resource_props_t mi_resource_props; /* SL */ + + minor_t mi_minor; /* WO */ + dev_t mi_phy_dev; /* WO */ + uint32_t mi_oref; /* SL */ + uint32_t mi_unsup_note; /* WO */ + /* * List of margin value requests added by mac clients. This list is * sorted: the first one has the greatest value. */ mac_margin_req_t *mi_mmrp; mac_priv_prop_t *mi_priv_prop; uint_t mi_priv_prop_count; -} mac_impl_t; + + /* + * Hybrid I/O related definitions. 
+ */ + mac_capab_share_t mi_share_capab; + +/* This should be the last block in this structure */ +#ifdef DEBUG +#define MAC_PERIM_STACK_DEPTH 15 + int mi_perim_stack_depth; + pc_t mi_perim_stack[MAC_PERIM_STACK_DEPTH]; +#endif +}; + +/* for mi_state_flags */ +#define MIS_DISABLED 0x0001 +#define MIS_IS_VNIC 0x0002 +#define MIS_IS_AGGR 0x0004 +#define MIS_NOTIFY_DONE 0x0008 +#define MIS_EXCLUSIVE 0x0010 +#define MIS_EXCLUSIVE_HELD 0x0020 +#define MIS_LEGACY 0x0040 #define mi_getstat mi_callbacks->mc_getstat #define mi_start mi_callbacks->mc_start @@ -212,19 +496,193 @@ typedef struct mac_impl_s { #define mi_setpromisc mi_callbacks->mc_setpromisc #define mi_multicst mi_callbacks->mc_multicst #define mi_unicst mi_callbacks->mc_unicst -#define mi_resources mi_callbacks->mc_resources #define mi_tx mi_callbacks->mc_tx #define mi_ioctl mi_callbacks->mc_ioctl #define mi_getcapab mi_callbacks->mc_getcapab +typedef struct mac_notify_task_arg { + mac_impl_t *mnt_mip; + mac_notify_type_t mnt_type; + mac_ring_t *mnt_ring; +} mac_notify_task_arg_t; + +typedef enum { + MAC_RX_NO_RESERVE, + MAC_RX_RESERVE_DEFAULT, + MAC_RX_RESERVE_NONDEFAULT +} mac_rx_group_reserve_type_t; + +/* + * XXX All MAC_DBG_PRTs must be replaced with call to dtrace probes. For now + * it may be easier to have these printfs for easier debugging + */ +#ifdef DEBUG +extern int mac_dbg; +#define MAC_DBG_PRT(a) if (mac_dbg > 0) {(void) printf a; } +#else +#define MAC_DBG_PRT(a) +#endif + +/* + * The mac_perim_handle_t is an opaque type that encodes the 'mip' pointer + * and whether internally a mac_open was done when acquiring the perimeter. + */ +#define MAC_ENCODE_MPH(mph, mh, need_close) \ + (mph) = (mac_perim_handle_t)((uintptr_t)(mh) | need_close) + +#define MAC_DECODE_MPH(mph, mip, need_close) { \ + mip = (mac_impl_t *)(((uintptr_t)mph) & ~0x1); \ + (need_close) = ((uintptr_t)mph & 0x1); \ +} + +typedef struct mac_client_impl_s mac_client_impl_t; + extern void mac_init(void); extern int mac_fini(void); extern void mac_stat_create(mac_impl_t *); extern void mac_stat_destroy(mac_impl_t *); extern uint64_t mac_stat_default(mac_impl_t *, uint_t); +extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *); +extern void mac_create_soft_ring_kstats(mac_impl_t *, int32_t); +extern boolean_t mac_ip_hdr_length_v6(mblk_t *, ip6_t *, uint16_t *, + uint8_t *); + +extern mblk_t *mac_copymsgchain_cksum(mblk_t *); +extern mblk_t *mac_fix_cksum(mblk_t *); +extern void mac_packet_print(mac_handle_t, mblk_t *); +extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *, + mac_header_info_t *); +extern void mac_tx_notify(mac_impl_t *); + +extern boolean_t mac_callback_find(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern void mac_callback_remove_wait(mac_cb_info_t *); +extern void mac_callback_free(mac_cb_t *); +extern mac_cb_t *mac_callback_walker_cleanup(mac_cb_info_t *, mac_cb_t **); + +/* in mac_bcast.c */ +extern void mac_bcast_init(void); +extern void mac_bcast_fini(void); +extern mac_impl_t *mac_bcast_grp_mip(void *); +extern int mac_bcast_add(mac_client_impl_t *, const uint8_t *, uint16_t, + mac_addrtype_t); +extern void mac_bcast_delete(mac_client_impl_t *, const uint8_t *, uint16_t); +extern void mac_bcast_send(void *, void *, mblk_t *, boolean_t); +extern void mac_bcast_grp_free(void *); +extern void mac_bcast_refresh(mac_impl_t *, mac_multicst_t, void *, + boolean_t); 
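+
+/*
+ * Illustrative sketch (not part of the interface): how a reader walks
+ * one of the mac_cb_t lists using the walker macros defined earlier in
+ * this header. Names other than the macros and the mac_cb_t members
+ * are hypothetical.
+ *
+ *	MAC_CALLBACK_WALKER_INC(mcbi);
+ *	for (mcb = *mcb_head; mcb != NULL; mcb = mcb->mcb_nextp) {
+ *		if (mcb->mcb_flags & MCB_CONDEMNED)
+ *			continue;	(logically deleted entry)
+ *		... invoke the callback embedded in mcb->mcb_objp ...
+ *	}
+ *	MAC_CALLBACK_WALKER_DCR(mcbi, mcb_head);
+ */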
+extern void mac_client_bcast_refresh(mac_client_impl_t *, mac_multicst_t, + void *, boolean_t); -extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *); +/* + * Grouping functions are used internally by MAC layer. + */ +extern int mac_group_addmac(mac_group_t *, const uint8_t *); +extern int mac_group_remmac(mac_group_t *, const uint8_t *); +extern int mac_rx_group_add_flow(mac_client_impl_t *, flow_entry_t *, + mac_group_t *); +extern mblk_t *mac_ring_tx(mac_ring_handle_t, mblk_t *); +extern mac_ring_t *mac_reserve_tx_ring(mac_impl_t *, mac_ring_t *); +extern void mac_release_tx_ring(mac_ring_handle_t); +extern mac_group_t *mac_reserve_tx_group(mac_impl_t *, mac_share_handle_t); +extern void mac_release_tx_group(mac_impl_t *, mac_group_t *); + +/* + * MAC address functions are used internally by MAC layer. + */ +extern mac_address_t *mac_find_macaddr(mac_impl_t *, uint8_t *); +extern boolean_t mac_check_macaddr_shared(mac_address_t *); +extern int mac_update_macaddr(mac_address_t *, uint8_t *); +extern void mac_freshen_macaddr(mac_address_t *, uint8_t *); +extern void mac_retrieve_macaddr(mac_address_t *, uint8_t *); +extern void mac_init_macaddr(mac_impl_t *); +extern void mac_fini_macaddr(mac_impl_t *); + +/* + * Flow construction/destruction routines. + * Not meant to be used by mac clients. + */ +extern int mac_link_flow_init(mac_client_handle_t, flow_entry_t *); +extern void mac_link_flow_clean(mac_client_handle_t, flow_entry_t *); + +/* + * Called from mac_provider.c + */ +extern void mac_fanout_recompute(mac_impl_t *); + +/* + * The following functions are used internally by the MAC layer to + * add/remove/update flows associated with a mac_impl_t. They should + * never be used directly by MAC clients. + */ +extern int mac_datapath_setup(mac_client_impl_t *, flow_entry_t *, uint32_t); +extern void mac_datapath_teardown(mac_client_impl_t *, flow_entry_t *, + uint32_t); +extern void mac_srs_group_setup(mac_client_impl_t *, flow_entry_t *, + mac_group_t *, uint32_t); +extern void mac_srs_group_teardown(mac_client_impl_t *, flow_entry_t *, + uint32_t); +extern int mac_rx_classify_flow_quiesce(flow_entry_t *, void *); +extern int mac_rx_classify_flow_restart(flow_entry_t *, void *); +extern void mac_tx_client_quiesce(mac_client_impl_t *, uint_t); +extern void mac_tx_client_restart(mac_client_impl_t *); +extern void mac_client_quiesce(mac_client_impl_t *); +extern void mac_client_restart(mac_client_impl_t *); + +extern void mac_flow_update_priority(mac_client_impl_t *, flow_entry_t *); + +extern void mac_flow_rem_subflow(flow_entry_t *); +extern void mac_rename_flow(flow_entry_t *, const char *); +extern void mac_flow_set_name(flow_entry_t *, const char *); + +extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t); +extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t); +extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *); +extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *); + +extern void i_mac_share_alloc(mac_client_impl_t *); +extern void i_mac_share_free(mac_client_impl_t *); +extern void i_mac_perim_enter(mac_impl_t *); +extern void i_mac_perim_exit(mac_impl_t *); +extern int i_mac_perim_enter_nowait(mac_impl_t *); +extern void i_mac_tx_srs_notify(mac_impl_t *, mac_ring_handle_t); +extern int mac_hold(const char *, mac_impl_t **); +extern void mac_rele(mac_impl_t *); +extern int i_mac_disable(mac_impl_t *); +extern void i_mac_notify(mac_impl_t *, 
mac_notify_type_t);
+extern void i_mac_notify_exit(mac_impl_t *);
+extern int mac_start(mac_impl_t *);
+extern void mac_stop(mac_impl_t *);
+extern void mac_rx_group_unmark(mac_group_t *, uint_t);
+extern void mac_tx_client_flush(mac_client_impl_t *);
+extern void mac_tx_client_block(mac_client_impl_t *);
+extern void mac_tx_client_unblock(mac_client_impl_t *);
+extern int i_mac_promisc_set(mac_impl_t *, boolean_t, mac_promisc_type_t);
+extern void i_mac_promisc_walker_cleanup(mac_impl_t *);
+extern mactype_t *mactype_getplugin(const char *);
+extern void mac_addr_factory_init(mac_impl_t *);
+extern void mac_addr_factory_fini(mac_impl_t *);
+extern void mac_register_priv_prop(mac_impl_t *, mac_priv_prop_t *, uint_t);
+extern void mac_unregister_priv_prop(mac_impl_t *);
+extern int mac_init_rings(mac_impl_t *, mac_ring_type_t);
+extern void mac_free_rings(mac_impl_t *, mac_ring_type_t);
+
+extern int mac_start_group(mac_group_t *);
+extern void mac_stop_group(mac_group_t *);
+extern int mac_start_ring(mac_ring_t *);
+extern void mac_stop_ring(mac_ring_t *);
+extern int mac_add_macaddr(mac_impl_t *, mac_group_t *, uint8_t *);
+extern int mac_remove_macaddr(mac_address_t *);
+
+extern void mac_set_rx_group_state(mac_group_t *, mac_group_state_t);
+extern void mac_rx_group_add_client(mac_group_t *, mac_client_impl_t *);
+extern void mac_rx_group_remove_client(mac_group_t *, mac_client_impl_t *);
+extern int i_mac_group_add_ring(mac_group_t *, mac_ring_t *, int);
+extern void i_mac_group_rem_ring(mac_group_t *, mac_ring_t *, boolean_t);

 #ifdef __cplusplus
 }
diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h
new file mode 100644
index 0000000000..9564efc00d
--- /dev/null
+++ b/usr/src/uts/common/sys/mac_provider.h
@@ -0,0 +1,478 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MAC_PROVIDER_H
+#define _SYS_MAC_PROVIDER_H
+
+#include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/mac_flow.h>
+#include <sys/mac.h>
+
+/*
+ * MAC Provider Interface
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * MAC version identifier. This is used by mac_alloc() and mac_register() to
+ * verify that incompatible drivers don't register.
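+ *
+ * For example (illustrative): a driver built against this interface
+ * calls mac_alloc(MAC_VERSION); if the framework's notion of the
+ * version differs, mac_alloc() is expected to fail (return NULL), so
+ * a mismatched driver never reaches mac_register().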
+ */
+#define MAC_VERSION 0x1
+
+/*
+ * Opaque handle types
+ */
+typedef struct __mac_rule_handle *mac_rule_handle_t;
+
+/*
+ * Statistics
+ */
+
+#define XCVR_UNDEFINED 0
+#define XCVR_NONE 1
+#define XCVR_10 2
+#define XCVR_100T4 3
+#define XCVR_100X 4
+#define XCVR_100T2 5
+#define XCVR_1000X 6
+#define XCVR_1000T 7
+
+#ifdef _KERNEL
+
+/*
+ * Definitions for MAC Drivers Capabilities
+ */
+/*
+ * MAC layer capabilities. These capabilities are handled by the drivers'
+ * mc_getcapab() callbacks. Some capabilities require the driver to fill
+ * in a given data structure, and others are simply boolean capabilities.
+ * Note that capability values must be powers of 2 so that consumers and
+ * providers of this interface can keep track of which capabilities they
+ * care about by keeping a bitfield of these things around somewhere.
+ */
+typedef enum {
+ /*
+ * Capabilities reserved for internal use only
+ */
+ MAC_CAPAB_VNIC = 0x0001, /* data is mac_capab_vnic_t */
+ MAC_CAPAB_ANCHOR_VNIC = 0x0002, /* boolean only, no data */
+ MAC_CAPAB_AGGR = 0x0004, /* data is mac_capab_aggr_t */
+ MAC_CAPAB_NO_NATIVEVLAN = 0x0008, /* boolean only, no data */
+ MAC_CAPAB_NO_ZCOPY = 0x0010, /* boolean only, no data */
+ MAC_CAPAB_LEGACY = 0x0020, /* data is mac_capab_legacy_t */
+
+ /*
+ * Public Capabilities
+ */
+ MAC_CAPAB_HCKSUM = 0x0100, /* data is a uint32_t */
+ MAC_CAPAB_LSO = 0x0200, /* data is mac_capab_lso_t */
+ MAC_CAPAB_RINGS = 0x0400, /* data is mac_capab_rings_t */
+ MAC_CAPAB_MULTIFACTADDR = 0x0800, /* mac_capab_multifactaddr_t */
+ MAC_CAPAB_SHARES = 0x1000 /* data is mac_capab_share_t */
+
+ /* add new capabilities here */
+} mac_capab_t;
+
+
+/*
+ * LSO capability
+ */
+typedef struct lso_basic_tcp_ipv4_s {
+ t_uscalar_t lso_max; /* maximum payload */
+} lso_basic_tcp_ipv4_t;
+
+/*
+ * Currently supported flags for LSO.
+ */
+#define LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */
+
+/*
+ * Future LSO capabilities can be added at the end of the mac_capab_lso_t.
+ * When such a capability is added to the GLDv3 framework, the size of the
+ * mac_capab_lso_t it allocates and passes to the drivers increases. Older
+ * drivers will access only the (upper) sections of that structure, that is the
+ * sections carrying the capabilities they understand. This ensures the
+ * interface can be safely extended in a binary compatible way.
+ */
+typedef struct mac_capab_lso_s {
+ t_uscalar_t lso_flags;
+ lso_basic_tcp_ipv4_t lso_basic_tcp_ipv4;
+ /* Add future lso capabilities here */
+} mac_capab_lso_t;
+
+/*
+ * Multiple Factory MAC Addresses Capability
+ */
+typedef struct mac_capab_multifactaddr_s {
+ /*
+ * Number of factory addresses
+ */
+ uint_t mcm_naddr;
+
+ /*
+ * Callbacks to query all the factory addresses.
+ */
+ void (*mcm_getaddr)(void *, uint_t, uint8_t *);
+} mac_capab_multifactaddr_t;
+
+/*
+ * MAC driver entry point types.
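+ *
+ * Each entry point receives as its first argument the driver-private
+ * handle that the driver supplied in m_driver at registration time.
+ * A minimal transmit entry point might look like this (illustrative
+ * sketch; the xx_* names are hypothetical driver code):
+ *
+ *	static mblk_t *
+ *	xx_m_tx(void *arg, mblk_t *mp)
+ *	{
+ *		xx_softstate_t *xxp = arg;
+ *
+ *		... queue mp to the hardware; return the untransmitted
+ *		    remainder of the chain, or NULL ...
+ *	}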
+ */
+typedef int (*mac_getstat_t)(void *, uint_t, uint64_t *);
+typedef int (*mac_start_t)(void *);
+typedef void (*mac_stop_t)(void *);
+typedef int (*mac_setpromisc_t)(void *, boolean_t);
+typedef int (*mac_multicst_t)(void *, boolean_t, const uint8_t *);
+typedef int (*mac_unicst_t)(void *, const uint8_t *);
+typedef void (*mac_ioctl_t)(void *, queue_t *, mblk_t *);
+typedef void (*mac_resources_t)(void *);
+typedef mblk_t *(*mac_tx_t)(void *, mblk_t *);
+typedef boolean_t (*mac_getcapab_t)(void *, mac_capab_t, void *);
+typedef int (*mac_open_t)(void *);
+typedef void (*mac_close_t)(void *);
+typedef int (*mac_set_prop_t)(void *, const char *, mac_prop_id_t,
+ uint_t, const void *);
+typedef int (*mac_get_prop_t)(void *, const char *, mac_prop_id_t,
+ uint_t, uint_t, void *, uint_t *);
+
+/*
+ * Drivers must set all of these callbacks except for mc_resources,
+ * mc_ioctl, and mc_getcapab, which are optional. If any of these optional
+ * callbacks are set, their appropriate flags must be set in mc_callbacks.
+ * Any future additions to this list must also be accompanied by an
+ * associated mc_callbacks flag so that the framework can grow without
+ * affecting the binary compatibility of the interface.
+ */
+typedef struct mac_callbacks_s {
+ uint_t mc_callbacks; /* Denotes which callbacks are set */
+ mac_getstat_t mc_getstat; /* Get the value of a statistic */
+ mac_start_t mc_start; /* Start the device */
+ mac_stop_t mc_stop; /* Stop the device */
+ mac_setpromisc_t mc_setpromisc; /* Enable or disable promiscuous mode */
+ mac_multicst_t mc_multicst; /* Enable or disable a multicast addr */
+ mac_unicst_t mc_unicst; /* Set the unicast MAC address */
+ mac_tx_t mc_tx; /* Transmit a packet */
+ mac_ioctl_t mc_ioctl; /* Process an unknown ioctl */
+ mac_getcapab_t mc_getcapab; /* Get capability information */
+ mac_open_t mc_open; /* Open the device */
+ mac_close_t mc_close; /* Close the device */
+ mac_set_prop_t mc_setprop;
+ mac_get_prop_t mc_getprop;
+} mac_callbacks_t;
+
+typedef struct mac_priv_prop_s {
+ char mpp_name[MAXLINKPROPNAME];
+ uint_t mpp_flags;
+} mac_priv_prop_t;
+
+/*
+ * Virtualization Capabilities
+ */
+/*
+ * The ordering of entries below is important. MAC_HW_CLASSIFIER is the
+ * cutoff: entries below it don't depend on H/W. MAC_HW_CLASSIFIER and
+ * entries after it are cases where H/W has been updated through
+ * add/modify/delete APIs.
+ */
+typedef enum {
+ MAC_NO_CLASSIFIER = 0,
+ MAC_SW_CLASSIFIER,
+ MAC_HW_CLASSIFIER
+} mac_classify_type_t;
+
+typedef void (*mac_rx_func_t)(void *, mac_resource_handle_t, mblk_t *,
+ boolean_t);
+
+/*
+ * The virtualization level conveys the extent of the NIC hardware assistance
+ * for traffic steering employed for virtualization:
+ *
+ * MAC_VIRT_NONE: No assist for v12n.
+ *
+ * MAC_VIRT_LEVEL1: Multiple Rx rings with MAC address level
+ * classification between groups of rings.
+ * Requires the support of the MAC_CAPAB_RINGS
+ * capability.
+ *
+ * MAC_VIRT_HIO: Hybrid I/O capable MAC. Requires the support
+ * of the MAC_CAPAB_SHARES capability.
+ *
+ * MAC_VIRT_SERIALIZE: Temporary flag *ONLY* for nxge. The MAC layer
+ * uses this to enable the mac Tx serializer on
+ * outbound traffic and to always enqueue
+ * incoming traffic on Rx soft rings in mac.
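+ *
+ * A driver advertises its level in the m_v12n field of mac_register_t
+ * (defined later in this header), e.g. (illustrative):
+ *
+ *	mregp->m_v12n = MAC_VIRT_LEVEL1 | MAC_VIRT_HIO;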
+ */
+#define MAC_VIRT_NONE 0x0
+#define MAC_VIRT_LEVEL1 0x1
+#define MAC_VIRT_HIO 0x2
+#define MAC_VIRT_SERIALIZE 0x4
+
+typedef enum {
+ MAC_RING_TYPE_RX = 1, /* Receive ring */
+ MAC_RING_TYPE_TX /* Transmit ring */
+} mac_ring_type_t;
+
+#define MAX_RINGS_PER_GROUP 32
+
+/*
+ * Grouping type of a ring group
+ *
+ * MAC_GROUP_TYPE_STATIC: The ring group cannot be re-grouped.
+ * MAC_GROUP_TYPE_DYNAMIC: The ring group supports dynamic re-grouping.
+ */
+typedef enum {
+ MAC_GROUP_TYPE_STATIC = 1, /* Static ring group */
+ MAC_GROUP_TYPE_DYNAMIC /* Dynamic ring group */
+} mac_group_type_t;
+
+typedef struct __mac_ring_driver *mac_ring_driver_t;
+typedef struct __mac_group_driver *mac_group_driver_t;
+
+typedef struct mac_ring_info_s mac_ring_info_t;
+typedef struct mac_group_info_s mac_group_info_t;
+
+typedef void (*mac_get_ring_t)(void *, mac_ring_type_t, const int, const int,
+ mac_ring_info_t *, mac_ring_handle_t);
+typedef void (*mac_get_group_t)(void *, mac_ring_type_t, const int,
+ mac_group_info_t *, mac_group_handle_t);
+
+typedef void (*mac_group_add_ring_t)(mac_group_driver_t,
+ mac_ring_driver_t, mac_ring_type_t);
+typedef void (*mac_group_rem_ring_t)(mac_group_driver_t,
+ mac_ring_driver_t, mac_ring_type_t);
+
+/*
+ * Multiple Rings Capability
+ */
+typedef struct mac_capab_rings_s {
+ mac_ring_type_t mr_type; /* Ring type: Rx vs Tx */
+ mac_group_type_t mr_group_type; /* Dynamic vs static grouping */
+ uint_t mr_rnum; /* Number of rings */
+ uint_t mr_gnum; /* Number of ring groups */
+ mac_get_ring_t mr_rget; /* Get ring from driver */
+ mac_get_group_t mr_gget; /* Get ring group from driver */
+ mac_group_add_ring_t mr_gaddring; /* Add ring into a group */
+ mac_group_rem_ring_t mr_gremring; /* Remove ring from a group */
+} mac_capab_rings_t;
+
+/*
+ * Common ring functions and driver interfaces
+ */
+typedef int (*mac_ring_start_t)(mac_ring_driver_t, uint64_t);
+typedef void (*mac_ring_stop_t)(mac_ring_driver_t);
+
+typedef mblk_t *(*mac_ring_send_t)(void *, mblk_t *);
+typedef mblk_t *(*mac_ring_poll_t)(void *, int);
+
+typedef struct mac_ring_info_s {
+ mac_ring_driver_t mri_driver;
+ mac_ring_start_t mri_start;
+ mac_ring_stop_t mri_stop;
+ mac_intr_t mri_intr;
+ union {
+ mac_ring_send_t send;
+ mac_ring_poll_t poll;
+ } mrfunion;
+} mac_ring_info_s;
+
+#define mri_tx mrfunion.send
+#define mri_poll mrfunion.poll
+
+typedef int (*mac_group_start_t)(mac_group_driver_t);
+typedef void (*mac_group_stop_t)(mac_group_driver_t);
+typedef int (*mac_add_mac_addr_t)(void *, const uint8_t *);
+typedef int (*mac_rem_mac_addr_t)(void *, const uint8_t *);
+
+struct mac_group_info_s {
+ mac_group_driver_t mgi_driver; /* Driver reference */
+ mac_group_start_t mgi_start; /* Start the group */
+ mac_group_stop_t mgi_stop; /* Stop the group */
+ uint_t mgi_count; /* Count of rings */
+ mac_intr_t mgi_intr; /* Optional per-group intr */
+
+ /* Only used for rx groups */
+ mac_add_mac_addr_t mgi_addmac; /* Add a MAC address */
+ mac_rem_mac_addr_t mgi_remmac; /* Remove a MAC address */
+};
+
+/*
+ * Share management functions.
+ */
+typedef uint64_t mac_share_handle_t;
+
+/*
+ * Allocate and free a share. Returns ENOSPC if all shares have been
+ * previously allocated.
+ */
+typedef int (*mac_alloc_share_t)(void *, mac_share_handle_t *);
+typedef void (*mac_free_share_t)(mac_share_handle_t);
+
+/*
+ * Bind and unbind a share. Binding a share allows a domain
+ * to have direct access to the groups and rings associated with
+ * that share.
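+ *
+ * Illustrative use (hypothetical driver callback invocation; the
+ * cookie semantics are driver-specific):
+ *
+ *	if (cap_share.ms_sbind(sh, cookie, &rcookie) != 0)
+ *		... binding failed ...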
+ */
+typedef int (*mac_bind_share_t)(mac_share_handle_t, uint64_t, uint64_t *);
+typedef void (*mac_unbind_share_t)(mac_share_handle_t);
+
+/*
+ * Return information about a share.
+ */
+typedef void (*mac_share_query_t)(mac_share_handle_t, mac_ring_type_t,
+ mac_ring_handle_t *, uint_t *);
+
+/*
+ * The basic idea: bind previously created ring groups to shares
+ * so that they can be exported (or shared) to another domain.
+ * These interfaces bind/unbind the ring group to a share.
+ * The groups and their rings will be shared with the guest
+ * as soon as the share is bound.
+ */
+typedef int (*mac_share_add_group_t)(mac_share_handle_t,
+ mac_group_driver_t);
+typedef int (*mac_share_rem_group_t)(mac_share_handle_t,
+ mac_group_driver_t);
+
+typedef struct mac_capab_share_s {
+ uint_t ms_snum; /* Number of shares (vr's) */
+ void *ms_handle; /* Handle to driver. */
+ mac_alloc_share_t ms_salloc; /* Get a share from driver. */
+ mac_free_share_t ms_sfree; /* Return a share to driver. */
+ mac_share_add_group_t ms_sadd; /* Add a group to the share. */
+ mac_share_rem_group_t ms_sremove; /* Remove group from share. */
+ mac_share_query_t ms_squery; /* Query share constraints */
+ mac_bind_share_t ms_sbind; /* Bind a share */
+ mac_unbind_share_t ms_sunbind; /* Unbind a share */
+} mac_capab_share_t;
+
+/*
+ * MAC registration interface
+ */
+typedef struct mac_register_s {
+ uint_t m_version; /* set by mac_alloc() */
+ const char *m_type_ident;
+ void *m_driver; /* Driver private data */
+ dev_info_t *m_dip;
+ uint_t m_instance;
+ uint8_t *m_src_addr;
+ uint8_t *m_dst_addr;
+ mac_callbacks_t *m_callbacks;
+ uint_t m_min_sdu;
+ uint_t m_max_sdu;
+ void *m_pdata;
+ size_t m_pdata_size;
+ uint32_t m_margin;
+ mac_priv_prop_t *m_priv_props;
+ size_t m_priv_prop_count;
+ uint32_t m_v12n; /* Virtualization level */
+} mac_register_t;
+
+/*
+ * Flags for mc_callbacks. Requiring drivers to set the flags associated
+ * with optional callbacks initialized in the structure allows the mac
+ * module to add optional callbacks in the future without requiring drivers
+ * to recompile.
+ */
+#define MC_IOCTL 0x001
+#define MC_GETCAPAB 0x002
+#define MC_OPEN 0x004
+#define MC_CLOSE 0x008
+#define MC_SETPROP 0x010
+#define MC_GETPROP 0x020
+
+/*
+ * Driver interface functions.
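+ *
+ * Typical attach-time usage (illustrative sketch; the xx_* names are
+ * hypothetical driver code):
+ *
+ *	mac_register_t *mregp;
+ *	mac_handle_t mh;
+ *
+ *	if ((mregp = mac_alloc(MAC_VERSION)) == NULL)
+ *		return (DDI_FAILURE);
+ *	mregp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+ *	mregp->m_driver = xxp;
+ *	mregp->m_dip = dip;
+ *	mregp->m_src_addr = xxp->xx_macaddr;
+ *	mregp->m_callbacks = &xx_m_callbacks;
+ *	    (xx_m_callbacks.mc_callbacks carries MC_IOCTL etc. for the
+ *	    optional entry points the driver implements)
+ *	mregp->m_min_sdu = 0;
+ *	mregp->m_max_sdu = ETHERMTU;
+ *	err = mac_register(mregp, &mh);
+ *	mac_free(mregp);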
+ */ +extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *); +extern int mac_maxsdu_update(mac_handle_t, uint_t); +extern int mac_set_prop(mac_handle_t, mac_prop_t *, + void *, uint_t); +extern int mac_get_prop(mac_handle_t, mac_prop_t *, + void *, uint_t, uint_t *); + +extern mac_register_t *mac_alloc(uint_t); +extern void mac_free(mac_register_t *); +extern int mac_register(mac_register_t *, mac_handle_t *); +extern int mac_disable_nowait(mac_handle_t); +extern int mac_disable(mac_handle_t); +extern int mac_unregister(mac_handle_t); +extern void mac_rx(mac_handle_t, mac_resource_handle_t, + mblk_t *); +extern void mac_rx_ring(mac_handle_t, mac_ring_handle_t, + mblk_t *, uint64_t); +extern void mac_link_update(mac_handle_t, link_state_t); +extern void mac_unicst_update(mac_handle_t, + const uint8_t *); +extern void mac_tx_update(mac_handle_t); +extern void mac_tx_ring_update(mac_handle_t, + mac_ring_handle_t); +extern void mac_resource_update(mac_handle_t); +extern void mac_capab_update(mac_handle_t); +extern int mac_pdata_update(mac_handle_t, void *, + size_t); +extern void mac_multicast_refresh(mac_handle_t, + mac_multicst_t, void *, boolean_t); +extern void mac_unicst_refresh(mac_handle_t, mac_unicst_t, + void *); +extern void mac_promisc_refresh(mac_handle_t, + mac_setpromisc_t, void *); +extern boolean_t mac_margin_update(mac_handle_t, uint32_t); +extern void mac_margin_get(mac_handle_t, uint32_t *); +extern int mac_margin_remove(mac_handle_t, uint32_t); +extern int mac_margin_add(mac_handle_t, uint32_t *, + boolean_t); +extern void mac_init_ops(struct dev_ops *, const char *); +extern void mac_fini_ops(struct dev_ops *); +extern uint32_t mac_no_notification(mac_handle_t); + +extern mactype_register_t *mactype_alloc(uint_t); +extern void mactype_free(mactype_register_t *); +extern int mactype_register(mactype_register_t *); +extern int mactype_unregister(const char *); +extern void mac_set_ring(void *, void *); + +extern boolean_t mac_unicst_verify(mac_handle_t, + const uint8_t *, uint_t); + +extern boolean_t mac_is_vnic(mac_handle_t); + +extern int mac_group_add_ring(mac_group_handle_t, int); +extern void mac_group_rem_ring(mac_group_handle_t, + mac_ring_handle_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_PROVIDER_H */ diff --git a/usr/src/uts/common/sys/mac_soft_ring.h b/usr/src/uts/common/sys/mac_soft_ring.h new file mode 100644 index 0000000000..45fcdf65bf --- /dev/null +++ b/usr/src/uts/common/sys/mac_soft_ring.h @@ -0,0 +1,724 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_MAC_SOFT_RING_H +#define _SYS_MAC_SOFT_RING_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/cpuvar.h> +#include <sys/processor.h> +#include <sys/stream.h> +#include <sys/squeue.h> +#include <sys/dlpi.h> +#include <sys/mac_impl.h> + +#define S_RING_NAMELEN 64 + +#define MAX_SR_FANOUT 32 + +extern boolean_t mac_soft_ring_enable; +extern boolean_t mac_latency_optimize; + +typedef struct mac_soft_ring_s mac_soft_ring_t; +typedef struct mac_soft_ring_set_s mac_soft_ring_set_t; + +typedef void (*mac_soft_ring_drain_func_t)(mac_soft_ring_t *); +typedef mac_tx_cookie_t (*mac_tx_func_t)(mac_soft_ring_set_t *, mblk_t *, + uintptr_t, uint16_t, mblk_t **); + + +/* Tx notify callback */ +typedef struct mac_tx_notify_cb_s { + mac_cb_t mtnf_link; /* Linked list of callbacks */ + mac_tx_notify_t mtnf_fn; /* The callback function */ + void *mtnf_arg; /* Callback function argument */ +} mac_tx_notify_cb_t; + +struct mac_soft_ring_s { + /* Keep the most used members 64bytes cache aligned */ + kmutex_t s_ring_lock; /* lock before using any member */ + uint16_t s_ring_type; /* processing model of the sq */ + uint16_t s_ring_state; /* state flags and message count */ + int s_ring_count; /* # of mblocks in mac_soft_ring */ + size_t s_ring_size; /* Size of data queued */ + mblk_t *s_ring_first; /* first mblk chain or NULL */ + mblk_t *s_ring_last; /* last mblk chain or NULL */ + + mac_direct_rx_t s_ring_rx_func; + void *s_ring_rx_arg1; + mac_resource_handle_t s_ring_rx_arg2; + + /* + * Threshold after which packets get dropped. + * Is always greater than s_ring_tx_hiwat + */ + int s_ring_tx_max_q_cnt; + /* # of mblocks after which to apply flow control */ + int s_ring_tx_hiwat; + /* # of mblocks after which to relieve flow control */ + int s_ring_tx_lowat; + boolean_t s_ring_tx_woken_up; + uint32_t s_ring_blocked_cnt; /* times blocked for Tx descs */ + uint32_t s_ring_unblocked_cnt; /* unblock calls from driver */ + uint32_t s_ring_hiwat_cnt; /* times blocked for Tx descs */ + + void *s_ring_tx_arg1; + void *s_ring_tx_arg2; + + /* Tx notify callback */ + mac_cb_info_t s_ring_notify_cb_info; /* cb list info */ + mac_cb_t *s_ring_notify_cb_list; /* The cb list */ + + clock_t s_ring_awaken; /* time async thread was awakened */ + + kthread_t *s_ring_run; /* Current thread processing sq */ + processorid_t s_ring_cpuid; /* processor to bind to */ + processorid_t s_ring_cpuid_save; /* saved cpuid during offline */ + kcondvar_t s_ring_async; /* async thread blocks on */ + clock_t s_ring_wait; /* lbolts to wait after a fill() */ + timeout_id_t s_ring_tid; /* timer id of pending timeout() */ + kthread_t *s_ring_worker; /* kernel thread id */ + char s_ring_name[S_RING_NAMELEN + 1]; + uint32_t s_ring_total_inpkt; + uint32_t s_ring_drops; + struct mac_client_impl_s *s_ring_mcip; + void *s_ring_flent; + kstat_t *s_ring_ksp; + + /* Teardown, poll disable control ops */ + kcondvar_t s_ring_client_cv; /* Client wait for control op */ + + mac_soft_ring_set_t *s_ring_set; /* The SRS this ring belongs to */ + mac_soft_ring_t *s_ring_next; + mac_soft_ring_t *s_ring_prev; + mac_soft_ring_drain_func_t s_ring_drain_func; +}; + +typedef void (*mac_srs_drain_proc_t)(mac_soft_ring_set_t *, uint_t); + +/* Transmit side Soft Ring Set */ +typedef struct mac_srs_tx_s { + /* Members for Tx size processing */ + uint32_t st_mode; + mac_tx_func_t st_func; + void *st_arg1; + void *st_arg2; + mac_group_t *st_group; /* TX group for share */ + boolean_t st_woken_up; + + /* + * st_max_q_cnt is 
the queue depth threshold to limit
+ * outstanding packets on the Tx SRS. Once the limit
+ * is reached, the Tx SRS will drop packets until the
+ * queue length goes back below the threshold.
+ */
+ uint32_t st_max_q_cnt; /* max. outstanding packets */
+ /*
+ * st_hiwat is used in Tx serializer and bandwidth mode.
+ * This is the queue depth threshold up to which
+ * packets will get buffered with no flow-control
+ * back pressure applied to the caller. Once this
+ * threshold is reached, back pressure will be
+ * applied to the caller of mac_tx() (mac_tx() starts
+ * returning a cookie to indicate a blocked SRS).
+ * st_hiwat should always be less than or equal to
+ * st_max_q_cnt.
+ */
+ uint32_t st_hiwat; /* mblk cnt to apply flow control */
+ uint32_t st_lowat; /* mblk cnt to relieve flow control */
+ uint32_t st_drop_count;
+ /*
+ * The number of times the SRS gets blocked due to lack of Tx
+ * descriptors is noted down, as is the corresponding wakeup
+ * from the driver to unblock. They should match in a
+ * correctly working setup. If there are fewer unblocks
+ * than blocks, then the Tx side waits forever for a wakeup
+ * from below. The following are protected by srs_lock.
+ */
+ uint32_t st_blocked_cnt; /* times blocked for Tx descs */
+ uint32_t st_unblocked_cnt; /* unblock calls from driver */
+ uint32_t st_hiwat_cnt; /* times blocked for Tx descs */
+} mac_srs_tx_t;
+
+/* Receive side Soft Ring Set */
+typedef struct mac_srs_rx_s {
+ /*
+ * Upcall function for fanout, Rx processing etc. Perhaps
+ * the same 3 members below can be used for Tx
+ * processing, but looking around, mac_rx_func_t has
+ * proliferated too much into various files at different
+ * places. I am leaving the consolidation battle for
+ * another day.
+ */
+ mac_direct_rx_t sr_func; /* srs_lock */
+ void *sr_arg1; /* srs_lock */
+ mac_resource_handle_t sr_arg2; /* srs_lock */
+ mac_rx_func_t sr_lower_proc; /* Atomically changed */
+ boolean_t sr_enqueue_always; /* enqueue at soft ring */
+ uint32_t sr_poll_pkt_cnt;
+ uint32_t sr_poll_thres;
+
+ /* mblk cnt to apply flow control */
+ uint32_t sr_hiwat;
+ /* mblk cnt to relieve flow control */
+ uint32_t sr_lowat;
+ uint32_t sr_poll_count;
+ uint32_t sr_intr_count;
+ uint32_t sr_drop_count;
+
+ /* Times polling was enabled */
+ uint32_t sr_poll_on;
+ /* Times polling was enabled by worker thread */
+ uint32_t sr_worker_poll_on;
+ /* Times polling was disabled */
+ uint32_t sr_poll_off;
+ /* Poll thread signalled count */
+ uint32_t sr_poll_thr_sig;
+ /* Poll thread busy */
+ uint32_t sr_poll_thr_busy;
+ /* SRS drains, stays in poll mode but doesn't poll */
+ uint32_t sr_poll_drain_no_poll;
+ /*
+ * SRS has nothing to do and no packets in H/W but
+ * there is a backlog in softrings. SRS stays in
+ * poll mode but doesn't do polling.
+ */
+ uint32_t sr_poll_no_poll;
+ /* Active polling restarted */
+ uint32_t sr_below_hiwat;
+ /* Found packets in last poll so try and poll again */
+ uint32_t sr_poll_again;
+ /*
+ * Packets in queue but poll thread not allowed to process so
+ * signal the worker thread.
+ */
+ uint32_t sr_poll_sig_worker;
+ /*
+ * Poll thread has nothing to do and H/W has nothing so
+ * reenable the interrupts.
+ */
+ uint32_t sr_poll_intr_enable;
+ /*
+ * Poll thread has nothing to do and worker thread was already
+ * running so it can decide to reenable interrupt or poll again.
+ */
+ uint32_t sr_poll_goto_sleep;
+ /* Worker thread goes back to draining the queue */
+ uint32_t sr_drain_again;
+ /* More packets in queue so signal the worker thread to drain */
+ uint32_t sr_drain_worker_sig;
+ /* Poll thread is already running so worker has nothing to do */
+ uint32_t sr_drain_poll_running;
+ /* We have packets already queued so keep polling */
+ uint32_t sr_drain_keep_polling;
+ /* Drain is done and interrupts are reenabled */
+ uint32_t sr_drain_finish_intr;
+ /* Polling thread needs to schedule worker wakeup */
+ uint32_t sr_poll_worker_wakeup;
+
+ /* Chains less than 10 pkts */
+ uint32_t sr_chain_cnt_undr10;
+ /* Chains between 10 & 50 pkts */
+ uint32_t sr_chain_cnt_10to50;
+ /* Chains over 50 pkts */
+ uint32_t sr_chain_cnt_over50;
+} mac_srs_rx_t;
+
+/*
+ * mac_soft_ring_set_s:
+ * This is used for both the Tx and Rx side. The srs_type identifies Rx or
+ * Tx type.
+ *
+ * Note that the structure is carefully crafted, with Rx elements coming
+ * first followed by Tx specific members. Future additions to this
+ * structure should follow the same guidelines.
+ *
+ * Rx-side notes:
+ * mac_rx_classify_flow_add() always creates a mac_soft_ring_set_t and fn_flow
+ * points to info from it (func = srs_lower_proc, arg = soft_ring_set). On
+ * the interrupt path, srs_lower_proc does B/W adjustment, switches to polling
+ * mode (if poll capable) and feeds the packets to the soft_ring_list via the
+ * chosen fanout type (specified by srs_type). In poll mode, the poll thread
+ * can pick up the packets and feed them to the various soft_ring_list.
+ *
+ * The srs_type can either be protocol based or fanout based, where the fanout
+ * itself can be of various types.
+ *
+ * The polling works by turning off interrupts as soon as packets
+ * are queued on the soft ring set. Once the backlog is clear and the poll
+ * thread returns empty-handed, i.e. the Rx ring doesn't have anything, the
+ * interrupt is turned back on. For this purpose we keep a separate
+ * srs_poll_pkt_cnt counter which tracks the packets queued between the SRS
+ * and the soft rings as well. The counter is incremented when packets
+ * are queued and decremented when the SRS processes them (in case it has
+ * no soft rings) or when the soft rings process them. It's important that
+ * in case the SRS has soft rings, the decrement doesn't happen till the
+ * packet is processed by the soft rings, since it takes very little time
+ * for the SRS to queue a packet to the soft rings and it would otherwise keep
+ * bringing more packets into the system faster than the soft rings can
+ * process them.
+ *
+ * Tx side notes:
+ * The srs structure acts as a serializer with a worker thread. The
+ * default behavior of srs though is to act as a pass-thru. The queues
+ * (srs_first, srs_last, srs_count) get used when the Tx ring runs out of Tx
+ * descriptors or to enforce bandwidth limits.
+ *
+ * When multiple Tx rings are present, the SRS state will be set to
+ * SRS_FANOUT_OTH. Outgoing packets coming into the mac_tx_srs_process()
+ * function will be fanned out to one of the Tx side soft rings based on
+ * a hint passed in mac_tx_srs_process(). Each soft ring, in turn, will
+ * be associated with a distinct h/w Tx ring.
+ */
+
+struct mac_soft_ring_set_s {
+ /*
+ * Common elements, common to both Rx and Tx SRS type.
+ * The following block of fields is protected by srs_lock
+ */
+ kmutex_t srs_lock;
+ uint32_t srs_type;
+ uint32_t srs_state; /* state flags */
+ uint32_t srs_count;
+ mblk_t *srs_first; /* first mblk chain or NULL */
+ mblk_t *srs_last; /* last mblk chain or NULL */
+ kcondvar_t srs_async; /* cv for worker thread */
+ kcondvar_t srs_cv; /* cv for poll thread */
+ kcondvar_t srs_quiesce_done_cv; /* cv for removal */
+ timeout_id_t srs_tid; /* timeout id for pending timeout */
+
+ /*
+ * List of soft rings & processing function.
+ * The following block is protected by Rx quiescence,
+ * i.e. they can be changed only after quiescing the SRS.
+ * Protected by srs_lock.
+ */
+ mac_soft_ring_t *srs_soft_ring_head;
+ mac_soft_ring_t *srs_soft_ring_tail;
+ int srs_soft_ring_count;
+ int srs_soft_ring_quiesced_count;
+ int srs_soft_ring_condemned_count;
+ mac_soft_ring_t **srs_tcp_soft_rings;
+ int srs_tcp_ring_count;
+ mac_soft_ring_t **srs_udp_soft_rings;
+ int srs_udp_ring_count;
+ /*
+ * srs_oth_soft_rings is also used by tx_srs when
+ * operating in multi tx ring mode.
+ */
+ mac_soft_ring_t **srs_oth_soft_rings;
+ int srs_oth_ring_count;
+
+ /*
+ * Bandwidth control related members.
+ * They are common to both Rx- and Tx-side.
+ * The following are protected by srs_lock
+ */
+ mac_bw_ctl_t *srs_bw;
+ size_t srs_size; /* Size of packets queued in bytes */
+ pri_t srs_pri;
+
+ mac_soft_ring_set_t *srs_next; /* mac_srs_g_lock */
+ mac_soft_ring_set_t *srs_prev; /* mac_srs_g_lock */
+
+ /* Attribute specific drain func (BW ctl vs non-BW ctl) */
+ mac_srs_drain_proc_t srs_drain_func; /* Write once (WO) */
+
+ /*
+ * If the associated ring is exclusively used by a mac client, e.g.,
+ * an aggregation, this field is used to keep a reference to the
+ * MAC client's pseudo ring.
+ */
+ mac_resource_handle_t srs_mrh;
+ /*
+ * The following blocks are write once (WO) and valid for the life
+ * of the SRS
+ */
+ struct mac_client_impl_s *srs_mcip; /* back ptr to mac client */
+ void *srs_flent; /* back ptr to flent */
+ mac_ring_t *srs_ring; /* Ring Descriptor */
+
+ /* Teardown, disable control ops */
+ kcondvar_t srs_client_cv; /* Client wait for the control op */
+
+ kthread_t *srs_worker; /* WO, worker thread */
+ kthread_t *srs_poll_thr; /* WO, poll thread */
+
+ uint_t srs_ind; /* Round Robin indx for picking up SR */
+ processorid_t srs_worker_cpuid; /* processor to bind to */
+ processorid_t srs_worker_cpuid_save; /* saved cpuid during offline */
+ processorid_t srs_poll_cpuid; /* processor to bind to */
+ processorid_t srs_poll_cpuid_save; /* saved cpuid during offline */
+ uint_t srs_fanout_state;
+ mac_cpus_t srs_cpu;
+
+ mac_srs_rx_t srs_rx;
+ mac_srs_tx_t srs_tx;
+};
+
+/*
+ * type flags - combination allowed to process and drain the queue
+ */
+#define ST_RING_WORKER_ONLY 0x0001 /* Worker thread only */
+#define ST_RING_ANY 0x0002 /* Any thread can process the queue */
+#define ST_RING_TCP 0x0004
+#define ST_RING_UDP 0x0008
+#define ST_RING_OTH 0x0010
+
+#define ST_RING_BW_CTL 0x0020
+#define ST_RING_TX 0x0040
+
+/*
+ * State flags.
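+ *
+ * These are applied to s_ring_state and reflect the ring's runtime
+ * condition. For example (illustrative): when the driver runs out of
+ * Tx descriptors, the ring is marked S_RING_BLOCK and presumably stays
+ * blocked until the driver's Tx-completion path triggers an unblock
+ * (e.g. via mac_tx_ring_update()).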
+/*
+ * State flags.
+ */
+#define	S_RING_PROC		0x0001	/* being processed */
+#define	S_RING_BOUND		0x0002	/* Worker thread is bound to a cpu */
+#define	S_RING_BLOCK		0x0004	/* No Tx descs */
+#define	S_RING_TX_HIWAT		0x0008	/* Tx high watermark reached */
+
+#define	S_RING_WAKEUP_CLIENT	0x0010	/* flow ctrl, client wakeup needed */
+#define	S_RING_BLANK		0x0020	/* Has been put into polling mode */
+#define	S_RING_CLIENT_WAIT	0x0040	/* Client waiting for control op */
+
+#define	S_RING_CONDEMNED	0x0100	/* Being torn down */
+#define	S_RING_CONDEMNED_DONE	0x0200	/* Being torn down */
+#define	S_RING_QUIESCE		0x0400	/* No traffic flow, transient flag */
+#define	S_RING_QUIESCE_DONE	0x0800	/* No traffic flow, transient flag */
+
+#define	S_RING_RESTART		0x1000	/* Go back to normal traffic flow */
+#define	S_RING_ENQUEUED		0x2000	/* Pkts enqueued in Tx soft ring */
+
+/*
+ * arguments for processors to bind to
+ */
+#define	S_RING_BIND_NONE	-1
+
+/*
+ * defines for srs_type - identifies a link or a sub-flow
+ * and other static characteristics of an SRS, like a Tx
+ * SRS, TCP-only SRS, etc.
+ */
+#define	SRST_LINK		0x00000001
+#define	SRST_FLOW		0x00000002
+#define	SRST_NO_SOFT_RINGS	0x00000004
+#define	SRST_TCP_ONLY		0x00000008
+
+#define	SRST_FANOUT_PROTO	0x00000010
+#define	SRST_FANOUT_SRC_IP	0x00000020
+#define	SRST_FANOUT_OTH		0x00000040
+#define	SRST_DEFAULT_GRP	0x00000080
+
+#define	SRST_TX			0x00000100
+#define	SRST_BW_CONTROL		0x00000200
+#define	SRST_DIRECT_POLL	0x00000400
+
+#define	SRST_DLS_BYPASS		0x00001000
+#define	SRST_CLIENT_POLL_ENABLED 0x00002000
+
+/*
+ * soft ring set flags. These bits are dynamic in nature and get
+ * applied to srs_state. They reflect the state of the SRS at any
+ * point in time.
+ */
+#define	SRS_BLANK		0x00000001
+#define	SRS_WORKER_BOUND	0x00000002
+#define	SRS_POLL_BOUND		0x00000004
+#define	SRS_POLLING_CAPAB	0x00000008
+
+#define	SRS_PROC		0x00000010
+#define	SRS_GET_PKTS		0x00000020
+#define	SRS_POLLING		0x00000040
+#define	SRS_BW_ENFORCED		0x00000080
+
+#define	SRS_WORKER		0x00000100
+#define	SRS_ENQUEUED		0x00000200
+#define	SRS_ANY_PROCESS		0x00000400
+#define	SRS_PROC_FAST		0x00000800
+
+#define	SRS_POLL_PROC		0x00001000
+#define	SRS_TX_BLOCKED		0x00002000	/* out of Tx descs */
+#define	SRS_TX_HIWAT		0x00004000	/* Tx count exceeds hiwat */
+#define	SRS_TX_WAKEUP_CLIENT	0x00008000	/* Flow-ctl: wakeup client */
+
+#define	SRS_CLIENT_PROC		0x00010000
+#define	SRS_CLIENT_WAIT		0x00020000
+#define	SRS_QUIESCE		0x00040000
+#define	SRS_QUIESCE_DONE	0x00080000
+
+#define	SRS_CONDEMNED		0x00100000
+#define	SRS_CONDEMNED_DONE	0x00200000
+#define	SRS_POLL_THR_QUIESCED	0x00400000
+#define	SRS_RESTART		0x00800000
+
+#define	SRS_RESTART_DONE	0x01000000
+#define	SRS_POLL_THR_RESTART	0x02000000
+#define	SRS_IN_GLIST		0x04000000
+#define	SRS_POLL_THR_EXITED	0x08000000
+
+#define	SRS_QUIESCE_PERM	0x10000000
+#define	SRS_LATENCY_OPT		0x20000000
+
+#define	SRS_QUIESCED(srs)	(srs->srs_state & SRS_QUIESCE_DONE)
+
+/*
+ * If the SRS_QUIESCE_PERM flag is set, the SRS worker thread cannot
+ * be restarted.
+ */
+#define	SRS_QUIESCED_PERMANENT(srs)	(srs->srs_state & SRS_QUIESCE_PERM)
+
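
The S_RING_QUIESCE/S_RING_QUIESCE_DONE and SRS_QUIESCE/SRS_QUIESCE_DONE pairs above encode a handshake: a control thread raises the quiesce request and waits, and the worker acknowledges before parking. Below is a small pthreads model of that handshake; the names are stand-ins for srs_async, srs_quiesce_done_cv and the flag bits, and none of this is kernel code.

/* cc -o quiesce_model quiesce_model.c -lpthread */
#include <pthread.h>
#include <stdio.h>

#define	QUIESCE		0x1	/* stand-in for SRS_QUIESCE */
#define	QUIESCE_DONE	0x2	/* stand-in for SRS_QUIESCE_DONE */
#define	RESTART		0x4	/* stand-in for SRS_RESTART */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t async = PTHREAD_COND_INITIALIZER;	  /* worker cv */
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER; /* control cv */
static unsigned state;

static void *
worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	for (;;) {
		while (!(state & (QUIESCE | RESTART)))
			pthread_cond_wait(&async, &lock);
		if (state & RESTART)
			break;
		if ((state & QUIESCE) && !(state & QUIESCE_DONE)) {
			/* stop processing and acknowledge the request */
			state |= QUIESCE_DONE;
			pthread_cond_signal(&done_cv);
		}
		/* stay parked until the controller moves us again */
		pthread_cond_wait(&async, &lock);
	}
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	pthread_mutex_lock(&lock);
	state |= QUIESCE;			/* ask the worker to pause */
	pthread_cond_signal(&async);
	while (!(state & QUIESCE_DONE))		/* wait for the ack */
		pthread_cond_wait(&done_cv, &lock);
	printf("worker quiesced\n");
	state = RESTART;			/* let it run (and exit) */
	pthread_cond_signal(&async);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return (0);
}
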
+/*
+ * soft ring set (SRS) Tx modes
+ */
+typedef enum {
+	SRS_TX_DEFAULT = 0,
+	SRS_TX_SERIALIZE,
+	SRS_TX_FANOUT,
+	SRS_TX_BW,
+	SRS_TX_BW_FANOUT
+} mac_tx_srs_mode_t;
+
+/*
+ * SRS fanout states
+ */
+typedef enum {
+	SRS_FANOUT_UNINIT = 0,
+	SRS_FANOUT_INIT,
+	SRS_FANOUT_REINIT
+} mac_srs_fanout_state_t;
+
+/*
+ * Structure for dls statistics
+ */
+struct dls_kstats {
+	kstat_named_t	dlss_soft_ring_pkt_drop;
+};
+
+extern struct dls_kstats dls_kstat;
+
+#define	DLS_BUMP_STAT(x, y)	(dls_kstat.x.value.ui32 += y)
+
+/* Turn dynamic polling off */
+#define	MAC_SRS_POLLING_OFF(mac_srs) {					\
+	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
+	if (((mac_srs)->srs_state & (SRS_POLLING_CAPAB|SRS_POLLING)) ==	\
+	    (SRS_POLLING_CAPAB|SRS_POLLING)) {				\
+		(mac_srs)->srs_state &= ~SRS_POLLING;			\
+		(void) mac_hwring_enable_intr((mac_ring_handle_t)	\
+		    (mac_srs)->srs_ring);				\
+		(mac_srs)->srs_rx.sr_poll_off++;			\
+	}								\
+}
+
+#define	MAC_COUNT_CHAIN(mac_srs, head, tail, cnt, sz) {			\
+	mblk_t *tmp;							\
+	boolean_t bw_ctl = B_FALSE;					\
+									\
+	ASSERT((head) != NULL);						\
+	cnt = 0;							\
+	sz = 0;								\
+	if ((mac_srs)->srs_type & SRST_BW_CONTROL)			\
+		bw_ctl = B_TRUE;					\
+	tmp = tail = (head);						\
+	if ((head)->b_next == NULL) {					\
+		cnt = 1;						\
+		if (bw_ctl)						\
+			sz += msgdsize(head);				\
+	} else {							\
+		while (tmp != NULL) {					\
+			tail = tmp;					\
+			cnt++;						\
+			if (bw_ctl)					\
+				sz += msgdsize(tmp);			\
+			tmp = tmp->b_next;				\
+		}							\
+	}								\
+}
+
+/*
+ * Decrement the cumulative packet count in the SRS and its
+ * soft rings. If the srs_poll_pkt_cnt goes below lowat, then check
+ * if the interface was left in a polling mode and no one
+ * is really processing the queue (to get the interface out
+ * of poll mode). If no one is processing the queue, then
+ * acquire the PROC and signal the poll thread to check the
+ * interface for packets and get the interface back to interrupt
+ * mode if nothing is found.
+ */
+#define	MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt) {			\
+	mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx;			\
+	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
+									\
+	srs_rx->sr_poll_pkt_cnt -= cnt;					\
+	if ((srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_poll_thres) &&	\
+	    (((mac_srs)->srs_state &					\
+	    (SRS_POLLING|SRS_PROC|SRS_GET_PKTS)) == SRS_POLLING))	\
+	{								\
+		(mac_srs)->srs_state |= (SRS_PROC|SRS_GET_PKTS);	\
+		cv_signal(&(mac_srs)->srs_cv);				\
+		srs_rx->sr_below_hiwat++;				\
+	}								\
+}
+
+/*
+ * The following two macros are used to update the inbound packet and byte
+ * counts. The packet and byte counts reflect the packets and bytes that are
+ * taken out of the SRS's queue, i.e. indicating that they are being delivered.
+ * The srs_count and srs_size are updated in different locations, as the
+ * srs_size is also used to take into account any bandwidth limits. The
+ * srs_size is updated only when a soft ring, if any, sends a packet up,
+ * as opposed to updating it when the SRS sends a packet to the SR, i.e.
+ * the srs_size reflects the packets in the SRS and SRs. These
+ * macros decrement the srs_size and srs_count and also increment the
+ * ipackets and ibytes stats, respectively.
+ *
+ * xxx-venu These are done under srs_lock, for now we still update
+ * mci_stat_ibytes/mci_stat_ipackets atomically, need to check if
+ * just updating them would be accurate enough.
+ *
+ * If we are updating these for a sub-flow SRS, then we need to also
+ * update its MAC client bandwidth info, if the MAC client is also
+ * bandwidth regulated.
+ */
+#define	MAC_UPDATE_SRS_SIZE_LOCKED(srs, sz) {				\
+	if ((srs)->srs_type & SRST_BW_CONTROL) {			\
+		mutex_enter(&(srs)->srs_bw->mac_bw_lock);		\
+		(srs)->srs_bw->mac_bw_sz -= (sz);			\
+		(srs)->srs_bw->mac_bw_used += (sz);			\
+		mutex_exit(&(srs)->srs_bw->mac_bw_lock);		\
+	}								\
+}
+
+#define	MAC_TX_UPDATE_BW_INFO(srs, sz) {				\
+	(srs)->srs_bw->mac_bw_sz -= (sz);				\
+	(srs)->srs_bw->mac_bw_used += (sz);				\
+}
+
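
MAC_COUNT_CHAIN above makes one pass over an mblk chain to find its tail and count its messages, and totals the byte size only when the SRS is bandwidth controlled, so the msgdsize() cost is not paid on the fast path. The same logic as a plain function over a toy two-field mblk, runnable in user space (the macro's single-mblk special case is folded into the general loop here):

#include <stdio.h>
#include <stddef.h>

/* Toy mblk: just the chain link and a stand-in for msgdsize(). */
typedef struct mblk {
	struct mblk	*b_next;
	size_t		b_size;
} mblk_t;

static mblk_t *
count_chain(mblk_t *head, int bw_ctl, int *cntp, size_t *szp)
{
	mblk_t *mp, *tail = head;
	int cnt = 0;
	size_t sz = 0;

	for (mp = head; mp != NULL; mp = mp->b_next) {
		tail = mp;
		cnt++;
		if (bw_ctl)		/* only pay the size cost under B/W ctl */
			sz += mp->b_size;
	}
	*cntp = cnt;
	*szp = sz;
	return (tail);
}

int
main(void)
{
	mblk_t c = { NULL, 60 }, b = { &c, 1514 }, a = { &b, 40 };
	int cnt;
	size_t sz;

	(void) count_chain(&a, 1, &cnt, &sz);
	printf("%d packets, %zu bytes\n", cnt, sz);	/* 3, 1614 */
	return (0);
}
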
+#define	TX_MULTI_RING_MODE(mac_srs)				\
+	((mac_srs)->srs_tx.st_mode == SRS_TX_FANOUT ||		\
+	(mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)
+
+/* Soft ring flags for teardown */
+#define	SRS_POLL_THR_OWNER	(SRS_PROC | SRS_POLLING | SRS_GET_PKTS)
+#define	SRS_PAUSE		(SRS_CONDEMNED | SRS_QUIESCE)
+#define	S_RING_PAUSE		(S_RING_CONDEMNED | S_RING_QUIESCE)
+
+/* Soft rings */
+extern void mac_soft_ring_init(void);
+extern void mac_soft_ring_finish(void);
+extern void mac_fanout_setup(mac_client_impl_t *, flow_entry_t *,
+    mac_resource_props_t *, mac_direct_rx_t, void *, mac_resource_handle_t);
+
+extern void mac_soft_ring_worker_wakeup(mac_soft_ring_t *);
+extern void mac_soft_ring_blank(void *, time_t, uint_t, int);
+extern mblk_t *mac_soft_ring_poll(mac_soft_ring_t *, int);
+extern void mac_soft_ring_destroy(mac_soft_ring_t *);
+extern void mac_soft_ring_dls_bypass(void *, mac_direct_rx_t, void *);
+
+/* Rx SRS */
+extern mac_soft_ring_set_t *mac_srs_create(struct mac_client_impl_s *,
+    flow_entry_t *, uint32_t, mac_direct_rx_t, void *, mac_resource_handle_t,
+    mac_ring_t *);
+extern void mac_srs_free(mac_soft_ring_set_t *);
+extern void mac_srs_signal(mac_soft_ring_set_t *, uint_t);
+extern cpu_t *mac_srs_bind(mac_soft_ring_set_t *, processorid_t);
+
+extern void mac_srs_change_upcall(void *, mac_direct_rx_t, void *);
+extern void mac_srs_quiesce_initiate(mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_enable(struct mac_client_impl_s *,
+    mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_disable(struct mac_client_impl_s *,
+    mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_quiesce(struct mac_client_impl_s *,
+    mac_soft_ring_set_t *);
+extern void mac_srs_client_poll_restart(struct mac_client_impl_s *,
+    mac_soft_ring_set_t *);
+extern void mac_rx_srs_quiesce(mac_soft_ring_set_t *, uint_t);
+extern void mac_rx_srs_restart(mac_soft_ring_set_t *);
+extern void mac_rx_srs_subflow_process(void *, mac_resource_handle_t, mblk_t *,
+    boolean_t);
+extern void mac_tx_srs_quiesce(mac_soft_ring_set_t *, uint_t);
+
+/* Tx SRS, Tx softring */
+extern void mac_tx_srs_wakeup(mac_soft_ring_set_t *, mac_ring_handle_t);
+extern void mac_tx_srs_setup(struct mac_client_impl_s *,
+    flow_entry_t *, uint32_t);
+extern mac_tx_func_t mac_tx_get_func(uint32_t);
+extern mblk_t *mac_tx_send(mac_client_handle_t, mac_ring_handle_t, mblk_t *,
+    mac_tx_stats_t *);
+extern boolean_t mac_tx_srs_ring_present(mac_soft_ring_set_t *, mac_ring_t *);
+extern void mac_tx_srs_add_ring(mac_soft_ring_set_t *, mac_ring_t *);
+extern void mac_tx_srs_del_ring(mac_soft_ring_set_t *, mac_ring_t *);
+extern mac_tx_cookie_t mac_tx_srs_no_desc(mac_soft_ring_set_t *, mblk_t *,
+    uint16_t, mblk_t **);
+
+/* Subflow specific stuff */
+extern int mac_srs_flow_create(struct mac_client_impl_s *, flow_entry_t *,
+    mac_resource_props_t *, int, int, mac_direct_rx_t);
+extern void mac_srs_update_bwlimit(flow_entry_t *, mac_resource_props_t *);
+extern void mac_srs_adjust_subflow_bwlimit(struct mac_client_impl_s *);
+extern void mac_srs_update_drv(struct mac_client_impl_s *);
+extern void mac_update_srs_priority(mac_soft_ring_set_t *,
pri_t); +extern void mac_client_update_classifier(mac_client_impl_t *, boolean_t); + +extern void mac_soft_ring_intr_enable(void *); +extern void mac_soft_ring_intr_disable(void *); +extern mac_soft_ring_t *mac_soft_ring_create(int, clock_t, void *, uint16_t, + pri_t, mac_client_impl_t *, mac_soft_ring_set_t *, + processorid_t, mac_direct_rx_t, void *, mac_resource_handle_t); +extern cpu_t *mac_soft_ring_bind(mac_soft_ring_t *, processorid_t); + extern void mac_soft_ring_unbind(mac_soft_ring_t *); +extern void mac_soft_ring_free(mac_soft_ring_t *, boolean_t); +extern void mac_soft_ring_signal(mac_soft_ring_t *, uint_t); +extern void mac_rx_soft_ring_process(mac_client_impl_t *, mac_soft_ring_t *, + mblk_t *, mblk_t *, int, size_t); +extern mac_tx_cookie_t mac_tx_soft_ring_process(mac_soft_ring_t *, + mblk_t *, uint16_t, mblk_t **); +extern void mac_srs_worker_quiesce(mac_soft_ring_set_t *); +extern void mac_srs_worker_restart(mac_soft_ring_set_t *); +extern void mac_rx_attach_flow_srs(mac_impl_t *, flow_entry_t *, + mac_soft_ring_set_t *, mac_ring_t *, mac_classify_type_t); + +extern void mac_rx_srs_drain_bw(mac_soft_ring_set_t *, uint_t); +extern void mac_rx_srs_drain(mac_soft_ring_set_t *, uint_t); +extern void mac_rx_srs_process(void *, mac_resource_handle_t, mblk_t *, + boolean_t); +extern void mac_srs_worker(mac_soft_ring_set_t *); +extern void mac_rx_srs_poll_ring(mac_soft_ring_set_t *); +extern void mac_tx_srs_drain(mac_soft_ring_set_t *, uint_t); + +extern void mac_tx_srs_restart(mac_soft_ring_set_t *); +extern void mac_rx_srs_remove(mac_soft_ring_set_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MAC_SOFT_RING_H */ diff --git a/usr/src/uts/common/sys/modhash.h b/usr/src/uts/common/sys/modhash.h index 5860ad165a..68d1c4dedd 100644 --- a/usr/src/uts/common/sys/modhash.h +++ b/usr/src/uts/common/sys/modhash.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_MODHASH_H #define _SYS_MODHASH_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Generic hash implementation for the kernel. 
*/ @@ -129,6 +126,8 @@ int mod_hash_destroy(mod_hash_t *, mod_hash_key_t); int mod_hash_find(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *); int mod_hash_find_cb(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *, void (*)(mod_hash_key_t, mod_hash_val_t)); +int mod_hash_find_cb_rval(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *, + int (*)(mod_hash_key_t, mod_hash_val_t), int *); void mod_hash_walk(mod_hash_t *, uint_t (*)(mod_hash_key_t, mod_hash_val_t *, void *), void *); diff --git a/usr/src/uts/common/sys/nxge/nxge.h b/usr/src/uts/common/sys/nxge/nxge.h index 37cd6db405..624e433572 100644 --- a/usr/src/uts/common/sys/nxge/nxge.h +++ b/usr/src/uts/common/sys/nxge/nxge.h @@ -319,6 +319,7 @@ typedef struct _filter_t { uint32_t all_sap_cnt; } filter_t, *p_filter_t; + typedef struct _nxge_port_stats_t { /* * Overall structure size @@ -470,6 +471,8 @@ typedef struct _nxge_stats_t { } nxge_stats_t, *p_nxge_stats_t; + + typedef struct _nxge_intr_t { boolean_t intr_registered; /* interrupts are registered */ boolean_t intr_enabled; /* interrupts are enabled */ @@ -497,7 +500,7 @@ typedef struct _nxge_ldgv_t { p_nxge_ldg_t ldgp; p_nxge_ldv_t ldvp; p_nxge_ldv_t ldvp_syserr; - int ldvp_syserr_allocated; + boolean_t ldvp_syserr_alloced; } nxge_ldgv_t, *p_nxge_ldgv_t; typedef enum { @@ -542,7 +545,8 @@ typedef struct { #define NXGE_DC_SET(map, channel) map |= (1 << channel) #define NXGE_DC_RESET(map, channel) map &= (~(1 << channel)) -#define NXGE_LOGICAL_GROUP_MAX NXGE_MAX_TDCS +/* For now, we only support up to 8 RDC/TDC groups */ +#define NXGE_LOGICAL_GROUP_MAX NXGE_MAX_RDC_GROUPS typedef struct { int sequence; /* To order groups in time. */ @@ -558,6 +562,12 @@ typedef struct { } nxge_grp_set_t; /* + * Transmit Ring Group + * TX groups will be used exclusively for the purpose of Hybrid I/O. From + * the point of view of the nxge driver, the groups will be software + * constructs which will be used to establish the relationship between TX + * rings and shares. + * * Receive Ring Group * One of the advanced virtualization features is the ability to bundle * multiple Receive Rings in a single group. One or more MAC addresses may @@ -567,12 +577,16 @@ typedef struct { * RX ring groups can come with a predefined set of member rings, or they * are programmable by adding and removing rings to/from them. 
*/ -typedef struct _nxge_rx_ring_group_t { +typedef struct _nxge_ring_group_t { mac_group_handle_t ghandle; p_nxge_t nxgep; + boolean_t started; + mac_ring_type_t type; int gindex; int sindex; -} nxge_rx_ring_group_t; + int rdctbl; + int n_mac_addrs; +} nxge_ring_group_t; /* * Ring Handle @@ -581,7 +595,7 @@ typedef struct _nxge_ring_handle_t { p_nxge_t nxgep; int index; /* port-wise */ mac_ring_handle_t ring_handle; -} nxge_ring_handle_t; +} nxge_ring_handle_t, *p_nxge_ring_handle_t; /* * Share Handle @@ -613,9 +627,6 @@ struct _nxge_t { uint64_t nxge_debug_level; /* driver state bit flags */ kmutex_t genlock[1]; enum nxge_mac_state nxge_mac_state; - ddi_softintr_t resched_id; /* reschedule callback */ - boolean_t resched_needed; - boolean_t resched_running; p_dev_regs_t dev_regs; npi_handle_t npi_handle; @@ -695,17 +706,12 @@ struct _nxge_t { p_rx_rcr_rings_t rx_rcr_rings; p_rx_mbox_areas_t rx_mbox_areas_p; - uint32_t start_rdc; - uint32_t max_rdcs; uint32_t rdc_mask; /* Transmit descriptors rings */ p_tx_rings_t tx_rings; p_tx_mbox_areas_t tx_mbox_areas_p; - uint32_t start_tdc; - uint32_t max_tdcs; - ddi_dma_handle_t dmasparehandle; ulong_t sys_page_sz; @@ -777,7 +783,15 @@ struct _nxge_t { nxge_grp_set_t tx_set; boolean_t tdc_is_shared[NXGE_MAX_TDCS]; - nxge_rx_ring_group_t rx_hio_groups[NXGE_MAX_RDC_GROUPS]; + boolean_t rx_channel_started[NXGE_MAX_RDCS]; + + /* Ring Handles */ + nxge_ring_handle_t tx_ring_handles[NXGE_MAX_TDCS]; + nxge_ring_handle_t rx_ring_handles[NXGE_MAX_RDCS]; + + nxge_ring_group_t tx_hio_groups[NXGE_MAX_TDC_GROUPS]; + nxge_ring_group_t rx_hio_groups[NXGE_MAX_RDC_GROUPS]; + nxge_share_handle_t shares[NXGE_MAX_VRS]; }; diff --git a/usr/src/uts/common/sys/nxge/nxge_common.h b/usr/src/uts/common/sys/nxge/nxge_common.h index f2bbc8e064..7956b5f653 100644 --- a/usr/src/uts/common/sys/nxge/nxge_common.h +++ b/usr/src/uts/common/sys/nxge/nxge_common.h @@ -277,15 +277,24 @@ typedef struct nxge_tdc_cfg { #define RDC_TABLE_ENTRY_METHOD_SEQ 0 #define RDC_TABLE_ENTRY_METHOD_REP 1 +/* per transmit DMA channel table group data structure */ +typedef struct nxge_tdc_grp { + uint32_t start_tdc; /* assume assigned in sequence */ + uint8_t max_tdcs; + dc_map_t map; + uint8_t grp_index; /* nxge_t.tx_set.group[grp_index] */ +} nxge_tdc_grp_t, *p_nxge_tdc_grp_t; + /* per receive DMA channel table group data structure */ typedef struct nxge_rdc_grp { - uint32_t flag; /* 0: not configured 1: configured */ + boolean_t flag; /* 0: not configured 1: configured */ uint8_t port; - uint8_t start_rdc; /* assume assigned in sequence */ + uint32_t start_rdc; /* assume assigned in sequence */ uint8_t max_rdcs; uint8_t def_rdc; dc_map_t map; uint16_t config_method; + uint8_t grp_index; /* nxge_t.rx_set.group[grp_index] */ } nxge_rdc_grp_t, *p_nxge_rdc_grp_t; #define RDC_MAP_IN(map, rdc) \ @@ -383,7 +392,6 @@ typedef struct nxge_hw_pt_cfg { uint32_t ser_ldvid; uint32_t def_rdc; /* default RDC */ uint32_t drr_wt; /* port DRR weight */ - uint32_t start_grpid; /* starting group ID */ uint32_t max_grpids; /* max group ID */ uint32_t grpids[NXGE_MAX_RDCS]; /* RDC group IDs */ uint32_t max_rdc_grpids; /* max RDC group ID */ @@ -393,6 +401,7 @@ typedef struct nxge_hw_pt_cfg { uint32_t start_mac_entry; /* where to put the first mac */ uint32_t max_macs; /* the max mac entry allowed */ uint32_t mac_pref; /* preference over VLAN */ + uint32_t def_mac_txdma_grpid; /* default TDC group ID */ uint32_t def_mac_rxdma_grpid; /* default RDC group ID */ uint32_t vlan_pref; /* preference over MAC */ @@ -417,6 +426,9 
@@ typedef struct nxge_dma_pt_cfg { */ uint32_t tx_dma_map; /* Transmit DMA channel bit map */ + /* Transmit DMA channel: device wise */ + nxge_tdc_grp_t tdc_grps[NXGE_MAX_TDC_GROUPS]; + /* Receive DMA channel */ nxge_rdc_grp_t rdc_grps[NXGE_MAX_RDC_GROUPS]; diff --git a/usr/src/uts/common/sys/nxge/nxge_defs.h b/usr/src/uts/common/sys/nxge/nxge_defs.h index db061381da..8f8e226b32 100644 --- a/usr/src/uts/common/sys/nxge/nxge_defs.h +++ b/usr/src/uts/common/sys/nxge/nxge_defs.h @@ -278,6 +278,12 @@ extern "C" { */ #define NXGE_MAX_VRS 8 +/* + * TDC groups are used exclusively for the purpose of Hybrid I/O + * TX needs one group for each VR + */ +#define NXGE_MAX_TDC_GROUPS (NXGE_MAX_VRS) + /* Max. RDC table groups */ #define NXGE_MAX_RDC_GROUPS 8 #define NXGE_MAX_RDCS 16 diff --git a/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h b/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h index fc99701ca3..d7270a6fb1 100644 --- a/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h +++ b/usr/src/uts/common/sys/nxge/nxge_fflp_hw.h @@ -18,7 +18,6 @@ * * CDDL HEADER END */ - /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -33,6 +32,7 @@ extern "C" { #include <nxge_defs.h> + /* FZC_FFLP Offsets */ #define FFLP_ENET_VLAN_TBL_REG (FZC_FFLP + 0x00000) @@ -1284,6 +1284,7 @@ typedef struct tcam_entry { * before this header file. * Need to move these includes to impl files ... */ + #include <netinet/in.h> typedef union flow_template { diff --git a/usr/src/uts/common/sys/nxge/nxge_flow.h b/usr/src/uts/common/sys/nxge/nxge_flow.h index 352834d796..c76f2731a1 100644 --- a/usr/src/uts/common/sys/nxge/nxge_flow.h +++ b/usr/src/uts/common/sys/nxge/nxge_flow.h @@ -18,7 +18,6 @@ * * CDDL HEADER END */ - /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. diff --git a/usr/src/uts/common/sys/nxge/nxge_hio.h b/usr/src/uts/common/sys/nxge/nxge_hio.h index 2a25341111..10487202b6 100644 --- a/usr/src/uts/common/sys/nxge/nxge_hio.h +++ b/usr/src/uts/common/sys/nxge/nxge_hio.h @@ -34,7 +34,7 @@ extern "C" { #include <nxge_mac.h> #include <nxge_ipp.h> #include <nxge_fflp.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #if defined(sun4v) #include <sys/vnet_res.h> #endif @@ -249,9 +249,10 @@ typedef struct nxge_hio_vr { size_t size; vr_region_t region; /* 1 of 8 regions. */ - uint8_t rdc_tbl; /* 1 of 8 RDC tables. */ + int rdc_tbl; /* 1 of 8 RDC tables. */ + int tdc_tbl; /* 1 of 8 TDC tables. */ ether_addr_t altmac; /* The alternate MAC address. */ - mac_addr_slot_t slot; /* According to nxge_m_mmac_add(). */ + int slot; /* According to nxge_m_mmac_add(). */ #if defined(sun4v) vio_net_handle_t vhp; /* The handle given to us by the vnet. 
*/ @@ -369,12 +370,18 @@ extern const char *nxge_ddi_perror(int); */ extern void nxge_hio_group_get(void *arg, mac_ring_type_t type, int group, mac_group_info_t *infop, mac_group_handle_t ghdl); -extern int nxge_hio_share_alloc(void *arg, uint64_t cookie, uint64_t *rcookie, - mac_share_handle_t *shandle); +extern int nxge_hio_share_alloc(void *arg, mac_share_handle_t *shandle); extern void nxge_hio_share_free(mac_share_handle_t shandle); extern void nxge_hio_share_query(mac_share_handle_t shandle, - mac_ring_type_t type, uint32_t *rmin, uint32_t *rmax, uint64_t *rmap, - uint64_t *gnum); + mac_ring_type_t type, mac_ring_handle_t *rings, uint_t *n_rings); +extern int nxge_hio_share_add_group(mac_share_handle_t, + mac_group_driver_t); +extern int nxge_hio_share_rem_group(mac_share_handle_t, + mac_group_driver_t); +extern int nxge_hio_share_bind(mac_share_handle_t, uint64_t cookie, + uint64_t *rcookie); +extern void nxge_hio_share_unbind(mac_share_handle_t); + /* nxge_hio_guest.c */ extern void nxge_hio_unregister(nxge_t *); @@ -416,12 +423,6 @@ extern int nxge_hio_hostinfo_get_rdc_table(p_nxge_t); extern int nxge_hio_hostinfo_init(nxge_t *, nxge_hio_vr_t *, ether_addr_t *); extern void nxge_hio_hostinfo_uninit(nxge_t *, nxge_hio_vr_t *); - /* nxge_rxdma.c */ -extern nxge_status_t nxge_rx_poll(nxge_t *, int); - - /* nxge_txdma.c */ -extern uint_t nxge_tx_poll(nxge_t *, int); - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/nxge/nxge_impl.h b/usr/src/uts/common/sys/nxge/nxge_impl.h index 5420ac00bb..63779b4e88 100644 --- a/usr/src/uts/common/sys/nxge/nxge_impl.h +++ b/usr/src/uts/common/sys/nxge/nxge_impl.h @@ -36,6 +36,8 @@ extern "C" { #define NIU_MAJOR_VER 1 #define NIU_MINOR_VER 1 +#if defined(sun4v) + /* * NIU HV API v1.0 definitions */ @@ -44,6 +46,8 @@ extern "C" { #define N2NIU_TX_LP_CONF 0x144 #define N2NIU_TX_LP_INFO 0x145 +#endif /* defined(sun4v) */ + #ifndef _ASM #include <sys/types.h> @@ -81,8 +85,7 @@ extern "C" { #include <sys/netlb.h> #include <sys/ddi_intr.h> -#include <sys/mac.h> -#include <sys/mac_impl.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #if defined(sun4v) @@ -611,7 +614,6 @@ struct _nxge_ldg_t { uint8_t ldg; /* logical group number */ uint8_t vldg_index; boolean_t arm; - boolean_t interrupted; uint16_t ldg_timer; /* counter */ uint8_t func; uint8_t vector; @@ -749,6 +751,13 @@ typedef struct _nxge_mmac_stats_t { struct ether_addr mmac_avail_pool[16]; } nxge_mmac_stats_t, *p_nxge_mmac_stats_t; +/* + * Copied from mac.h. Should be cleaned up by driver. + */ +#define MMAC_SLOT_USED 0x1 /* address slot used */ +#define MMAC_VENDOR_ADDR 0x2 /* address returned is vendor supplied */ + + #define NXGE_MAX_MMAC_ADDRS 32 #define NXGE_NUM_MMAC_ADDRS 8 #define NXGE_NUM_OF_PORTS_QUAD 4 @@ -885,6 +894,8 @@ void nxge_hw_set_mac_modes(p_nxge_t); /* nxge_send.c. 
*/ uint_t nxge_reschedule(caddr_t); +mblk_t *nxge_tx_ring_send(void *, mblk_t *); +int nxge_start(p_nxge_t, p_tx_ring_t, p_mblk_t); /* nxge_rxdma.c */ nxge_status_t nxge_rxdma_cfg_rdcgrp_default_rdc(p_nxge_t, @@ -1050,6 +1061,8 @@ int nxge_get_nports(p_nxge_t); void nxge_free_buf(buf_alloc_type_t, uint64_t, uint32_t); +#if defined(sun4v) + uint64_t hv_niu_rx_logical_page_conf(uint64_t, uint64_t, uint64_t, uint64_t); #pragma weak hv_niu_rx_logical_page_conf @@ -1131,6 +1144,8 @@ uint64_t hv_niu_vrtx_to_logical_dev(uint32_t cookie, uint64_t v_chidx, uint64_t *ldn); #pragma weak hv_niu_vrtx_to_logical_dev +#endif /* defined(sun4v) */ + #ifdef NXGE_DEBUG char *nxge_dump_packet(char *, int); #endif diff --git a/usr/src/uts/common/sys/nxge/nxge_rxdma.h b/usr/src/uts/common/sys/nxge/nxge_rxdma.h index 43a7185148..a336dbb9cb 100644 --- a/usr/src/uts/common/sys/nxge/nxge_rxdma.h +++ b/usr/src/uts/common/sys/nxge/nxge_rxdma.h @@ -155,6 +155,13 @@ typedef struct _nxge_rdc_sys_stats { uint32_t zcp_eop_err; } nxge_rdc_sys_stats_t, *p_nxge_rdc_sys_stats_t; +/* + * Software reserved buffer offset + */ +typedef struct _nxge_rxbuf_off_hdr_t { + uint32_t index; +} nxge_rxbuf_off_hdr_t, *p_nxge_rxbuf_off_hdr_t; + typedef struct _rx_msg_t { nxge_os_dma_common_t buf_dma; @@ -231,8 +238,11 @@ typedef struct _rx_rcr_ring_t { uint32_t intr_timeout; uint32_t intr_threshold; uint64_t max_receive_pkts; - mac_resource_handle_t rcr_mac_handle; + mac_ring_handle_t rcr_mac_handle; + uint64_t rcr_gen_num; uint32_t rcvd_pkt_bytes; /* Received bytes of a packet */ + p_nxge_ldv_t ldvp; + p_nxge_ldg_t ldgp; } rx_rcr_ring_t, *p_rx_rcr_ring_t; @@ -359,11 +369,13 @@ typedef struct _rx_mbox_t { typedef struct _rx_rbr_rings_t { p_rx_rbr_ring_t *rbr_rings; uint32_t ndmas; + boolean_t rxbuf_allocated; } rx_rbr_rings_t, *p_rx_rbr_rings_t; typedef struct _rx_rcr_rings_t { p_rx_rcr_ring_t *rcr_rings; uint32_t ndmas; + boolean_t cntl_buf_allocated; } rx_rcr_rings_t, *p_rx_rcr_rings_t; typedef struct _rx_mbox_areas_t { @@ -414,6 +426,10 @@ void nxge_rxdma_fix_channel(p_nxge_t, uint16_t); void nxge_rxdma_fixup_channel(p_nxge_t, uint16_t, int); int nxge_rxdma_get_ring_index(p_nxge_t, uint16_t); +mblk_t *nxge_rx_poll(void *, int); +int nxge_enable_poll(void *); +int nxge_disable_poll(void *); + void nxge_rxdma_regs_dump_channels(p_nxge_t); nxge_status_t nxge_rxdma_handle_sys_errors(p_nxge_t); void nxge_rxdma_inject_err(p_nxge_t, uint32_t, uint8_t); @@ -422,6 +438,8 @@ extern nxge_status_t nxge_alloc_rx_mem_pool(p_nxge_t); extern nxge_status_t nxge_alloc_rxb(p_nxge_t nxgep, int channel); extern void nxge_free_rxb(p_nxge_t nxgep, int channel); +int nxge_get_rxring_index(p_nxge_t, int, int); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/nxge/nxge_serialize.h b/usr/src/uts/common/sys/nxge/nxge_serialize.h deleted file mode 100644 index f235de7b2e..0000000000 --- a/usr/src/uts/common/sys/nxge/nxge_serialize.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_NXGE_NXGE_SERIALIZE_H -#define _SYS_NXGE_NXGE_SERIALIZE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#define NXGE_TX_AVG_CNT 200000000 -#define NXGE_TX_AVG_RES 2000 /* sleep at least a tick */ -#define MAXHRS 3 /* # of packets to process */ -#define ONESEC 1000000000 /* one second */ - -#include <sys/stream.h> -#include <sys/mutex.h> -#include <sys/condvar.h> -#include <sys/kmem.h> -#include <sys/ddi.h> -#include <sys/callb.h> - -/* - * Thread state flags - */ -#define NXGE_TX_STHREAD_RUNNING 0x0001 /* thread started */ -#define NXGE_TX_STHREAD_DESTROY 0x0002 /* thread is being destroyed */ -#define NXGE_TX_STHREAD_EXIT 0x0003 /* thread exits */ - -typedef int (onetrack_t)(mblk_t *, void *); - -typedef struct { - kmutex_t lock; - int count; - mblk_t *head; - mblk_t *tail; - void *cookie; - onetrack_t *serialop; - int owned; - /* Counter tracks the total time spent in serializer function */ - hrtime_t totaltime; - /* - * Counter tracks the total number of time the serializer - * function was called. - */ - long totalcount; - /* - * Counter maintains the average time spent in the serializer function - * and is derived as (totaltime/totalcount). - */ - int avg; - /* - * The lenght of the queue to which the serializer function - * will append data. - */ - int length; - kcondvar_t serial_cv; - kcondvar_t timecv; - kmutex_t serial; - uint32_t s_state; - boolean_t s_need_signal; - callb_cpr_t s_cprinfo; - kthread_t *tx_sthread; - kmutex_t timelock; -} nxge_serialize_t; - -/* - * Prototypes definitions - */ -nxge_serialize_t *nxge_serialize_create(int, onetrack_t *, void *); -void nxge_serialize_destroy(nxge_serialize_t *); -void nxge_serialize_enter(nxge_serialize_t *, mblk_t *); - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_NXGE_NXGE_SERIALIZE_H */ diff --git a/usr/src/uts/common/sys/nxge/nxge_txdma.h b/usr/src/uts/common/sys/nxge/nxge_txdma.h index 859f6a124e..829d67ebce 100644 --- a/usr/src/uts/common/sys/nxge/nxge_txdma.h +++ b/usr/src/uts/common/sys/nxge/nxge_txdma.h @@ -18,7 +18,6 @@ * * CDDL HEADER END */ - /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
@@ -31,9 +30,9 @@ extern "C" { #endif +#include <sys/taskq.h> #include <sys/nxge/nxge_txdma_hw.h> #include <npi_txdma.h> -#include <sys/nxge/nxge_serialize.h> #define TXDMA_PORT_BITMAP(nxgep) (nxgep->pt_config.tx_dma_map) @@ -152,14 +151,13 @@ typedef struct _tx_ring_t { uint32_t tx_ring_offline; boolean_t tx_ring_busy; - p_tx_msg_t tx_free_list_p; - nxge_os_mutex_t freelock; - nxge_os_mutex_t lock; + mac_ring_handle_t tx_ring_handle; + ddi_taskq_t *taskq; uint16_t index; uint16_t tdc; struct nxge_tdc_cfg *tdc_p; - uint_t tx_ring_size; + int tx_ring_size; uint32_t num_chunks; uint_t tx_wrap_mask; @@ -170,11 +168,10 @@ typedef struct _tx_ring_t { tx_ring_kick_t ring_kick_tail; txdma_mailbox_t tx_mbox; - uint_t descs_pending; + int descs_pending; boolean_t queueing; nxge_os_mutex_t sq_lock; - nxge_serialize_t *serial; p_mblk_t head; p_mblk_t tail; diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index 2591642dc0..8d93c7780e 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -161,6 +161,7 @@ void secpolicy_fs_mount_clearopts(cred_t *, struct vfs *); int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *, const vattr_t *, cred_t *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); +int secpolicy_dld_ioctl(const cred_t *, const char *, const char *); int secpolicy_xvm_control(const cred_t *); int secpolicy_basic_exec(const cred_t *, vnode_t *); diff --git a/usr/src/uts/common/sys/softmac_impl.h b/usr/src/uts/common/sys/softmac_impl.h index 3fcfc97415..5f9d1401a7 100644 --- a/usr/src/uts/common/sys/softmac_impl.h +++ b/usr/src/uts/common/sys/softmac_impl.h @@ -26,8 +26,6 @@ #ifndef _SYS_SOFTMAC_IMPL_H #define _SYS_SOFTMAC_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/ethernet.h> #include <sys/taskq.h> @@ -37,6 +35,9 @@ #include <sys/stream.h> #include <sys/dlpi.h> #include <sys/mac.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> #include <sys/mac_ether.h> #ifdef __cplusplus @@ -68,14 +69,20 @@ typedef struct softmac_lower_s { boolean_t sl_pending_ioctl; mblk_t *sl_ack_mp; - mac_resource_handle_t sl_handle; ldi_handle_t sl_lh; } softmac_lower_t; -enum softmac_state { +typedef enum { SOFTMAC_INITIALIZED, SOFTMAC_READY -}; +} softmac_lower_state_t; + +typedef enum { + SOFTMAC_UNINIT, + SOFTMAC_ATTACH_INPROG, + SOFTMAC_ATTACH_DONE, + SOFTMAC_DETACH_INPROG, +} softmac_state_t; typedef struct softmac_dev_s { dev_t sd_dev; @@ -86,8 +93,12 @@ typedef struct softmac_dev_s { */ #define SOFTMAC_GLDV3 0x01 #define SOFTMAC_NOSUPP 0x02 -#define SOFTMAC_ATTACH_DONE 0x04 -#define SOFTMAC_NEED_RECREATE 0x08 +#define SOFTMAC_NEED_RECREATE 0x04 +#define SOFTMAC_NOTIFY_QUIT 0x08 + +#define SMAC_NONZERO_NODECNT(softmac) \ + ((softmac->smac_softmac[0] != NULL) + \ + (softmac->smac_softmac[1] != NULL)) /* * The softmac structure allows all minor nodes (at most two, style-1 and @@ -111,18 +122,14 @@ typedef struct softmac { uint32_t smac_cnt; /* # of minor nodes for this device */ /* - * The following fields are protected by softmac_hash_lock. - */ - /* + * The following fields are protected by smac_mutex. + * * The smac_hold_cnt field increases when softmac_hold_device() is * called to force the dls_vlan_t of the device to be created. The * device pre-detach fails if this counter is not 0. */ + softmac_state_t smac_state; uint32_t smac_hold_cnt; - - /* - * The following fields are protected by smac_lock. 
-	 */
 	kmutex_t	smac_mutex;
 	kcondvar_t	smac_cv;
 	uint32_t	smac_flags;
@@ -145,6 +152,16 @@ typedef struct softmac {
 	uint32_t	smac_attached_left;
 
 	/*
+	 * Thread that handles the DL_NOTIFY_IND messages from the lower stream.
+	 */
+	kthread_t	*smac_notify_thread;
+	/*
+	 * Head and tail of the DL_NOTIFY_IND messages.
+	 */
+	mblk_t		*smac_notify_head;
+	mblk_t		*smac_notify_tail;
+
+	/*
 	 * The remaining fields are used to register the MAC for a legacy
 	 * device. They are set in softmac_mac_register() and do not change.
 	 * One can access them when mac_register() is done without locks.
@@ -177,11 +194,8 @@ typedef struct softmac {
 	dl_capab_mdt_t	smac_mdt_capab;
 	boolean_t	smac_mdt;
 
-	/*
-	 * The following fields are protected by smac_lock
-	 */
-	krwlock_t	smac_lock;
-	enum softmac_state	smac_state;
+	/* Following fields protected by the mac perimeter */
+	softmac_lower_state_t	smac_lower_state;
 	/* Lower stream structure */
 	softmac_lower_t	*smac_lower;
 } softmac_t;
@@ -193,9 +207,6 @@ typedef struct smac_ioc_start_s {
 #define	SMAC_IOC	('S' << 24 | 'M' << 16 | 'C' << 8)
 #define	SMAC_IOC_START	(SMAC_IOC | 0x01)
 
-#define	SOFTMAC_BLANK_TICKS	128
-#define	SOFTMAC_BLANK_PKT_COUNT	8
-
 extern dev_info_t	*softmac_dip;
 #define	SOFTMAC_DEV_NAME	"softmac"
@@ -217,9 +228,9 @@ extern int softmac_m_unicst(void *, const uint8_t *);
 extern void softmac_m_ioctl(void *, queue_t *, mblk_t *);
 extern int softmac_m_stat(void *, uint_t, uint64_t *);
 extern mblk_t *softmac_m_tx(void *, mblk_t *);
-extern void softmac_m_resources(void *);
 extern int softmac_proto_tx(softmac_lower_t *, mblk_t *, mblk_t **);
 extern void softmac_ioctl_tx(softmac_lower_t *, mblk_t *, mblk_t **);
+extern void softmac_notify_thread(void *);
 
 #ifdef __cplusplus
 }
diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h
index 64e52ba808..ec09b3a88b 100644
--- a/usr/src/uts/common/sys/squeue.h
+++ b/usr/src/uts/common/sys/squeue.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
 *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_SQUEUE_H
 #define	_SYS_SQUEUE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -47,7 +44,30 @@ typedef struct squeue_s squeue_t;
 	(mp)->b_prev = (mblk_t *)(arg);		\
 }
 
-#define	GET_SQUEUE(mp) ((conn_t *)((mp)->b_prev))->conn_sqp
+#define	GET_SQUEUE(mp) ((conn_t *)((mp)->b_prev))->conn_sqp
+
+#define	SQ_FILL		0x0001
+#define	SQ_NODRAIN	0x0002
+#define	SQ_PROCESS	0x0004
+
+#define	SQUEUE_ENTER(sqp, head, tail, cnt, flag, tag) {		\
+	sqp->sq_enter(sqp, head, tail, cnt, flag, tag);		\
+}
+
+#define	SQUEUE_ENTER_ONE(sqp, mp, proc, arg, flag, tag) {	\
+	ASSERT(mp->b_next == NULL);				\
+	ASSERT(mp->b_prev == NULL);				\
+	SET_SQUEUE(mp, proc, arg);				\
+	SQUEUE_ENTER(sqp, mp, mp, 1, flag, tag);		\
+}
+
+/*
+ * May be called only by a thread executing in the squeue. The thread must
+ * not continue to execute any code needing squeue protection after calling
+ * this macro. Please see the comments in squeue.c for more details.
+ */
+#define	SQUEUE_SWITCH(connp, new_sqp)				\
+	(connp)->conn_sqp = new_sqp;
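
One plausible reading of the SQ_FILL/SQ_NODRAIN/SQ_PROCESS flags consumed by SQUEUE_ENTER above: SQ_FILL only queues the chain, SQ_NODRAIN queues it and leaves draining to the worker, and SQ_PROCESS lets the caller drain inline. The authoritative semantics live in squeue.c; the sketch below is a user-space model with invented names.

#include <stdio.h>

#define	SQ_FILL		0x0001
#define	SQ_NODRAIN	0x0002
#define	SQ_PROCESS	0x0004

static int sq_count;		/* queued packets (models sq_count) */

static void
squeue_enter_model(int npkts, int flag)
{
	sq_count += npkts;	/* append the chain to sq_first/sq_last */

	if (flag & SQ_PROCESS) {
		/* the caller becomes the drainer */
		printf("drain %d packet(s) inline\n", sq_count);
		sq_count = 0;
	} else if (flag & SQ_NODRAIN) {
		printf("queued, worker will drain %d later\n", sq_count);
	} else {
		printf("queued only (%d pending)\n", sq_count);
	}
}

int
main(void)
{
	squeue_enter_model(1, SQ_FILL);
	squeue_enter_model(2, SQ_NODRAIN);
	squeue_enter_model(1, SQ_PROCESS);
	return (0);
}
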
 
 /*
  * Facility-special private data in squeues.
@@ -57,26 +77,13 @@ typedef enum {
 	SQPRIVATE_MAX
 } sqprivate_t;
 
-typedef void (*sqproc_t)(void *, mblk_t *, void *);
-
 extern void squeue_init(void);
-extern squeue_t *squeue_create(char *, processorid_t, clock_t, pri_t);
+extern squeue_t *squeue_create(clock_t, pri_t);
 extern void squeue_bind(squeue_t *, processorid_t);
 extern void squeue_unbind(squeue_t *);
-extern void squeue_enter_chain(squeue_t *, mblk_t *, mblk_t *,
-    uint32_t, uint8_t);
-extern void squeue_enter(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t);
-extern void squeue_enter_nodrain(squeue_t *, mblk_t *, sqproc_t, void *,
-    uint8_t);
-extern void squeue_fill(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t);
+extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *,
+    uint32_t, int, uint8_t);
 extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t);
-extern processorid_t squeue_binding(squeue_t *);
-
-extern void squeue_profile_reset(squeue_t *);
-extern void squeue_profile_enable(squeue_t *);
-extern void squeue_profile_disable(squeue_t *);
-extern void squeue_profile_stop(void);
-extern void squeue_profile_start(void);
 
 #ifdef __cplusplus
 }
diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h
index 54870c067c..501377e53f 100644
--- a/usr/src/uts/common/sys/squeue_impl.h
+++ b/usr/src/uts/common/sys/squeue_impl.h
@@ -19,20 +19,21 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_SQUEUE_IMPL_H
 #define	_SYS_SQUEUE_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#include <sys/disp.h>
+#include <sys/types.h>
 #include <sys/squeue.h>
+#include <inet/ip.h>
 
 #define	SQ_NAMELEN 31
 
@@ -55,6 +56,8 @@ extern "C" {
 #define	SQUEUE_PROFILE 0
 #endif
 
+#define	SQUEUE_DEFAULT_PRIORITY	MAXCLSYSPRI
+
 typedef struct sqstat_s {
 	uint_t		sq_max_qlen;
 	uint_t		sq_npackets_worker;
@@ -70,60 +73,102 @@ typedef struct sqstat_s {
 	hrtime_t	sq_time_other;
 } sqstat_t;
 
+typedef struct squeue_set_s {
+	squeue_t	*sqs_head;
+	squeue_t	*sqs_default;
+	processorid_t	sqs_cpuid;
+} squeue_set_t;
+
+typedef void (*sqproc_t)(void *, mblk_t *, void *);
+typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t,
+    int, uint8_t);
+typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t);
+
+extern void squeue_worker_wakeup(squeue_t *);
+extern int ip_squeue_flag;
+
 struct squeue_s {
-	/* Keep the most used members 64bytes cache aligned */
+	sq_enter_proc_t	sq_enter;	/* sq_process function */
+	sq_drain_proc_t	sq_drain;	/* sq_drain function */
 	kmutex_t	sq_lock;	/* lock before using any member */
 	uint32_t	sq_state;	/* state flags and message count */
 	int		sq_count;	/* # of mblocks in squeue */
 	mblk_t		*sq_first;	/* first mblk chain or NULL */
 	mblk_t		*sq_last;	/* last mblk chain or NULL */
-	clock_t		sq_awaken;	/* time async thread was awakened */
 	kthread_t	*sq_run;	/* Current thread processing sq */
-	void		*sq_rx_ring;
-	clock_t		sq_avg_drain_time; /* Avg time to drain a pkt */
+	ill_rx_ring_t	*sq_rx_ring;	/* The Rx ring tied to this sq */
+	ill_t		*sq_ill;	/* The ill this squeue is tied to */
 
-	processorid_t	sq_bind;	/* processor to bind to */
-	kcondvar_t	sq_async;	/* async thread blocks on */
+	clock_t		sq_curr_time;	/* Current tick (lbolt) */
+	kcondvar_t
sq_worker_cv; /* cond var. worker thread blocks on */ + kcondvar_t sq_poll_cv; /* cond variable poll_thr waits on */ + kcondvar_t sq_ctrlop_done_cv; /* cond variable for ctrl ops */ clock_t sq_wait; /* lbolts to wait after a fill() */ - uintptr_t sq_private[SQPRIVATE_MAX]; timeout_id_t sq_tid; /* timer id of pending timeout() */ + clock_t sq_awaken; /* time async thread was awakened */ + + processorid_t sq_bind; /* processor to bind to */ kthread_t *sq_worker; /* kernel thread id */ - char sq_name[SQ_NAMELEN + 1]; + kthread_t *sq_poll_thr; /* polling thread */ + uintptr_t sq_private[SQPRIVATE_MAX]; + + squeue_t *sq_next; /* managed by squeue creator */ + squeue_set_t *sq_set; /* managed by squeue creator */ -#if SQUEUE_DEBUG - /* Debug-only fields */ + pri_t sq_priority; /* squeue thread priority */ + + /* Keep the debug-only fields at the end of the structure */ +#ifdef DEBUG int sq_isintr; /* serviced by interrupt */ mblk_t *sq_curmp; void (*sq_curproc)(); conn_t *sq_connp; uchar_t sq_tag; #endif - -#if SQUEUE_PROFILE - /* Profiling fields */ - kstat_t *sq_kstat; /* exported statistics */ - sqstat_t sq_stats; -#endif }; /* * State flags. * Note: The MDB IP module depends on the values of these flags. */ -#define SQS_PROC 0x0001 /* being processed */ -#define SQS_WORKER 0x0002 /* worker thread */ -#define SQS_ENTER 0x0004 /* enter thread */ -#define SQS_FAST 0x0008 /* enter-fast thread */ -#define SQS_USER 0x0010 /* A non interrupt user */ -#define SQS_BOUND 0x0020 /* Worker thread is bound */ -#define SQS_PROFILE 0x0040 /* Enable profiling */ -#define SQS_REENTER 0x0080 /* Re entered thread */ -#define SQS_TMO_PROG 0x0100 /* Timeout is being set */ -#define SQS_POLL_CAPAB 0x0200 /* Squeue can control interrupts */ -#define SQS_NO_INTR 0x0400 /* Interrupts currently disabled */ -#define SQS_ILL_BOUND 0x0800 /* Squeue bound to an ill */ -#define SQS_GET_PKTS 0x1000 /* Moving pkts from NIC in progress */ -#define SQS_DEFAULT 0x2000 /* The default squeue for the CPU */ +#define SQS_PROC 0x00000001 /* being processed */ +#define SQS_WORKER 0x00000002 /* worker thread */ +#define SQS_ENTER 0x00000004 /* enter thread */ +#define SQS_FAST 0x00000008 /* enter-fast thread */ + +#define SQS_USER 0x00000010 /* A non interrupt user */ +#define SQS_BOUND 0x00000020 /* Worker thread is bound */ +#define SQS_REENTER 0x00000040 /* Re entered thread */ +#define SQS_TMO_PROG 0x00000080 /* Timeout is being set */ + +#define SQS_POLL_CAPAB 0x00000100 /* Squeue can control interrupts */ +#define SQS_ILL_BOUND 0x00000200 /* Squeue bound to an ill */ +#define SQS_GET_PKTS 0x00000400 /* Moving pkts from NIC in progress */ +#define SQS_DEFAULT 0x00000800 /* The default squeue for the CPU */ + +#define SQS_POLLING 0x00001000 /* Squeue in polling mode */ +#define SQS_INTR_BLANK 0x00002000 /* Interrupt blanking capability */ +#define SQS_PROC_HELD 0x00004000 /* SQS_PROC is held by the caller */ +#define SQS_FORCE_TIMER 0x00008000 /* Schedule worker due to B/W control */ + +#define SQS_POLL_CLEANUP 0x00010000 +#define SQS_POLL_CLEANUP_DONE 0x00020000 +#define SQS_POLL_QUIESCE 0x00040000 +#define SQS_POLL_QUIESCE_DONE 0x00080000 + +#define SQS_POLL_RESTART 0x00100000 +#define SQS_POLL_THR_QUIESCED 0x00200000 +#define SQS_POLL_THR_RESTART 0x00400000 +#define SQS_POLL_PROC 0x00800000 /* Poll thread processing the sq */ + +#define SQS_POLL_RESTART_DONE 0x01000000 +#define SQS_POLL_THR_QUIESCE 0x02000000 + +#define SQS_WORKER_THR_CONTROL \ + (SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP) + +#define 
SQS_POLL_THR_CONTROL \ + (SQS_POLL_THR_QUIESCE | SQS_POLL_THR_RESTART) #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index 6436c5a0cc..41097cab7f 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -30,8 +30,6 @@ #ifndef _SYS_STREAM_H #define _SYS_STREAM_H -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 11.44 */ - /* * For source compatibility */ @@ -414,6 +412,7 @@ typedef struct bcache { #define STRUIO_ZCNOTIFY 0x10 /* notify stream head when mblk acked */ #define STRUIO_EAGER 0x20 /* new eager; db_cksumstart has squeue to use */ #define STRUIO_POLICY 0x40 /* new eager when IPsec is enabled */ +#define STRUIO_CONNECT 0x80 /* conn did a connect */ /* * Message flags. These are interpreted by the stream head. diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index 401e69dc5e..04c778feaa 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -30,8 +30,6 @@ #ifndef _SYS_STRSUBR_H #define _SYS_STRSUBR_H -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.17 */ - /* * WARNING: * Everything in this file is private, belonging to the @@ -1238,6 +1236,8 @@ extern int hcksum_assoc(mblk_t *, struct multidata_s *, struct pdesc_s *, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, int); extern void hcksum_retrieve(mblk_t *, struct multidata_s *, struct pdesc_s *, uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); +extern void lso_info_set(mblk_t *, uint32_t, uint32_t); +extern void lso_info_get(mblk_t *, uint32_t *, uint32_t *); extern unsigned int bcksum(uchar_t *, int, unsigned int); extern boolean_t is_vmloaned_mblk(mblk_t *, struct multidata_s *, struct pdesc_s *); diff --git a/usr/src/uts/common/sys/vlan.h b/usr/src/uts/common/sys/vlan.h index 2a4e4c8ef0..11c7d41e83 100644 --- a/usr/src/uts/common/sys/vlan.h +++ b/usr/src/uts/common/sys/vlan.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,14 +30,14 @@ #ifndef _SYS_VLAN_H #define _SYS_VLAN_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif #define VLAN_TAGSZ 4 +#define VLAN_TPID 0x8100u + #define VLAN_ID_MASK 0x0fffu #define VLAN_ID_SIZE 12 #define VLAN_ID_SHIFT 0 diff --git a/usr/src/uts/common/sys/vnic.h b/usr/src/uts/common/sys/vnic.h index d17da6bf44..37f962e2ff 100644 --- a/usr/src/uts/common/sys/vnic.h +++ b/usr/src/uts/common/sys/vnic.h @@ -30,35 +30,101 @@ #include <sys/ethernet.h> #include <sys/param.h> #include <sys/mac.h> +#include <sys/mac_flow.h> #include <sys/dld_ioc.h> +#include <inet/ip.h> +#include <inet/ip6.h> #ifdef __cplusplus extern "C" { #endif /* - * Note that the datastructures defined here define an ioctl interface - * that is shared betwen user and kernel space. The vnic driver thus - * assumes that the structures have identical layout and size when - * compiled in either IPL32 or LP64. 
+ * Extended diagnostic codes that can be returned by the various
+ * VNIC ioctls.
 */
+typedef enum {
+	VNIC_IOC_DIAG_NONE,
+	VNIC_IOC_DIAG_MACADDR_NIC,
+	VNIC_IOC_DIAG_MACADDR_INUSE,
+	VNIC_IOC_DIAG_MACADDR_INVALID,
+	VNIC_IOC_DIAG_MACADDRLEN_INVALID,
+	VNIC_IOC_DIAG_MACFACTORYSLOTINVALID,
+	VNIC_IOC_DIAG_MACFACTORYSLOTUSED,
+	VNIC_IOC_DIAG_MACFACTORYSLOTALLUSED,
+	VNIC_IOC_DIAG_MACFACTORYNOTSUP,
+	VNIC_IOC_DIAG_MACPREFIX_INVALID,
+	VNIC_IOC_DIAG_MACPREFIXLEN_INVALID,
+	VNIC_IOC_DIAG_MACMARGIN_INVALID,
+	VNIC_IOC_DIAG_NO_HWRINGS
+} vnic_ioc_diag_t;
 
 /*
- * For now, we support only MAC addresses specified by value.
+ * Allowed VNIC MAC address types.
+ *
+ * - VNIC_MAC_ADDR_TYPE_FIXED, VNIC_MAC_ADDR_TYPE_RANDOM:
+ *   The MAC address is specified by value by the caller, which
+ *   itself can obtain it from the user directly
+ *   or pick it in a random fashion. Which method is used by the
+ *   caller is irrelevant to the VNIC driver. However, two different
+ *   types are provided so that the information can be made available
+ *   back to user-space when listing the kernel-defined VNICs.
+ *
+ *   When a VNIC is created, the address is passed through the
+ *   vc_mac_addr and vc_mac_len fields of the vnic_ioc_create_t
+ *   structure.
+ *
+ * - VNIC_MAC_ADDR_TYPE_FACTORY: the MAC address is obtained from
+ *   one of the factory MAC addresses of the underlying NIC.
+ *
+ * - VNIC_MAC_ADDR_TYPE_AUTO: the VNIC driver attempts to
+ *   obtain the address from one of the factory MAC addresses of
+ *   the underlying NIC. If none is available, the specified
+ *   MAC address value is used.
+ *
+ * - VNIC_MAC_ADDR_TYPE_PRIMARY: this is a VNIC-based VLAN. The
+ *   address for this is the address of the primary MAC client.
+ *
 */
 typedef enum {
-	VNIC_MAC_ADDR_TYPE_FIXED
+	VNIC_MAC_ADDR_TYPE_FIXED,
+	VNIC_MAC_ADDR_TYPE_RANDOM,
+	VNIC_MAC_ADDR_TYPE_FACTORY,
+	VNIC_MAC_ADDR_TYPE_AUTO,
+	VNIC_MAC_ADDR_TYPE_PRIMARY
 } vnic_mac_addr_type_t;
 
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
 #define	VNIC_IOC_CREATE		VNICIOC(1)
+#define	VNIC_IOC_CREATE_NODUPCHECK	0x00000001
+#define	VNIC_IOC_CREATE_ANCHOR		0x00000002
+
+/*
+ * Force creation of a VLAN-based VNIC without checking if the
+ * underlying MAC supports the margin size.
+ */ +#define VNIC_IOC_CREATE_FORCE 0x00000004 + +/* Allocate a hardware ring to the vnic */ +#define VNIC_IOC_CREATE_REQ_HWRINGS 0x00000008 + typedef struct vnic_ioc_create { datalink_id_t vc_vnic_id; datalink_id_t vc_link_id; - uint_t vc_mac_len; vnic_mac_addr_type_t vc_mac_addr_type; + uint_t vc_mac_len; uchar_t vc_mac_addr[MAXMACADDRLEN]; + uint_t vc_mac_prefix_len; + int vc_mac_slot; + uint16_t vc_vid; + uint_t vc_status; + uint_t vc_flags; + vnic_ioc_diag_t vc_diag; + mac_resource_props_t vc_resource_props; } vnic_ioc_create_t; #define VNIC_IOC_DELETE VNICIOC(2) @@ -69,33 +135,43 @@ typedef struct vnic_ioc_delete { #define VNIC_IOC_INFO VNICIOC(3) -typedef struct vnic_ioc_info_vnic { +typedef struct vnic_info { datalink_id_t vn_vnic_id; datalink_id_t vn_link_id; - uint32_t vn_mac_len; - uchar_t vn_mac_addr[MAXMACADDRLEN]; vnic_mac_addr_type_t vn_mac_addr_type; -} vnic_ioc_info_vnic_t; + uint_t vn_mac_len; + uchar_t vn_mac_addr[MAXMACADDRLEN]; + uint_t vn_mac_slot; + uint32_t vn_mac_prefix_len; + uint16_t vn_vid; + boolean_t vn_force; + mac_resource_props_t vn_resource_props; +} vnic_info_t; typedef struct vnic_ioc_info { - uint_t vi_nvnics; - uint_t vi_size; - datalink_id_t vi_vnic_id; /* DATALINK_ALL_LINKID returns all */ - datalink_id_t vi_linkid; + vnic_info_t vi_info; } vnic_ioc_info_t; #define VNIC_IOC_MODIFY VNICIOC(4) #define VNIC_IOC_MODIFY_ADDR 0x01 +#define VNIC_IOC_MODIFY_RESOURCE_CTL 0x02 typedef struct vnic_ioc_modify { datalink_id_t vm_vnic_id; uint_t vm_modify_mask; + uint_t vm_mac_len; + int vm_mac_slot; uchar_t vm_mac_addr[MAXMACADDRLEN]; vnic_mac_addr_type_t vm_mac_addr_type; - uint_t vm_mac_len; + mac_resource_props_t vm_resource_props; + vnic_ioc_diag_t vm_diag; } vnic_ioc_modify_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h index 6cb64523a8..b5dd59eea3 100644 --- a/usr/src/uts/common/sys/vnic_impl.h +++ b/usr/src/uts/common/sys/vnic_impl.h @@ -26,96 +26,40 @@ #ifndef _SYS_VNIC_IMPL_H #define _SYS_VNIC_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> #include <sys/vnic.h> +#include <sys/mac_flow.h> #include <sys/ksynch.h> #ifdef __cplusplus extern "C" { #endif -typedef void (*vnic_rx_fn_t)(void *, void *, mblk_t *); - -typedef struct vnic_flow_fn_info_s { - vnic_rx_fn_t ff_fn; - void *ff_arg1; - void *ff_arg2; -} vnic_flow_fn_info_t; - -typedef struct vnic_flow_s { - uchar_t vf_addr[MAXMACADDRLEN]; - uint_t vf_addr_len; - vnic_flow_fn_info_t vf_fn_info; - void *vf_cookie; - struct vnic_flow_s *vf_next; - kmutex_t vf_lock; - kcondvar_t vf_cv; - uint32_t vf_refs; - boolean_t vf_clearing; - boolean_t vf_is_active; -} vnic_flow_t; - -typedef struct vnic_flow_tab_s { - vnic_flow_t *vt_flow_list; - krwlock_t vt_lock; - uint_t vt_addr_len; -} vnic_flow_tab_t; - -typedef struct vnic_mac_s { - mac_handle_t va_mh; - uint_t va_refs; - datalink_id_t va_linkid; - const mac_txinfo_t *va_txinfo; - struct vnic_bcast_grp_s *va_bcast_grp; - krwlock_t va_bcast_grp_lock; - size_t va_addr_len; - mac_notify_handle_t va_notify_hdl; - mac_rx_handle_t va_rx_hdl; - vnic_flow_t *va_active_flow; - vnic_flow_tab_t *va_flow_tab; - boolean_t va_mac_set; - struct vnic_s *va_promisc; - krwlock_t va_promisc_lock; - uint64_t va_promisc_gen; -} vnic_mac_t; - typedef struct vnic_s { - datalink_id_t vn_id; + datalink_id_t vn_id; uint32_t - vn_started : 1, - 
vn_promisc : 1, - vn_bcast_grp : 1, - vn_multi_mac : 1, - vn_promisc_mac : 1, - vn_pad_to_bit_31 : 27; - - int vn_slot_id; - multiaddress_capab_t vn_mma_capab; - uint8_t vn_addr[ETHERADDRL]; - vnic_mac_addr_type_t vn_addr_type; - - mac_handle_t vn_mh; - uint32_t vn_margin; - vnic_mac_t *vn_vnic_mac; - vnic_flow_t *vn_flow_ent; - uint32_t vn_hcksum_txflags; - struct vnic_s *vn_promisc_next; - - uint64_t vn_stat_multircv; - uint64_t vn_stat_brdcstrcv; - uint64_t vn_stat_multixmt; - uint64_t vn_stat_brdcstxmt; - uint64_t vn_stat_ierrors; - uint64_t vn_stat_oerrors; - uint64_t vn_stat_rbytes; - uint64_t vn_stat_ipackets; - uint64_t vn_stat_obytes; - uint64_t vn_stat_opackets; + vn_started : 1, + vn_pad_to_bit_31 : 31; + + mac_handle_t vn_mh; + mac_handle_t vn_lower_mh; + mac_client_handle_t vn_mch; + mac_unicast_handle_t vn_muh; + uint32_t vn_margin; + int vn_slot_id; + vnic_mac_addr_type_t vn_addr_type; + uint8_t vn_addr[MAXMACADDRLEN]; + size_t vn_addr_len; + uint16_t vn_vid; + boolean_t vn_force; + datalink_id_t vn_link_id; + mac_notify_handle_t vn_mnh; + + uint32_t vn_hcksum_txflags; } vnic_t; -#define vn_txinfo vn_vnic_mac->va_txinfo - #define vn_madd_naddr vn_mma_capab.maddr_naddr #define vn_maddr_naddrfree vn_mma_capab.maddr_naddrfree #define vn_maddr_flag vn_mma_capab.maddr_flag @@ -126,68 +70,19 @@ typedef struct vnic_s { #define vn_maddr_modify vn_mma_capab.maddr_modify #define vn_maddr_get vn_mma_capab.maddr_get -#define VNIC_FLOW_REFHOLD(flow) { \ - mutex_enter(&(flow)->vf_lock); \ - (flow)->vf_refs++; \ - mutex_exit(&(flow)->vf_lock); \ -} - -#define VNIC_FLOW_REFRELE(flow) { \ - mutex_enter(&(flow)->vf_lock); \ - if (--(flow)->vf_refs == 0 && (flow)->vf_clearing) { \ - (flow)->vf_clearing = B_FALSE; \ - cv_signal(&(flow)->vf_cv); \ - } \ - mutex_exit(&(flow)->vf_lock); \ -} - -extern int vnic_dev_create(datalink_id_t, datalink_id_t, int, uchar_t *); +extern int vnic_dev_create(datalink_id_t, datalink_id_t, vnic_mac_addr_type_t *, + int *, uchar_t *, int *, uint_t, uint16_t, mac_resource_props_t *, + uint32_t, vnic_ioc_diag_t *); extern int vnic_dev_modify(datalink_id_t, uint_t, vnic_mac_addr_type_t, - uint_t, uchar_t *); -extern int vnic_dev_delete(datalink_id_t); - -typedef int (*vnic_info_new_vnic_fn_t)(void *, datalink_id_t, - vnic_mac_addr_type_t, uint_t, uint8_t *, datalink_id_t); + uint_t, uchar_t *, uint_t, mac_resource_props_t *); +extern int vnic_dev_delete(datalink_id_t, uint32_t); extern void vnic_dev_init(void); extern void vnic_dev_fini(void); extern uint_t vnic_dev_count(void); extern dev_info_t *vnic_get_dip(void); -extern int vnic_info(uint_t *, datalink_id_t, datalink_id_t, void *, - vnic_info_new_vnic_fn_t); - -extern void vnic_rx(void *, void *, mblk_t *); -extern mblk_t *vnic_fix_cksum(mblk_t *); -extern mblk_t *vnic_copymsgchain_cksum(mblk_t *); -extern mblk_t *vnic_copymsg_cksum(mblk_t *); - -extern void vnic_promisc_rx(vnic_mac_t *, vnic_t *, mblk_t *); - -extern void vnic_bcast_init(void); -extern void vnic_bcast_fini(void); -extern int vnic_bcast_add(vnic_t *, const uint8_t *, mac_addrtype_t); -extern void vnic_bcast_delete(vnic_t *, const uint8_t *); -extern void vnic_bcast_send(void *, void *, mblk_t *); - -extern void vnic_classifier_init(void); -extern void vnic_classifier_fini(void); -extern vnic_flow_t *vnic_classifier_flow_create(uint_t, uchar_t *, void *, - boolean_t, int); -extern void vnic_classifier_flow_destroy(vnic_flow_t *); -extern void vnic_classifier_flow_add(vnic_mac_t *, vnic_flow_t *, vnic_rx_fn_t, - void *, void *); -extern void 
vnic_classifier_flow_remove(vnic_mac_t *, vnic_flow_t *); -extern void vnic_classifier_flow_update_addr(vnic_flow_t *, uchar_t *); -extern void vnic_classifier_flow_update_fn(vnic_flow_t *, vnic_rx_fn_t, - void *, void *); -extern int vnic_classifier_flow_tab_init(vnic_mac_t *, uint_t, int); -extern void vnic_classifier_flow_tab_fini(vnic_mac_t *); -extern vnic_flow_t *vnic_classifier_get_flow(vnic_mac_t *, mblk_t *); -extern void *vnic_classifier_get_client_cookie(vnic_flow_t *); -extern vnic_flow_fn_info_t *vnic_classifier_get_fn_info(vnic_flow_t *); -extern boolean_t vnic_classifier_is_active(vnic_flow_t *); - +extern int vnic_info(vnic_info_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/syscall/acctctl.c b/usr/src/uts/common/syscall/acctctl.c index 4fb322a211..ce325109be 100644 --- a/usr/src/uts/common/syscall/acctctl.c +++ b/usr/src/uts/common/syscall/acctctl.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/proc.h> #include <sys/systm.h> #include <sys/param.h> @@ -115,6 +113,7 @@ ac_file_in_use(vnode_t *vp) mutex_enter(&acg->ac_proc.ac_lock); mutex_enter(&acg->ac_task.ac_lock); mutex_enter(&acg->ac_flow.ac_lock); + mutex_enter(&acg->ac_net.ac_lock); } for (acg = list_head(&exacct_globals_list); !in_use && acg != NULL; @@ -125,7 +124,8 @@ ac_file_in_use(vnode_t *vp) */ if (vn_compare(acg->ac_proc.ac_vnode, vp) || vn_compare(acg->ac_task.ac_vnode, vp) || - vn_compare(acg->ac_flow.ac_vnode, vp)) + vn_compare(acg->ac_flow.ac_vnode, vp) || + vn_compare(acg->ac_net.ac_vnode, vp)) in_use = B_TRUE; } @@ -137,6 +137,7 @@ ac_file_in_use(vnode_t *vp) mutex_exit(&acg->ac_proc.ac_lock); mutex_exit(&acg->ac_task.ac_lock); mutex_exit(&acg->ac_flow.ac_lock); + mutex_exit(&acg->ac_net.ac_lock); } mutex_exit(&exacct_globals_list_lock); return (in_use); @@ -449,17 +450,21 @@ acctctl(int cmd, void *buf, size_t bufsz) info = &acg->ac_proc; maxres = AC_PROC_MAX_RES; break; + /* + * Flow/net accounting isn't configurable in non-global + * zones, but we have this field on a per-zone basis for future + * expansion as well as the ability to return default "unset" + * values for the various AC_*_GET queries. AC_*_SET commands + * fail with EPERM for AC_FLOW and AC_NET in non-global zones. + */ case AC_FLOW: - /* - * Flow accounting isn't currently configurable in non-global - * zones, but we have this field on a per-zone basis for future - * expansion as well as the ability to return default "unset" - * values for the various AC_*_GET queries. AC_*_SET commands - * fail with EPERM for AC_FLOW in non-global zones. 
- */ info = &acg->ac_flow; maxres = AC_FLOW_MAX_RES; break; + case AC_NET: + info = &acg->ac_net; + maxres = AC_NET_MAX_RES; + break; default: return (set_errno(EINVAL)); } @@ -468,7 +473,8 @@ acctctl(int cmd, void *buf, size_t bufsz) case AC_STATE_SET: if ((error = secpolicy_acct(CRED())) != 0) break; - if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) { + if ((mode == AC_FLOW || mode == AC_NET) && + getzoneid() != GLOBAL_ZONEID) { error = EPERM; break; } @@ -480,7 +486,8 @@ acctctl(int cmd, void *buf, size_t bufsz) case AC_FILE_SET: if ((error = secpolicy_acct(CRED())) != 0) break; - if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) { + if ((mode == AC_FLOW || mode == AC_NET) && + getzoneid() != GLOBAL_ZONEID) { error = EPERM; break; } @@ -492,7 +499,8 @@ acctctl(int cmd, void *buf, size_t bufsz) case AC_RES_SET: if ((error = secpolicy_acct(CRED())) != 0) break; - if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) { + if ((mode == AC_FLOW || mode == AC_NET) && + getzoneid() != GLOBAL_ZONEID) { error = EPERM; break; } @@ -580,6 +588,7 @@ exacct_zone_shutdown(zoneid_t zoneid, void *data) exacct_free_info(&acg->ac_proc); exacct_free_info(&acg->ac_task); exacct_free_info(&acg->ac_flow); + exacct_free_info(&acg->ac_net); } /* ARGSUSED */ @@ -595,6 +604,7 @@ exacct_zone_fini(zoneid_t zoneid, void *data) mutex_destroy(&acg->ac_proc.ac_lock); mutex_destroy(&acg->ac_task.ac_lock); mutex_destroy(&acg->ac_flow.ac_lock); + mutex_destroy(&acg->ac_net.ac_lock); kmem_free(acg, sizeof (*acg)); } diff --git a/usr/src/uts/common/xen/io/xnb.c b/usr/src/uts/common/xen/io/xnb.c index 6ac3e6e6ab..308f3c60ff 100644 --- a/usr/src/uts/common/xen/io/xnb.c +++ b/usr/src/uts/common/xen/io/xnb.c @@ -35,6 +35,7 @@ #include <sys/modctl.h> #include <sys/conf.h> #include <sys/mac.h> +#include <sys/mac_impl.h> /* XXXXBOW - remove, included for mac_fix_cksum() */ #include <sys/dlpi.h> #include <sys/strsubr.h> #include <sys/strsun.h> @@ -247,7 +248,7 @@ xnb_software_csum(xnb_t *xnbp, mblk_t *mp) (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, HCK_FULLCKSUM, KM_NOSLEEP); - return (vnic_fix_cksum(mp)); + return (mac_fix_cksum(mp)); } mblk_t * diff --git a/usr/src/uts/common/xen/io/xnbo.c b/usr/src/uts/common/xen/io/xnbo.c index 790e850289..79831ee7f1 100644 --- a/usr/src/uts/common/xen/io/xnbo.c +++ b/usr/src/uts/common/xen/io/xnbo.c @@ -34,8 +34,12 @@ #include "xnb.h" #include <sys/sunddi.h> +#include <sys/ddi.h> #include <sys/modctl.h> #include <sys/strsubr.h> +#include <sys/mac_client.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> #include <sys/mac.h> #include <net/if.h> #include <sys/dlpi.h> @@ -45,9 +49,9 @@ typedef struct xnbo { mac_handle_t o_mh; - mac_rx_handle_t o_mrh; - const mac_txinfo_t *o_mtx; - mac_notify_handle_t o_mnh; + mac_client_handle_t o_mch; + mac_unicast_handle_t o_mah; + mac_promisc_handle_t o_mphp; boolean_t o_running; boolean_t o_promiscuous; uint32_t o_hcksum_capab; @@ -70,11 +74,9 @@ xnbo_to_mac(xnb_t *xnbp, mblk_t *mp) goto fail; } - mp = xnbop->o_mtx->mt_fn(xnbop->o_mtx->mt_arg, mp); - - if (mp != NULL) { + if (mac_tx(xnbop->o_mch, mp, 0, + MAC_DROP_ON_NO_DESC, NULL) != NULL) { xnbp->xnb_stat_mac_full++; - goto fail; } return; @@ -156,7 +158,8 @@ xnbo_cksum_to_peer(xnb_t *xnbp, mblk_t *mp) */ /*ARGSUSED*/ static void -xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { xnb_t *xnbp = arg; @@ -173,7 +176,8 @@ xnbo_from_mac(void *arg, mac_resource_handle_t mrh, mblk_t 
*mp) */ /*ARGSUSED*/ static void -xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { xnb_t *xnbp = arg; xnbo_t *xnbop = xnbp->xnb_flavour_data; @@ -216,25 +220,12 @@ xnbo_from_mac_filter(void *arg, mac_resource_handle_t mrh, mblk_t *mp) #undef ADD if (keep_head != NULL) - xnbo_from_mac(xnbp, mrh, keep_head); + xnbo_from_mac(xnbp, mrh, keep_head, B_FALSE); if (free_head != NULL) freemsgchain(free_head); } -static void -xnbo_notify(void *arg, mac_notify_type_t type) -{ - xnb_t *xnbp = arg; - xnbo_t *xnbop = xnbp->xnb_flavour_data; - - switch (type) { - case MAC_NOTE_PROMISC: - xnbop->o_mtx = mac_tx_get(xnbop->o_mh); - break; - } -} - static boolean_t xnbo_open_mac(xnb_t *xnbp, char *mac) { @@ -242,8 +233,10 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) int err, need_rx_filter, need_setphysaddr, need_promiscuous; const mac_info_t *mi; char *xsname; - void (*rx_fn)(void *, mac_resource_handle_t, mblk_t *); + void (*rx_fn)(void *, mac_resource_handle_t, mblk_t *, boolean_t); + struct ether_addr ea; uint_t max_sdu; + mac_diag_t diag; xsname = xvdi_get_xsname(xnbp->xnb_devinfo); @@ -279,8 +272,22 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) return (B_FALSE); } - xnbop->o_mnh = mac_notify_add(xnbop->o_mh, xnbo_notify, xnbp); - ASSERT(xnbop->o_mnh != NULL); + if ((err = mac_client_open(xnbop->o_mh, &xnbop->o_mch, NULL, + MAC_OPEN_FLAGS_USE_DATALINK_NAME)) != 0) { + cmn_err(CE_WARN, "xnbo_open_mac: " + "error (%d) opening mac client", err); + xnbo_close_mac(xnbop); + return (B_FALSE); + } + + err = mac_unicast_primary_add(xnbop->o_mch, &xnbop->o_mah, &diag); + if (err != 0) { + cmn_err(CE_WARN, "xnbo_open_mac: " + "failed to get the primary MAC address of " + "%s: %d", mac, err); + xnbo_close_mac(xnbop); + return (B_FALSE); + } /* * Should the receive path filter packets from the downstream @@ -294,11 +301,27 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) else rx_fn = xnbo_from_mac; - xnbop->o_mrh = mac_rx_add(xnbop->o_mh, rx_fn, xnbp); - ASSERT(xnbop->o_mrh != NULL); - - xnbop->o_mtx = mac_tx_get(xnbop->o_mh); - ASSERT(xnbop->o_mtx != NULL); + /* + * Should we set the underlying NIC into promiscuous mode? The + * default is "no". + */ + if (xenbus_scanf(XBT_NULL, xsname, + "SUNW-need-promiscuous", "%d", &need_promiscuous) != 0) + need_promiscuous = 0; + if (need_promiscuous == 0) { + mac_rx_set(xnbop->o_mch, rx_fn, xnbp); + } else { + err = mac_promisc_add(xnbop->o_mch, MAC_CLIENT_PROMISC_ALL, + rx_fn, xnbp, &xnbop->o_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP); + if (err != 0) { + cmn_err(CE_WARN, "xnbo_open_mac: " + "cannot enable promiscuous mode of %s: %d", + mac, err); + xnbo_close_mac(xnbop); + return (B_FALSE); + } + xnbop->o_promiscuous = B_TRUE; + } if (!mac_capab_get(xnbop->o_mh, MAC_CAPAB_HCKSUM, &xnbop->o_hcksum_capab)) @@ -312,45 +335,17 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) "SUNW-need-set-physaddr", "%d", &need_setphysaddr) != 0) need_setphysaddr = 0; if (need_setphysaddr > 0) { - struct ether_addr ea; - - err = mac_unicst_set(xnbop->o_mh, xnbp->xnb_mac_addr); + err = mac_unicast_primary_set(xnbop->o_mh, xnbp->xnb_mac_addr); /* Warn, but continue on. */ if (err != 0) { bcopy(xnbp->xnb_mac_addr, ea.ether_addr_octet, ETHERADDRL); cmn_err(CE_WARN, "xnbo_open_mac: " "cannot set MAC address of %s to " - "%s: %d", mac, ether_sprintf(&ea), - err); - } - } - - /* - * Should we set the underlying NIC into promiscuous mode? The - * default is "no". 
- */ - if (xenbus_scanf(XBT_NULL, xsname, - "SUNW-need-promiscuous", "%d", &need_promiscuous) != 0) - need_promiscuous = 0; - if (need_promiscuous > 0) { - err = mac_promisc_set(xnbop->o_mh, B_TRUE, MAC_DEVPROMISC); - if (err != 0) { - cmn_err(CE_WARN, "xnbo_open_mac: " - "cannot enable promiscuous mode of %s: %d", - mac, err); - xnbo_close_mac(xnbop); - return (B_FALSE); + "%s: %d", mac, ether_sprintf(&ea), err); } - xnbop->o_promiscuous = B_TRUE; } - if ((err = mac_start(xnbop->o_mh)) != 0) { - cmn_err(CE_WARN, "xnbo_open_mac: " - "cannot start mac device (%d)", err); - xnbo_close_mac(xnbop); - return (B_FALSE); - } xnbop->o_running = B_TRUE; return (B_TRUE); @@ -385,26 +380,24 @@ xnbo_close_mac(xnbo_t *xnbop) return; if (xnbop->o_running) { - mac_stop(xnbop->o_mh); xnbop->o_running = B_FALSE; } if (xnbop->o_promiscuous) { - (void) mac_promisc_set(xnbop->o_mh, B_FALSE, - MAC_DEVPROMISC); + (void) mac_promisc_remove(xnbop->o_mphp); xnbop->o_promiscuous = B_FALSE; + } else { + mac_rx_clear(xnbop->o_mch); } - xnbop->o_mtx = NULL; - - if (xnbop->o_mrh != NULL) { - mac_rx_remove(xnbop->o_mh, xnbop->o_mrh, B_TRUE); - xnbop->o_mrh = NULL; + if (xnbop->o_mah != NULL) { + (void) mac_unicast_remove(xnbop->o_mch, xnbop->o_mah); + xnbop->o_mah = NULL; } - if (xnbop->o_mnh != NULL) { - mac_notify_remove(xnbop->o_mh, xnbop->o_mnh); - xnbop->o_mnh = NULL; + if (xnbop->o_mch != NULL) { + mac_client_close(xnbop->o_mch, 0); + xnbop->o_mch = NULL; } mac_close(xnbop->o_mh); @@ -453,8 +446,9 @@ xnbo_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) xnbop = kmem_zalloc(sizeof (*xnbop), KM_SLEEP); xnbop->o_mh = NULL; - xnbop->o_mrh = NULL; - xnbop->o_mtx = NULL; + xnbop->o_mch = NULL; + xnbop->o_mah = NULL; + xnbop->o_mphp = NULL; xnbop->o_running = B_FALSE; xnbop->o_hcksum_capab = 0; diff --git a/usr/src/uts/common/xen/io/xnbu.c b/usr/src/uts/common/xen/io/xnbu.c index f5c0ba9809..80e2378608 100644 --- a/usr/src/uts/common/xen/io/xnbu.c +++ b/usr/src/uts/common/xen/io/xnbu.c @@ -40,7 +40,7 @@ #include <sys/strsubr.h> #include <sys/dlpi.h> #include <sys/pattr.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <xen/sys/xendev.h> @@ -51,19 +51,16 @@ static int xnbu_m_set_mac_addr(void *, const uint8_t *); static int xnbu_m_set_multicast(void *, boolean_t, const uint8_t *); static int xnbu_m_set_promiscuous(void *, boolean_t); static int xnbu_m_stat(void *, uint_t, uint64_t *); -static void xnbu_m_blank(void *, time_t, uint_t); -static void xnbu_m_resources(void *); static boolean_t xnbu_m_getcapab(void *, mac_capab_t, void *); static mblk_t *xnbu_m_send(void *, mblk_t *); typedef struct xnbu { mac_handle_t u_mh; - mac_resource_handle_t u_mrh; boolean_t u_need_sched; } xnbu_t; static mac_callbacks_t xnb_callbacks = { - MC_RESOURCES | MC_GETCAPAB, + MC_GETCAPAB, xnbu_m_stat, xnbu_m_start, xnbu_m_stop, @@ -71,7 +68,6 @@ static mac_callbacks_t xnb_callbacks = { xnbu_m_set_multicast, xnbu_m_set_mac_addr, xnbu_m_send, - xnbu_m_resources, NULL, xnbu_m_getcapab }; @@ -84,7 +80,7 @@ xnbu_to_host(xnb_t *xnbp, mblk_t *mp) ASSERT(mp != NULL); - mac_rx(xnbup->u_mh, xnbup->u_mrh, mp); + mac_rx(xnbup->u_mh, NULL, mp); mutex_enter(&xnbp->xnb_rx_lock); @@ -328,32 +324,6 @@ xnbu_m_stat(void *arg, uint_t stat, uint64_t *val) return (0); } -/*ARGSUSED*/ -static void -xnbu_m_blank(void *arg, time_t ticks, uint_t count) -{ - /* - * XXPV dme: blanking is not currently implemented. 
- */ -} - -static void -xnbu_m_resources(void *arg) -{ - xnb_t *xnbp = arg; - xnbu_t *xnbup = xnbp->xnb_flavour_data; - mac_rx_fifo_t mrf; - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = xnbu_m_blank; - mrf.mrf_arg = (void *)xnbp; - mrf.mrf_normal_blank_time = 128; /* XXPV dme: see xnbu_m_blank() */ - mrf.mrf_normal_pkt_count = 8; /* XXPV dme: see xnbu_m_blank() */ - - xnbup->u_mrh = mac_resource_add(xnbup->u_mh, - (mac_resource_t *)&mrf); -} - static boolean_t xnbu_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) { @@ -369,11 +339,6 @@ xnbu_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) *capab = 0; break; } - - case MAC_CAPAB_POLL: - /* Just return B_TRUE. */ - break; - default: return (B_FALSE); } diff --git a/usr/src/uts/common/xen/io/xnf.c b/usr/src/uts/common/xen/io/xnf.c index c14c651c61..0813d6cbe1 100644 --- a/usr/src/uts/common/xen/io/xnf.c +++ b/usr/src/uts/common/xen/io/xnf.c @@ -80,7 +80,7 @@ #include <inet/ip_impl.h> #include <sys/gld.h> #include <sys/modctl.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/bootinfo.h> #include <sys/mach_mmu.h> @@ -148,8 +148,6 @@ static int xnf_set_promiscuous(void *, boolean_t); static mblk_t *xnf_send(void *, mblk_t *); static uint_t xnf_intr(caddr_t); static int xnf_stat(void *, uint_t, uint64_t *); -static void xnf_blank(void *, time_t, uint_t); -static void xnf_resources(void *); static void xnf_ioctl(void *, queue_t *, mblk_t *); static boolean_t xnf_getcapab(void *, mac_capab_t, void *); @@ -178,7 +176,7 @@ static boolean_t xnf_kstat_init(xnf_t *xnfp); * XXPV dme: remove MC_IOCTL? */ static mac_callbacks_t xnf_callbacks = { - MC_RESOURCES | MC_IOCTL | MC_GETCAPAB, + MC_IOCTL | MC_GETCAPAB, xnf_stat, xnf_start, xnf_stop, @@ -186,7 +184,6 @@ static mac_callbacks_t xnf_callbacks = { xnf_set_multicast, xnf_set_mac_addr, xnf_send, - xnf_resources, xnf_ioctl, xnf_getcapab }; @@ -1436,7 +1433,7 @@ xnf_intr(caddr_t arg) mp = xnf_process_recv(xnfp); if (mp != NULL) - mac_rx(xnfp->xnf_mh, xnfp->xnf_rx_handle, mp); + mac_rx(xnfp->xnf_mh, NULL, mp); } xnfp->xnf_stat_interrupts++; @@ -2518,39 +2515,6 @@ xnf_stat(void *arg, uint_t stat, uint64_t *val) /*ARGSUSED*/ static void -xnf_blank(void *arg, time_t ticks, uint_t count) -{ - /* - * XXPV dme: blanking is not currently implemented. - * - * It's not obvious how to use the 'ticks' argument here. - * - * 'Count' might be used as an indicator of how to set - * rsp_event when posting receive buffers to the rx_ring. It - * would replace the code at the tail of xnf_process_recv() - * that simply indicates that the next completed packet should - * cause an interrupt. - */ -} - -static void -xnf_resources(void *arg) -{ - xnf_t *xnfp = arg; - mac_rx_fifo_t mrf; - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = xnf_blank; - mrf.mrf_arg = (void *)xnfp; - mrf.mrf_normal_blank_time = 128; /* XXPV dme: see xnf_blank() */ - mrf.mrf_normal_pkt_count = 8; /* XXPV dme: see xnf_blank() */ - - xnfp->xnf_rx_handle = mac_resource_add(xnfp->xnf_mh, - (mac_resource_t *)&mrf); -} - -/*ARGSUSED*/ -static void xnf_ioctl(void *arg, queue_t *q, mblk_t *mp) { miocnak(q, mp, 0, EINVAL); @@ -2588,11 +2552,6 @@ xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data) *capab = 0; break; } - - case MAC_CAPAB_POLL: - /* Just return B_TRUE. 
*/ - break; - default: return (B_FALSE); } diff --git a/usr/src/uts/common/xen/io/xnf.h b/usr/src/uts/common/xen/io/xnf.h index d8edf89f86..9b0cc4c357 100644 --- a/usr/src/uts/common/xen/io/xnf.h +++ b/usr/src/uts/common/xen/io/xnf.h @@ -135,7 +135,6 @@ typedef struct xnf { struct tx_pktinfo xnf_tx_pkt_info[NET_TX_RING_SIZE]; struct xnf_buffer_desc *xnf_rxpkt_bufptr[XNF_MAX_RXDESCS]; - mac_resource_handle_t xnf_rx_handle; ddi_iblock_cookie_t xnf_icookie; kmutex_t xnf_tx_buf_mutex; kmutex_t xnf_rx_buf_mutex; diff --git a/usr/src/uts/i86xpv/xnb/Makefile b/usr/src/uts/i86xpv/xnb/Makefile index 4fa08e3f70..dc7503a46e 100644 --- a/usr/src/uts/i86xpv/xnb/Makefile +++ b/usr/src/uts/i86xpv/xnb/Makefile @@ -20,10 +20,9 @@ # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the xnb # network driver support module. @@ -59,7 +58,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # # Module depends on VNIC. # -LDFLAGS += -dy -N drv/vnic +LDFLAGS += -dy -N drv/vnic -N misc/mac # # use Solaris specific code in xen public header files diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index ee03e0967f..e29afc6c29 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -1245,6 +1245,8 @@ fcnname/**/_info: \ STUB(dld, dld_init_ops, nomod_void); STUB(dld, dld_fini_ops, nomod_void); STUB(dld, dld_autopush, nomod_minus_one); + STUB(dld, dld_ioc_register, nomod_einval); + STUB(dld, dld_ioc_unregister, nomod_void); END_MODULE(dld); #endif @@ -1255,12 +1257,15 @@ fcnname/**/_info: \ */ #ifndef DLS_MODULE MODULE(dls,misc); - STUB(dls, dls_devnet_vid, nomod_zero); STUB(dls, dls_devnet_mac, nomod_zero); STUB(dls, dls_devnet_hold_tmp, nomod_einval); STUB(dls, dls_devnet_rele_tmp, nomod_void); + STUB(dls, dls_devnet_hold_link, nomod_einval); + STUB(dls, dls_devnet_rele_link, nomod_void); STUB(dls, dls_devnet_prop_task_wait, nomod_void); STUB(dls, dls_mgmt_get_linkid, nomod_einval); + STUB(dls, dls_devnet_macname2linkid, nomod_einval); + STUB(dls, dls_mgmt_get_linkinfo, nomod_einval); END_MODULE(dls); #endif diff --git a/usr/src/uts/intel/io/amd8111s/amd8111s_main.c b/usr/src/uts/intel/io/amd8111s/amd8111s_main.c index 6587531959..1664ee7543 100644 --- a/usr/src/uts/intel/io/amd8111s/amd8111s_main.c +++ b/usr/src/uts/intel/io/amd8111s/amd8111s_main.c @@ -76,7 +76,6 @@ static int amd8111s_detach(dev_info_t *, ddi_detach_cmd_t); static int amd8111s_m_unicst(void *, const uint8_t *); static int amd8111s_m_promisc(void *, boolean_t); static int amd8111s_m_stat(void *, uint_t, uint64_t *); -static void amd8111s_m_resources(void *arg); static void amd8111s_m_ioctl(void *, queue_t *, mblk_t *); static int amd8111s_m_multicst(void *, boolean_t, const uint8_t *addr); static int amd8111s_m_start(void *); @@ -186,11 +185,9 @@ static ddi_device_acc_attr_t pcn_acc_attr = { DDI_STRICTORDER_ACC }; -#define AMD8111S_M_CALLBACK_FLAGS (MC_RESOURCES | MC_IOCTL) - static mac_callbacks_t amd8111s_m_callbacks = { - AMD8111S_M_CALLBACK_FLAGS, + MC_IOCTL, amd8111s_m_stat, amd8111s_m_start, amd8111s_m_stop, @@ -198,7 +195,6 @@ static mac_callbacks_t amd8111s_m_callbacks = { amd8111s_m_multicst, amd8111s_m_unicst, amd8111s_m_tx, - amd8111s_m_resources, amd8111s_m_ioctl }; @@ -248,29 +244,6 @@ _fini() return (status); } -/* Adjust Interrupt Coalescing Register to coalesce interrupts */ 
-static void -amd8111s_m_blank(void *arg, time_t ticks, uint32_t count) -{ - _NOTE(ARGUNUSED(arg, ticks, count)); -} - -static void -amd8111s_m_resources(void *arg) -{ - struct LayerPointers *adapter = arg; - mac_rx_fifo_t mrf; - - mrf.mrf_type = MAC_RX_FIFO; - mrf.mrf_blank = amd8111s_m_blank; - mrf.mrf_arg = (void *)adapter; - mrf.mrf_normal_blank_time = 128; - mrf.mrf_normal_pkt_count = 8; - - adapter->pOdl->mrh = mac_resource_add(adapter->pOdl->mh, - (mac_resource_t *)&mrf); -} - /* * Loopback Support */ @@ -665,7 +638,7 @@ amd8111s_receive(struct LayerPointers *pLayerPointers) } if (ret_mp) { - mac_rx(pOdl->mh, pOdl->mrh, ret_mp); + mac_rx(pOdl->mh, NULL, ret_mp); } (void) ddi_dma_sync(pOdl->rx_desc_dma_handle, 0, 0, diff --git a/usr/src/uts/intel/io/amd8111s/amd8111s_main.h b/usr/src/uts/intel/io/amd8111s/amd8111s_main.h index 922f5150c1..00f430273f 100755..100644 --- a/usr/src/uts/intel/io/amd8111s/amd8111s_main.h +++ b/usr/src/uts/intel/io/amd8111s/amd8111s_main.h @@ -1,13 +1,11 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef AMD8111S_MAIN_H #define AMD8111S_MAIN_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Copyright (c) 2001-2006 Advanced Micro Devices, Inc. All rights reserved. * @@ -55,10 +53,6 @@ * nationals of countries subject to national security controls. */ - -#pragma ident "@(#)$RCSfile: odl.h,v $ $Revision: 1.1 $ " \ -"$Date: 2004/04/22 15:22:52 $ AMD" - #include <sys/types.h> #include <sys/errno.h> #include <sys/kmem.h> @@ -79,7 +73,7 @@ #include <sys/ethernet.h> #include <sys/dlpi.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/netlb.h> #include "amd8111s_hw.h" @@ -278,7 +272,6 @@ struct odl { dev_info_t *devinfo; mac_handle_t mh; /* mac module handle */ - mac_resource_handle_t mrh; struct amd8111s_statistics statistics; diff --git a/usr/src/uts/intel/ip/Makefile b/usr/src/uts/intel/ip/Makefile index c2e44f9934..6cd3d4ac5a 100644 --- a/usr/src/uts/intel/ip/Makefile +++ b/usr/src/uts/intel/ip/Makefile @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the ip driver # kernel module. 
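The amd8111s conversion above repeats the pattern applied to every NIC driver in this changeset: the MC_RESOURCES callback, mac_resource_add() and the mac_rx_fifo_t blanking hooks are deleted, and the driver hands received chains straight to mac_rx() with a NULL resource handle, leaving interrupt blanking, polling and soft-ring fanout to the MAC layer. A minimal sketch of a converted receive path follows; the foo_* names are illustrative stand-ins, not code from this changeset.

	#include <sys/types.h>
	#include <sys/ksynch.h>
	#include <sys/stream.h>
	#include <sys/ddi.h>
	#include <sys/sunddi.h>
	#include <sys/mac_provider.h>

	typedef struct foo {
		kmutex_t	foo_rx_lock;	/* protects the rx descriptor ring */
		mac_handle_t	foo_mh;		/* handle from mac_register() */
	} foo_t;

	/* Stand-in for the driver's descriptor-ring walk. */
	extern mblk_t *foo_drain_ring(foo_t *);

	static uint_t
	foo_intr(caddr_t arg)
	{
		foo_t	*foop = (foo_t *)arg;
		mblk_t	*mp;

		mutex_enter(&foop->foo_rx_lock);
		mp = foo_drain_ring(foop);	/* build an mblk chain */
		mutex_exit(&foop->foo_rx_lock);

		/*
		 * The second argument is now always NULL: with
		 * MAC_RX_FIFO and mac_resource_add() gone, the MAC
		 * layer owns blanking, polling and fanout.
		 */
		if (mp != NULL)
			mac_rx(foop->foo_mh, NULL, mp);

		return (DDI_INTR_CLAIMED);
	}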
diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64 index 5854497325..f4bcb8ab0c 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.debug64 +++ b/usr/src/uts/intel/ip/ip.global-objs.debug64 @@ -44,7 +44,6 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads -crctab default_ip6_asp_table do_tcp_direct_sockfs do_tcp_fusion @@ -105,7 +104,6 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_input_proc ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -133,15 +131,12 @@ ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones -ip_soft_rings_cnt -ip_squeue_bind ip_squeue_create_callback ip_squeue_enter ip_squeue_enter_unbound ip_squeue_fanout -ip_squeue_profile +ip_squeue_flag ip_squeue_worker_wait -ip_squeues_per_cpu ip_thread_data ip_thread_list ip_thread_rwlock @@ -221,10 +216,6 @@ req_arr rn_mkfreelist rn_ones rn_zeros -rr_max_blank_ratio -rr_max_pkt_cnt_ratio -rr_min_blank_ratio -rr_min_pkt_cnt_ratio rt_entry_cache rts_conn_cache rts_g_t_info_ack @@ -262,19 +253,12 @@ sin_null skip_sctp_cksum sqset_global_list sqset_global_size +sqset_lock squeue_cache -squeue_intrdrain_ms -squeue_intrdrain_ns -squeue_kstat -squeue_kstat_lock -squeue_profile -squeue_worker_poll_min -squeue_workerdrain_ms -squeue_workerdrain_ns +squeue_drain_ms +squeue_drain_ns squeue_workerwait_ms squeue_workerwait_tick -squeue_writerdrain_ms -squeue_writerdrain_ns tcp_acceptor_rinit tcp_acceptor_winit tcp_conn_cache @@ -307,10 +291,8 @@ tcp_rinitv4 tcp_rinitv6 tcp_sack_info_cache tcp_sock_winit -tcp_squeue_close -tcp_squeue_close_proc +tcp_squeue_flag tcp_squeue_wput -tcp_squeue_wput_proc tcp_static_maxpsz tcp_taskq tcp_timercache @@ -318,6 +300,7 @@ tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit +tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tsol_strict_error diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64 index 065904b585..3866432363 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.obj64 +++ b/usr/src/uts/intel/ip/ip.global-objs.obj64 @@ -44,7 +44,6 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads -crctab default_ip6_asp_table do_tcp_direct_sockfs do_tcp_fusion @@ -105,7 +104,6 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_input_proc ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -133,15 +131,12 @@ ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones -ip_soft_rings_cnt -ip_squeue_bind ip_squeue_create_callback ip_squeue_enter ip_squeue_enter_unbound ip_squeue_fanout -ip_squeue_profile +ip_squeue_flag ip_squeue_worker_wait -ip_squeues_per_cpu ip_thread_data ip_thread_list ip_thread_rwlock @@ -217,10 +212,6 @@ req_arr rn_mkfreelist rn_ones rn_zeros -rr_max_blank_ratio -rr_max_pkt_cnt_ratio -rr_min_blank_ratio -rr_min_pkt_cnt_ratio rt_entry_cache rts_conn_cache rts_g_t_info_ack @@ -254,16 +245,12 @@ sin6_null sin_null sqset_global_list sqset_global_size +sqset_lock squeue_cache -squeue_intrdrain_ms -squeue_intrdrain_ns -squeue_worker_poll_min -squeue_workerdrain_ms -squeue_workerdrain_ns +squeue_drain_ms +squeue_drain_ns squeue_workerwait_ms squeue_workerwait_tick -squeue_writerdrain_ms -squeue_writerdrain_ns tcp_acceptor_rinit tcp_acceptor_winit tcp_conn_cache @@ -296,10 +283,8 @@ tcp_rinitv4 tcp_rinitv6 tcp_sack_info_cache tcp_sock_winit -tcp_squeue_close -tcp_squeue_close_proc +tcp_squeue_flag tcp_squeue_wput -tcp_squeue_wput_proc tcp_static_maxpsz tcp_taskq tcp_timercache @@ -307,6 
+292,7 @@ tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit +tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tsol_strict_error diff --git a/usr/src/uts/intel/mac/Makefile b/usr/src/uts/intel/mac/Makefile index 12bd648ee0..870b260f75 100644 --- a/usr/src/uts/intel/mac/Makefile +++ b/usr/src/uts/intel/mac/Makefile @@ -22,13 +22,10 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # # This makefile drives the production of the mac driver # kernel module. # - # # Path to the base of the uts directory tree (usually /usr/src/uts). # @@ -53,7 +50,6 @@ include $(UTSBASE)/intel/Makefile.intel ALL_TARGET = $(BINARY) LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) -LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN # # Overrides. @@ -61,6 +57,9 @@ LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN CFLAGS += $(CCVERBOSE) LDFLAGS += -dy +LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN + # # Default build targets. # diff --git a/usr/src/uts/intel/vnic/Makefile b/usr/src/uts/intel/vnic/Makefile index 748d61a8b0..83a4c749c2 100644 --- a/usr/src/uts/intel/vnic/Makefile +++ b/usr/src/uts/intel/vnic/Makefile @@ -22,9 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# - # # Path to the base of the uts directory tree (usually /usr/src/uts). # @@ -55,7 +52,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # Overrides # CFLAGS += $(CCVERBOSE) -LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Ndrv/ip -Nmisc/dls +LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls # # Default build targets. diff --git a/usr/src/uts/intel/xge/Makefile b/usr/src/uts/intel/xge/Makefile index 6689f7a758..8541c1b052 100644 --- a/usr/src/uts/intel/xge/Makefile +++ b/usr/src/uts/intel/xge/Makefile @@ -20,11 +20,9 @@ # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the Neterion Xframe # 10G Ethernet (XGE) driver module in x86 systems # diff --git a/usr/src/uts/sparc/ip/Makefile b/usr/src/uts/sparc/ip/Makefile index c330f273f9..515f079865 100644 --- a/usr/src/uts/sparc/ip/Makefile +++ b/usr/src/uts/sparc/ip/Makefile @@ -19,17 +19,15 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the ip driver # kernel module. # # sparc architecture dependent # - # # Path to the base of the uts directory tree (usually /usr/src/uts). 
# diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64 index 5854497325..f4bcb8ab0c 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.debug64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64 @@ -44,7 +44,6 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads -crctab default_ip6_asp_table do_tcp_direct_sockfs do_tcp_fusion @@ -105,7 +104,6 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_input_proc ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -133,15 +131,12 @@ ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones -ip_soft_rings_cnt -ip_squeue_bind ip_squeue_create_callback ip_squeue_enter ip_squeue_enter_unbound ip_squeue_fanout -ip_squeue_profile +ip_squeue_flag ip_squeue_worker_wait -ip_squeues_per_cpu ip_thread_data ip_thread_list ip_thread_rwlock @@ -221,10 +216,6 @@ req_arr rn_mkfreelist rn_ones rn_zeros -rr_max_blank_ratio -rr_max_pkt_cnt_ratio -rr_min_blank_ratio -rr_min_pkt_cnt_ratio rt_entry_cache rts_conn_cache rts_g_t_info_ack @@ -262,19 +253,12 @@ sin_null skip_sctp_cksum sqset_global_list sqset_global_size +sqset_lock squeue_cache -squeue_intrdrain_ms -squeue_intrdrain_ns -squeue_kstat -squeue_kstat_lock -squeue_profile -squeue_worker_poll_min -squeue_workerdrain_ms -squeue_workerdrain_ns +squeue_drain_ms +squeue_drain_ns squeue_workerwait_ms squeue_workerwait_tick -squeue_writerdrain_ms -squeue_writerdrain_ns tcp_acceptor_rinit tcp_acceptor_winit tcp_conn_cache @@ -307,10 +291,8 @@ tcp_rinitv4 tcp_rinitv6 tcp_sack_info_cache tcp_sock_winit -tcp_squeue_close -tcp_squeue_close_proc +tcp_squeue_flag tcp_squeue_wput -tcp_squeue_wput_proc tcp_static_maxpsz tcp_taskq tcp_timercache @@ -318,6 +300,7 @@ tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit +tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tsol_strict_error diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64 index 065904b585..3866432363 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.obj64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64 @@ -44,7 +44,6 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads -crctab default_ip6_asp_table do_tcp_direct_sockfs do_tcp_fusion @@ -105,7 +104,6 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_input_proc ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -133,15 +131,12 @@ ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones -ip_soft_rings_cnt -ip_squeue_bind ip_squeue_create_callback ip_squeue_enter ip_squeue_enter_unbound ip_squeue_fanout -ip_squeue_profile +ip_squeue_flag ip_squeue_worker_wait -ip_squeues_per_cpu ip_thread_data ip_thread_list ip_thread_rwlock @@ -217,10 +212,6 @@ req_arr rn_mkfreelist rn_ones rn_zeros -rr_max_blank_ratio -rr_max_pkt_cnt_ratio -rr_min_blank_ratio -rr_min_pkt_cnt_ratio rt_entry_cache rts_conn_cache rts_g_t_info_ack @@ -254,16 +245,12 @@ sin6_null sin_null sqset_global_list sqset_global_size +sqset_lock squeue_cache -squeue_intrdrain_ms -squeue_intrdrain_ns -squeue_worker_poll_min -squeue_workerdrain_ms -squeue_workerdrain_ns +squeue_drain_ms +squeue_drain_ns squeue_workerwait_ms squeue_workerwait_tick -squeue_writerdrain_ms -squeue_writerdrain_ns tcp_acceptor_rinit tcp_acceptor_winit tcp_conn_cache @@ -296,10 +283,8 @@ tcp_rinitv4 tcp_rinitv6 tcp_sack_info_cache tcp_sock_winit -tcp_squeue_close -tcp_squeue_close_proc +tcp_squeue_flag tcp_squeue_wput -tcp_squeue_wput_proc tcp_static_maxpsz tcp_taskq tcp_timercache @@ -307,6 
+292,7 @@ tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit +tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tsol_strict_error diff --git a/usr/src/uts/sparc/mac/Makefile b/usr/src/uts/sparc/mac/Makefile index d343e0bc74..5ef314a2ef 100644 --- a/usr/src/uts/sparc/mac/Makefile +++ b/usr/src/uts/sparc/mac/Makefile @@ -22,14 +22,12 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the mac driver # kernel module. # # sparc architecture dependent # - # # Path to the base of the uts directory tree (usually /usr/src/uts). # @@ -54,7 +52,6 @@ include $(UTSBASE)/sparc/Makefile.sparc ALL_TARGET = $(BINARY) LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) -LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN # # Overrides. @@ -64,6 +61,9 @@ $(RELEASE_BUILD)CFLAGS += -xinline=auto -xcrossfile $(RELEASE_BUILD)COPTIMIZE = -xO5 LDFLAGS += -dy +LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN + # # Default build targets. # diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s index e45cd91325..e315c9857c 100644 --- a/usr/src/uts/sparc/ml/modstubs.s +++ b/usr/src/uts/sparc/ml/modstubs.s @@ -1199,6 +1199,8 @@ stubs_base: MODULE(dld,drv); STUB(dld, dld_init_ops, nomod_void); STUB(dld, dld_fini_ops, nomod_void); + STUB(dld, dld_ioc_register, nomod_einval); + STUB(dld, dld_ioc_unregister, nomod_void); STUB(dld, dld_autopush, nomod_minus_one); END_MODULE(dld); #endif @@ -1210,12 +1212,15 @@ stubs_base: */ #ifndef DLS_MODULE MODULE(dls,misc); - STUB(dls, dls_devnet_vid, nomod_zero); STUB(dls, dls_devnet_mac, nomod_zero); STUB(dls, dls_devnet_hold_tmp, nomod_einval); STUB(dls, dls_devnet_rele_tmp, nomod_void); + STUB(dls, dls_devnet_hold_link, nomod_einval); + STUB(dls, dls_devnet_rele_link, nomod_void); STUB(dls, dls_devnet_prop_task_wait, nomod_void); STUB(dls, dls_mgmt_get_linkid, nomod_einval); + STUB(dls, dls_devnet_macname2linkid, nomod_einval); + STUB(dls, dls_mgmt_get_linkinfo, nomod_einval); END_MODULE(dls); #endif diff --git a/usr/src/uts/sparc/vnic/Makefile b/usr/src/uts/sparc/vnic/Makefile index f3389cb97a..41052c901d 100644 --- a/usr/src/uts/sparc/vnic/Makefile +++ b/usr/src/uts/sparc/vnic/Makefile @@ -22,9 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# - # # Path to the base of the uts directory tree (usually /usr/src/uts). # @@ -55,7 +52,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # Overrides # CFLAGS += $(CCVERBOSE) -LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Ndrv/ip -Nmisc/dls +LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls # # Default build targets. diff --git a/usr/src/uts/sparc/xge/Makefile b/usr/src/uts/sparc/xge/Makefile index 2d66030c07..f30c4612e3 100644 --- a/usr/src/uts/sparc/xge/Makefile +++ b/usr/src/uts/sparc/xge/Makefile @@ -20,11 +20,9 @@ # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the Neterion Xframe # 10G Ethernet (XGE) driver module in x86 systems # diff --git a/usr/src/uts/sun/io/eri/eri.c b/usr/src/uts/sun/io/eri/eri.c index 0fac98abf1..7635d9553e 100644 --- a/usr/src/uts/sun/io/eri/eri.c +++ b/usr/src/uts/sun/io/eri/eri.c @@ -47,7 +47,7 @@ #include <sys/ethernet.h> #include <sys/vlan.h> #include <sys/policy.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/dlpi.h> @@ -200,7 +200,6 @@ static mac_callbacks_t eri_m_callbacks = { eri_m_multicst, eri_m_unicst, eri_m_tx, - NULL, eri_m_ioctl, eri_m_getcapab }; @@ -1293,7 +1292,6 @@ eri_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) *hcksum_txflags = HCKSUM_INET_PARTIAL; return (B_TRUE); } - case MAC_CAPAB_POLL: default: return (B_FALSE); } } diff --git a/usr/src/uts/sun/io/hme.c b/usr/src/uts/sun/io/hme.c index 399d995b10..0423d1d736 100644 --- a/usr/src/uts/sun/io/hme.c +++ b/usr/src/uts/sun/io/hme.c @@ -44,7 +44,7 @@ #include <sys/pattr.h> #include <sys/dlpi.h> #include <sys/strsubr.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/ethernet.h> #include <sys/vlan.h> @@ -487,7 +487,6 @@ static mac_callbacks_t hme_m_callbacks = { hme_m_multicst, hme_m_unicst, hme_m_tx, - NULL, hme_m_ioctl, hme_m_getcapab, }; diff --git a/usr/src/uts/sun/io/qfe.c b/usr/src/uts/sun/io/qfe.c index 4a98701b87..ad9bfe8fee 100644 --- a/usr/src/uts/sun/io/qfe.c +++ b/usr/src/uts/sun/io/qfe.c @@ -36,7 +36,7 @@ #include <sys/kmem.h> #include <sys/modctl.h> #include <sys/conf.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/ddi.h> #include <sys/sunddi.h> diff --git a/usr/src/uts/sun4u/io/rmclomv.c b/usr/src/uts/sun4u/io/rmclomv.c index 2afee7d1dd..93e236b121 100644 --- a/usr/src/uts/sun4u/io/rmclomv.c +++ b/usr/src/uts/sun4u/io/rmclomv.c @@ -61,7 +61,6 @@ #define CPU_SIGNATURE_DELAY_TIME 5000000 /* 5 secs, in microsecs */ extern void pmugpio_watchdog_pat(); -static clock_t timesync_interval; extern int watchdog_activated; static int last_watchdog_msg = 1; @@ -118,6 +117,10 @@ static uint_t rmc_clear_watchdog_timer(void); static void send_watchdog_msg(int msg); static void plat_timesync(void *arg); +static kmutex_t timesync_lock; +static clock_t timesync_interval = 0; +static timeout_id_t timesync_tid = 0; + /* * Driver entry points */ @@ -310,6 +313,7 @@ _init(void) mutex_init(&rmclomv_refresh_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&rmclomv_cache_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&rmclomv_state_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&timesync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&rmclomv_checkrmc_sig_cv, NULL, CV_DRIVER, NULL); cv_init(&rmclomv_refresh_sig_cv, NULL, CV_DRIVER, NULL); @@ -344,6 +348,7 @@ _fini(void) return (error); cv_destroy(&rmclomv_refresh_sig_cv); cv_destroy(&rmclomv_checkrmc_sig_cv); + mutex_destroy(&timesync_lock); mutex_destroy(&rmclomv_state_lock); mutex_destroy(&rmclomv_cache_lock); mutex_destroy(&rmclomv_refresh_lock); @@ -479,8 +484,9 @@ rmclomv_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) static int rmclomv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - int instance; - int err; + timeout_id_t tid; + int instance; + int err; switch (cmd) { case DDI_DETACH: @@ -502,6 +508,13 @@ rmclomv_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) rmclomv_reset_cache(NULL, NULL, NULL); ddi_remove_minor_node(dip, NULL); + mutex_enter(&timesync_lock); + tid = timesync_tid; + timesync_tid = 0; + 
timesync_interval = 0; + mutex_exit(&timesync_lock); + (void) untimeout(tid); + /* Forget the dev info */ rmclomv_dip = NULL; rmc_comm_unregister(); @@ -3419,7 +3432,10 @@ plat_timesync(void *arg) (void) rmc_comm_request_nowait(&request, 0); - (void) timeout(plat_timesync, NULL, timesync_interval); + mutex_enter(&timesync_lock); + if (timesync_interval != 0) + timesync_tid = timeout(plat_timesync, NULL, timesync_interval); + mutex_exit(&timesync_lock); } /* diff --git a/usr/src/uts/sun4v/io/vnet.c b/usr/src/uts/sun4v/io/vnet.c index 191cfba92b..64f3c278f5 100644 --- a/usr/src/uts/sun4v/io/vnet.c +++ b/usr/src/uts/sun4v/io/vnet.c @@ -39,7 +39,7 @@ #include <sys/ethernet.h> #include <sys/dlpi.h> #include <net/if.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/ddi.h> #include <sys/sunddi.h> diff --git a/usr/src/uts/sun4v/io/vnet_gen.c b/usr/src/uts/sun4v/io/vnet_gen.c index 2a273019b8..b6671a36ad 100644 --- a/usr/src/uts/sun4v/io/vnet_gen.c +++ b/usr/src/uts/sun4v/io/vnet_gen.c @@ -42,7 +42,7 @@ #include <sys/sunddi.h> #include <sys/strsun.h> #include <sys/note.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/ldc.h> #include <sys/mach_descrip.h> diff --git a/usr/src/uts/sun4v/io/vsw.c b/usr/src/uts/sun4v/io/vsw.c index 27ad33ff66..fc3fdceeeb 100644 --- a/usr/src/uts/sun4v/io/vsw.c +++ b/usr/src/uts/sun4v/io/vsw.c @@ -53,12 +53,12 @@ #include <sys/machsystm.h> #include <sys/modctl.h> #include <sys/modhash.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/taskq.h> #include <sys/note.h> #include <sys/mach_descrip.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mdeg.h> #include <sys/ldc.h> #include <sys/vsw_fdb.h> @@ -78,7 +78,7 @@ static int vsw_attach(dev_info_t *, ddi_attach_cmd_t); static int vsw_detach(dev_info_t *, ddi_detach_cmd_t); static int vsw_get_md_physname(vsw_t *, md_t *, mde_cookie_t, char *); -static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *, int *); +static int vsw_get_md_smodes(vsw_t *, md_t *, mde_cookie_t, uint8_t *); /* MDEG routines */ static int vsw_mdeg_register(vsw_t *vswp); @@ -88,7 +88,7 @@ static int vsw_port_mdeg_cb(void *cb_argp, mdeg_result_t *); static int vsw_get_initial_md_properties(vsw_t *vswp, md_t *, mde_cookie_t); static int vsw_read_mdprops(vsw_t *vswp); static void vsw_vlan_read_ids(void *arg, int type, md_t *mdp, - mde_cookie_t node, uint16_t *pvidp, uint16_t **vidspp, + mde_cookie_t node, uint16_t *pvidp, vsw_vlanid_t **vidspp, uint16_t *nvidsp, uint16_t *default_idp); static int vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp, md_t *mdp, mde_cookie_t *node); @@ -99,6 +99,8 @@ static void vsw_mtu_read(vsw_t *vswp, md_t *mdp, mde_cookie_t node, static int vsw_mtu_update(vsw_t *vswp, uint32_t mtu); static void vsw_update_md_prop(vsw_t *, md_t *, mde_cookie_t); static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr); +static boolean_t vsw_cmp_vids(vsw_vlanid_t *vids1, + vsw_vlanid_t *vids2, int nvids); /* Mac driver related routines */ static int vsw_mac_register(vsw_t *); @@ -132,13 +134,9 @@ static int vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex, md_t *prev_mdp, mde_cookie_t prev_mdex); extern int vsw_port_attach(vsw_port_t *port); extern vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance); -extern int vsw_mac_attach(vsw_t *vswp); -extern void vsw_mac_detach(vsw_t *vswp); extern int vsw_mac_open(vsw_t *vswp); extern void vsw_mac_close(vsw_t *vswp); 
-extern int vsw_set_hw(vsw_t *, vsw_port_t *, int); -extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int); -extern void vsw_reconfig_hw(vsw_t *); +extern void vsw_mac_cleanup_ports(vsw_t *vswp); extern void vsw_unset_addrs(vsw_t *vswp); extern void vsw_setup_layer2_post_process(vsw_t *vswp); extern void vsw_create_vlans(void *arg, int type); @@ -150,6 +148,16 @@ extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt); extern mblk_t *vsw_vlan_frame_pretag(void *arg, int type, mblk_t *mp); extern void vsw_hio_cleanup(vsw_t *vswp); +extern void vsw_hio_start_ports(vsw_t *vswp); +extern void vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled); +extern int vsw_mac_multicast_add(vsw_t *, vsw_port_t *, mcst_addr_t *, int); +extern void vsw_mac_multicast_remove(vsw_t *, vsw_port_t *, mcst_addr_t *, int); +extern void vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid, + vsw_vlanid_t *new_vids, int new_nvids); +extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type); +extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type); +extern void vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids); extern void vsw_reset_ports(vsw_t *vswp); extern void vsw_port_reset(vsw_port_t *portp); void vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled); @@ -223,16 +231,6 @@ boolean_t vsw_hio_enabled = B_TRUE; /* Enable/disable HybridIO */ int vsw_hio_max_cleanup_retries = 10; /* Max retries for HybridIO cleanp */ int vsw_hio_cleanup_delay = 10000; /* 10ms */ -/* - * External tunables. - */ -/* - * Enable/disable thread per ring. This is a mode selection - * that is done a vsw driver attach time. - */ -boolean_t vsw_multi_ring_enable = B_FALSE; -int vsw_mac_rx_rings = VSW_MAC_RX_RINGS; - /* Number of transmit descriptors - must be power of 2 */ uint32_t vsw_ntxds = VSW_RING_NUM_EL; @@ -543,11 +541,11 @@ vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) vswp->instance = instance; ddi_set_driver_private(dip, (caddr_t)vswp); - mutex_init(&vswp->hw_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vswp->mac_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&vswp->swtmout_lock, NULL, MUTEX_DRIVER, NULL); + rw_init(&vswp->maccl_rwlock, NULL, RW_DRIVER, NULL); rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL); - rw_init(&vswp->mac_rwlock, NULL, RW_DRIVER, NULL); rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL); rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL); @@ -669,10 +667,9 @@ vsw_attach_fail: if (progress & PROG_swmode) { vsw_stop_switching_timeout(vswp); vsw_hio_cleanup(vswp); - WRITE_ENTER(&vswp->mac_rwlock); - vsw_mac_detach(vswp); + mutex_enter(&vswp->mac_lock); vsw_mac_close(vswp); - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); } if (progress & PROG_taskq) @@ -697,11 +694,11 @@ vsw_attach_fail: if (progress & PROG_locks) { rw_destroy(&vswp->plist.lockrw); rw_destroy(&vswp->mfdbrw); - rw_destroy(&vswp->mac_rwlock); rw_destroy(&vswp->if_lockrw); + rw_destroy(&vswp->maccl_rwlock); mutex_destroy(&vswp->swtmout_lock); mutex_destroy(&vswp->mca_lock); - mutex_destroy(&vswp->hw_lock); + mutex_destroy(&vswp->mac_lock); } ddi_soft_state_free(vsw_state, instance); @@ -736,6 +733,9 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) /* Stop any pending timeout to setup switching mode. 
*/ vsw_stop_switching_timeout(vswp); + /* Cleanup the interface's mac client */ + vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV); + if (vswp->if_state & VSW_IF_REG) { if (vsw_mac_unregister(vswp) != 0) { cmn_err(CE_WARN, "!vsw%d: Unable to detach from " @@ -746,13 +746,8 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) vsw_mdeg_unregister(vswp); - /* remove mac layer callback */ - WRITE_ENTER(&vswp->mac_rwlock); - if ((vswp->mh != NULL) && (vswp->mrh != NULL)) { - mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE); - vswp->mrh = NULL; - } - RW_EXIT(&vswp->mac_rwlock); + /* cleanup HybridIO */ + vsw_hio_cleanup(vswp); if (vsw_detach_ports(vswp) != 0) { cmn_err(CE_WARN, "!vsw%d: Unable to unconfigure ports", @@ -762,24 +757,19 @@ vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) rw_destroy(&vswp->if_lockrw); - /* cleanup HybridIO */ - vsw_hio_cleanup(vswp); - - mutex_destroy(&vswp->hw_lock); + vsw_mac_cleanup_ports(vswp); /* * Now that the ports have been deleted, stop and close * the physical device. */ - WRITE_ENTER(&vswp->mac_rwlock); - - vsw_mac_detach(vswp); + mutex_enter(&vswp->mac_lock); vsw_mac_close(vswp); + mutex_exit(&vswp->mac_lock); - RW_EXIT(&vswp->mac_rwlock); - - rw_destroy(&vswp->mac_rwlock); + mutex_destroy(&vswp->mac_lock); mutex_destroy(&vswp->swtmout_lock); + rw_destroy(&vswp->maccl_rwlock); /* * Destroy any free pools that may still exist. @@ -936,15 +926,12 @@ vsw_get_md_physname(vsw_t *vswp, md_t *mdp, mde_cookie_t node, char *name) /* * Read the 'vsw-switch-mode' property from the specified MD node. * - * Returns 0 on success and the number of modes found in 'found', - * otherwise returns 1. + * Returns 0 on success, otherwise returns 1. */ static int -vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, - uint8_t *modes, int *found) +vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, uint8_t *mode) { int len = 0; - int smode_num = 0; char *smode = NULL; char *curr_mode = NULL; @@ -956,7 +943,6 @@ vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, * first item in list. */ len = 0; - smode_num = 0; if (md_get_prop_data(mdp, node, smode_propname, (uint8_t **)(&smode), &len) != 0) { /* @@ -965,7 +951,6 @@ vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, */ cmn_err(CE_WARN, "!vsw%d: Unable to get switch mode property" " from the MD", vswp->instance); - *found = 0; return (1); } @@ -979,25 +964,24 @@ vsw_get_md_smodes(vsw_t *vswp, md_t *mdp, mde_cookie_t node, * 'routed' - layer 3 (i.e. IP) routing, underlying HW * in non-promiscuous mode. 
*/ - while ((curr_mode < (smode + len)) && (smode_num < NUM_SMODES)) { + while (curr_mode < (smode + len)) { D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode); if (strcmp(curr_mode, "switched") == 0) { - modes[smode_num++] = VSW_LAYER2; + *mode = VSW_LAYER2; } else if (strcmp(curr_mode, "promiscuous") == 0) { - modes[smode_num++] = VSW_LAYER2_PROMISC; + *mode = VSW_LAYER2 | VSW_LAYER2_PROMISC; } else if (strcmp(curr_mode, "routed") == 0) { - modes[smode_num++] = VSW_LAYER3; + *mode = VSW_LAYER3; } else { - DWARN(vswp, "%s: Unknown switch mode %s, " - "setting to default 'switched' mode", - __func__, curr_mode); - modes[smode_num++] = VSW_LAYER2; + cmn_err(CE_WARN, "!vsw%d: Unknown switch mode %s, " + "setting to default switched mode", + vswp->instance, curr_mode); + *mode = VSW_LAYER2; } curr_mode += strlen(curr_mode) + 1; } - *found = smode_num; - D2(vswp, "%s: %d modes found", __func__, smode_num); + D2(vswp, "%s: %d mode", __func__, *mode); D1(vswp, "%s: exit", __func__); @@ -1082,16 +1066,16 @@ vsw_m_stat(void *arg, uint_t stat, uint64_t *val) D1(vswp, "%s: enter", __func__); - WRITE_ENTER(&vswp->mac_rwlock); + mutex_enter(&vswp->mac_lock); if (vswp->mh == NULL) { - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); return (EINVAL); } /* return stats from underlying device */ *val = mac_stat_get(vswp->mh, stat); - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); return (0); } @@ -1107,14 +1091,8 @@ vsw_m_stop(void *arg) vswp->if_state &= ~VSW_IF_UP; RW_EXIT(&vswp->if_lockrw); - mutex_enter(&vswp->hw_lock); - - (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); - - if (vswp->recfg_reqd) - vsw_reconfig_hw(vswp); - - mutex_exit(&vswp->hw_lock); + /* Cleanup and close the mac client */ + vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV); D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state); } @@ -1122,6 +1100,7 @@ vsw_m_stop(void *arg) static int vsw_m_start(void *arg) { + int rv; vsw_t *vswp = (vsw_t *)arg; D1(vswp, "%s: enter", __func__); @@ -1143,9 +1122,13 @@ vsw_m_start(void *arg) /* if in layer2 mode, program unicast address. */ if (vswp->mh != NULL) { - mutex_enter(&vswp->hw_lock); - (void) vsw_set_hw(vswp, NULL, VSW_LOCALDEV); - mutex_exit(&vswp->hw_lock); + /* Init a mac client and program addresses */ + rv = vsw_mac_client_init(vswp, NULL, VSW_LOCALDEV); + if (rv != 0) { + cmn_err(CE_NOTE, + "!vsw%d: failed to program interface " + "unicast address\n", vswp->instance); + } } RW_EXIT(&vswp->if_lockrw); @@ -1211,29 +1194,21 @@ vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) * Call into the underlying driver to program the * address into HW. 
*/ - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL) { - ret = mac_multicst_add(vswp->mh, mca); - if (ret != 0) { - cmn_err(CE_NOTE, "!vsw%d: unable to " - "add multicast address", - vswp->instance); - RW_EXIT(&vswp->mac_rwlock); - (void) vsw_del_mcst(vswp, - VSW_LOCALDEV, addr, NULL); - kmem_free(mcst_p, sizeof (*mcst_p)); - return (ret); - } - mcst_p->mac_added = B_TRUE; + ret = vsw_mac_multicast_add(vswp, NULL, mcst_p, + VSW_LOCALDEV); + if (ret != 0) { + (void) vsw_del_mcst(vswp, + VSW_LOCALDEV, addr, NULL); + kmem_free(mcst_p, sizeof (*mcst_p)); + return (ret); } - RW_EXIT(&vswp->mac_rwlock); mutex_enter(&vswp->mca_lock); mcst_p->nextp = vswp->mcap; vswp->mcap = mcst_p; mutex_exit(&vswp->mca_lock); } else { - cmn_err(CE_NOTE, "!vsw%d: unable to add multicast " + cmn_err(CE_WARN, "!vsw%d: unable to add multicast " "address", vswp->instance); } return (ret); @@ -1252,12 +1227,7 @@ vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca) mcst_p = vsw_del_addr(VSW_LOCALDEV, vswp, addr); ASSERT(mcst_p != NULL); - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL && mcst_p->mac_added) { - (void) mac_multicst_remove(vswp->mh, mca); - mcst_p->mac_added = B_FALSE; - } - RW_EXIT(&vswp->mac_rwlock); + vsw_mac_multicast_remove(vswp, NULL, mcst_p, VSW_LOCALDEV); kmem_free(mcst_p, sizeof (*mcst_p)); } @@ -1685,8 +1655,7 @@ vsw_readmd_exit: static int vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) { - int i; - uint64_t macaddr = 0; + uint64_t macaddr = 0; D1(vswp, "%s: enter", __func__); @@ -1703,17 +1672,12 @@ vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) vsw_save_lmacaddr(vswp, macaddr); - if (vsw_get_md_smodes(vswp, mdp, node, vswp->smode, &vswp->smode_num)) { + if (vsw_get_md_smodes(vswp, mdp, node, &vswp->smode)) { DWARN(vswp, "%s: Unable to read %s property from MD, " "defaulting to 'switched' mode", __func__, smode_propname); - for (i = 0; i < NUM_SMODES; i++) - vswp->smode[i] = VSW_LAYER2; - - vswp->smode_num = NUM_SMODES; - } else { - ASSERT(vswp->smode_num != 0); + vswp->smode = VSW_LAYER2; } /* read mtu */ @@ -1751,7 +1715,7 @@ vsw_get_initial_md_properties(vsw_t *vswp, md_t *mdp, mde_cookie_t node) */ static void vsw_vlan_read_ids(void *arg, int type, md_t *mdp, mde_cookie_t node, - uint16_t *pvidp, uint16_t **vidspp, uint16_t *nvidsp, + uint16_t *pvidp, vsw_vlanid_t **vidspp, uint16_t *nvidsp, uint16_t *default_idp) { vsw_t *vswp; @@ -1823,11 +1787,12 @@ vsw_vlan_read_ids(void *arg, int type, md_t *mdp, mde_cookie_t node, if (nvids != 0) { D2(vswp, "%s: %s(%d): ", __func__, vid_propname, inst); - vids_size = sizeof (uint16_t) * nvids; + vids_size = sizeof (vsw_vlanid_t) * nvids; *vidspp = kmem_zalloc(vids_size, KM_SLEEP); for (i = 0; i < nvids; i++) { - (*vidspp)[i] = data[i] & 0xFFFF; - D2(vswp, " %d ", (*vidspp)[i]); + (*vidspp)[i].vl_vid = data[i] & 0xFFFF; + (*vidspp)[i].vl_set = B_FALSE; + D2(vswp, " %d ", (*vidspp)[i].vl_vid); } D2(vswp, "\n"); } @@ -1959,35 +1924,6 @@ vsw_mtu_update(vsw_t *vswp, uint32_t mtu) RW_EXIT(&vswp->if_lockrw); - WRITE_ENTER(&vswp->mac_rwlock); - - if (vswp->mh == 0) { - /* - * Physical device is not available yet; mtu will be - * updated after we open it successfully, as we have - * saved the new mtu. - */ - D2(vswp, "%s: Physical device:%s is not " - "available yet; can't update its mtu\n", - __func__, vswp->physname); - - } else { - - /* - * Stop and restart to enable the - * new mtu in the physical device. 
- */ - vsw_mac_detach(vswp); - rv = vsw_mac_attach(vswp); - if (rv != 0) { - RW_EXIT(&vswp->mac_rwlock); - return (EIO); - } - - } - - RW_EXIT(&vswp->mac_rwlock); - /* Reset ports to renegotiate with the new mtu */ vsw_reset_ports(vswp); @@ -2014,8 +1950,8 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) char physname[LIFNAMSIZ]; char drv[LIFNAMSIZ]; uint_t ddi_instance; - uint8_t new_smode[NUM_SMODES]; - int i, smode_num = 0; + uint8_t new_smode; + int i; uint64_t macaddr = 0; enum {MD_init = 0x1, MD_physname = 0x2, @@ -2025,7 +1961,7 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) MD_mtu = 0x20} updated; int rv; uint16_t pvid; - uint16_t *vids; + vsw_vlanid_t *vids; uint16_t nvids; uint32_t mtu; @@ -2099,25 +2035,16 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) /* * Check if switching modes have changed. */ - if (vsw_get_md_smodes(vswp, mdp, node, - new_smode, &smode_num)) { + if (vsw_get_md_smodes(vswp, mdp, node, &new_smode)) { cmn_err(CE_WARN, "!vsw%d: Unable to read %s property from MD", vswp->instance, smode_propname); goto fail_reconf; } else { - ASSERT(smode_num != 0); - if (smode_num != vswp->smode_num) { - D2(vswp, "%s: number of modes changed from %d to %d", - __func__, vswp->smode_num, smode_num); - } + if (new_smode != vswp->smode) { + D2(vswp, "%s: switching mode changed from %d to %d", + __func__, vswp->smode, new_smode); - for (i = 0; i < smode_num; i++) { - if (new_smode[i] != vswp->smode[i]) { - D2(vswp, "%s: mode changed from %d to %d", - __func__, vswp->smode[i], new_smode[i]); - updated |= MD_smode; - break; - } + updated |= MD_smode; } } @@ -2129,7 +2056,7 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) if ((pvid != vswp->pvid) || /* pvid changed? */ (nvids != vswp->nvids) || /* # of vids changed? */ ((nvids != 0) && (vswp->nvids != 0) && /* vids changed? */ - bcmp(vids, vswp->vids, sizeof (uint16_t) * nvids))) { + !vsw_cmp_vids(vids, vswp->vids, nvids))) { updated |= MD_vlans; } @@ -2149,7 +2076,7 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) * Now make any changes which are needed... */ - if (updated & (MD_physname | MD_smode)) { + if (updated & (MD_physname | MD_smode | MD_mtu)) { /* * Stop any pending timeout to setup switching mode. @@ -2161,19 +2088,17 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) /* * Remove unicst, mcst addrs of vsw interface - * and ports from the physdev. + * and ports from the physdev. This also closes + * the corresponding mac clients. */ vsw_unset_addrs(vswp); /* * Stop, detach and close the old device.. */ - WRITE_ENTER(&vswp->mac_rwlock); - - vsw_mac_detach(vswp); + mutex_enter(&vswp->mac_lock); vsw_mac_close(vswp); - - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); /* * Update phys name. @@ -2189,11 +2114,15 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) * Update array with the new switch mode values. 
*/ if (updated & MD_smode) { - for (i = 0; i < smode_num; i++) - vswp->smode[i] = new_smode[i]; + vswp->smode = new_smode; + } - vswp->smode_num = smode_num; - vswp->smode_idx = 0; + /* Update mtu */ + if (updated & MD_mtu) { + rv = vsw_mtu_update(vswp, mtu); + if (rv != 0) { + goto fail_update; + } } /* @@ -2237,24 +2166,9 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) READ_ENTER(&vswp->if_lockrw); if (vswp->if_state & VSW_IF_UP) { + /* reconfigure with new address */ + vsw_if_mac_reconfig(vswp, B_FALSE, 0, NULL, 0); - mutex_enter(&vswp->hw_lock); - /* - * Remove old mac address of vsw interface - * from the physdev - */ - (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); - /* - * Program new mac address of vsw interface - * in the physdev - */ - rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV); - mutex_exit(&vswp->hw_lock); - if (rv != 0) { - cmn_err(CE_NOTE, - "!vsw%d: failed to program interface " - "unicast address\n", vswp->instance); - } /* * Notify the MAC layer of the changed address. */ @@ -2270,32 +2184,24 @@ vsw_update_md_prop(vsw_t *vswp, md_t *mdp, mde_cookie_t node) /* Remove existing vlan ids from the hash table. */ vsw_vlan_remove_ids(vswp, VSW_LOCALDEV); - /* save the new vlan ids */ - vswp->pvid = pvid; - if (vswp->nvids != 0) { - kmem_free(vswp->vids, sizeof (uint16_t) * vswp->nvids); - vswp->nvids = 0; - } - if (nvids != 0) { - vswp->nvids = nvids; + if (vswp->if_state & VSW_IF_UP) { + vsw_if_mac_reconfig(vswp, B_TRUE, pvid, vids, nvids); + } else { + if (vswp->nvids != 0) { + kmem_free(vswp->vids, + sizeof (vsw_vlanid_t) * vswp->nvids); + } vswp->vids = vids; + vswp->nvids = nvids; + vswp->pvid = pvid; } /* add these new vlan ids into hash table */ vsw_vlan_add_ids(vswp, VSW_LOCALDEV); } else { if (nvids != 0) { - kmem_free(vids, sizeof (uint16_t) * nvids); - } - } - - if (updated & MD_mtu) { - - rv = vsw_mtu_update(vswp, mtu); - if (rv != 0) { - goto fail_update; + kmem_free(vids, sizeof (vsw_vlanid_t) * nvids); } - } return; @@ -2397,7 +2303,7 @@ vsw_port_read_props(vsw_port_t *portp, vsw_t *vswp, /* now update all properties into the port */ portp->p_vswp = vswp; portp->p_instance = inst; - portp->addr_set = VSW_ADDR_UNSET; + portp->addr_set = B_FALSE; ether_copy(&ea, &portp->p_macaddr); if (nchan > VSW_PORT_MAX_LDCS) { D2(vswp, "%s: using first of %d ldc ids", @@ -2466,7 +2372,7 @@ vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex, vsw_port_t *portp; boolean_t updated_vlans = B_FALSE; uint16_t pvid; - uint16_t *vids; + vsw_vlanid_t *vids; uint16_t nvids; uint64_t val; boolean_t hio_enabled = B_FALSE; @@ -2503,7 +2409,7 @@ vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex, if ((pvid != portp->pvid) || /* pvid changed? */ (nvids != portp->nvids) || /* # of vids changed? */ ((nvids != 0) && (portp->nvids != 0) && /* vids changed? */ - bcmp(vids, portp->vids, sizeof (uint16_t) * nvids))) { + !vsw_cmp_vids(vids, portp->vids, nvids))) { updated_vlans = B_TRUE; } @@ -2512,20 +2418,8 @@ vsw_port_update(vsw_t *vswp, md_t *curr_mdp, mde_cookie_t curr_mdex, /* Remove existing vlan ids from the hash table. 
*/ vsw_vlan_remove_ids(portp, VSW_VNETPORT); - /* save the new vlan ids */ - portp->pvid = pvid; - if (portp->nvids != 0) { - kmem_free(portp->vids, - sizeof (uint16_t) * portp->nvids); - portp->nvids = 0; - } - if (nvids != 0) { - portp->vids = kmem_zalloc(sizeof (uint16_t) * - nvids, KM_SLEEP); - bcopy(vids, portp->vids, sizeof (uint16_t) * nvids); - portp->nvids = nvids; - kmem_free(vids, sizeof (uint16_t) * nvids); - } + /* Reconfigure vlans with network device */ + vsw_mac_port_reconfig_vlans(portp, pvid, vids, nvids); /* add these new vlan ids into hash table */ vsw_vlan_add_ids(portp, VSW_VNETPORT); @@ -2628,3 +2522,23 @@ vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr) } RW_EXIT(&vswp->if_lockrw); } + +/* Compare VLAN ids, array size expected to be same. */ +static boolean_t +vsw_cmp_vids(vsw_vlanid_t *vids1, vsw_vlanid_t *vids2, int nvids) +{ + int i, j; + uint16_t vid; + + for (i = 0; i < nvids; i++) { + vid = vids1[i].vl_vid; + for (j = 0; j < nvids; j++) { + if (vid == vids2[i].vl_vid) + break; + } + if (j == nvids) { + return (B_FALSE); + } + } + return (B_TRUE); +} diff --git a/usr/src/uts/sun4v/io/vsw_hio.c b/usr/src/uts/sun4v/io/vsw_hio.c index 278896d977..084c338548 100644 --- a/usr/src/uts/sun4v/io/vsw_hio.c +++ b/usr/src/uts/sun4v/io/vsw_hio.c @@ -53,7 +53,7 @@ #include <sys/machsystm.h> #include <sys/modctl.h> #include <sys/modhash.h> -#include <sys/mac.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/taskq.h> #include <sys/note.h> @@ -80,9 +80,9 @@ extern int vsw_hio_cleanup_delay; /* Functions imported from other files */ extern int vsw_send_msg(vsw_ldc_t *, void *, int, boolean_t); -extern int vsw_set_hw(vsw_t *, vsw_port_t *, int); -extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int); extern void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate); +extern void vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids); /* Functions exported to other files */ void vsw_hio_init(vsw_t *vswp); @@ -104,11 +104,24 @@ static int vsw_send_dds_msg(vsw_ldc_t *ldcp, uint8_t dds_subclass, uint64_t cookie, uint64_t macaddr, uint32_t req_id); static int vsw_send_dds_resp_msg(vsw_ldc_t *ldcp, vio_dds_msg_t *dmsg, int ack); static int vsw_hio_send_delshare_msg(vsw_share_t *vsharep); -static int vsw_hio_bind_macaddr(vsw_share_t *vsharep); -static void vsw_hio_unbind_macaddr(vsw_share_t *vsharep); static boolean_t vsw_hio_reboot_callb(void *arg, int code); static boolean_t vsw_hio_panic_callb(void *arg, int code); +/* + * Locking strategy for HybridIO is followed as below: + * + * - As the Shares are associated with a network device, the + * the global lock('vswp>mac_lock') is used for all Shares + * related operations. + * - The 'port->maccl_rwlock' is used to synchronize only the + * the operations that operate on that port's mac client. That + * is, the share_bind and unbind operations only. 
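[Editor's note] One thing to watch in the new vsw_cmp_vids() above: the inner loop compares against vids2[i].vl_vid using the outer index, so each inner iteration re-tests the same element. The net effect is that the function returns B_TRUE only when the arrays match positionally — i.e. it degenerates to the bcmp()-style ordered check it replaces, and a reordered-but-identical set of ids is still flagged as a change (triggering an unnecessary VLAN reconfiguration, though nothing worse). A corrected user-level version of the intended order-insensitive comparison, keeping the same equal-length contract and O(nvids^2) shape:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

typedef struct { uint16_t vl_vid; bool vl_set; } vsw_vlanid_t;

/* order-insensitive: every id in vids1 must appear somewhere in vids2 */
static bool
vsw_cmp_vids_fixed(vsw_vlanid_t *vids1, vsw_vlanid_t *vids2, int nvids)
{
	for (int i = 0; i < nvids; i++) {
		int j;
		for (j = 0; j < nvids; j++)
			if (vids1[i].vl_vid == vids2[j].vl_vid)	/* j, not i */
				break;
		if (j == nvids)
			return (false);
	}
	return (true);
}

int
main(void)
{
	vsw_vlanid_t a[] = { {10, 0}, {20, 0} };
	vsw_vlanid_t b[] = { {20, 0}, {10, 0} };	/* same set, reordered */
	printf("%d\n", (int)vsw_cmp_vids_fixed(a, b, 2));	/* prints 1 */
	return (0);
}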
+ * + * - The locking hierarchy follows that the global mac_lock is + * acquired first and then the ports mac client lock(maccl_rwlock) + */ + + static kstat_t *vsw_hio_setup_kstats(char *ks_mod, char *ks_name, vsw_t *vswp); static void vsw_hio_destroy_kstats(vsw_t *vswp); static int vsw_hio_kstats_update(kstat_t *ksp, int rw); @@ -122,32 +135,23 @@ void vsw_hio_init(vsw_t *vswp) { vsw_hio_t *hiop = &vswp->vhio; + int num_shares; int i; - int rv; + ASSERT(MUTEX_HELD(&vswp->mac_lock)); D1(vswp, "%s:enter\n", __func__); - mutex_enter(&vswp->hw_lock); if (vsw_hio_enabled == B_FALSE) { - mutex_exit(&vswp->hw_lock); return; } vswp->hio_capable = B_FALSE; - rv = mac_capab_get(vswp->mh, MAC_CAPAB_SHARES, &hiop->vh_scapab); - if (rv == B_FALSE) { + num_shares = mac_share_capable(vswp->mh); + if (num_shares == 0) { D2(vswp, "%s: %s is not HybridIO capable\n", __func__, vswp->physname); - mutex_exit(&vswp->hw_lock); return; } - rv = mac_capab_get(vswp->mh, MAC_CAPAB_RINGS, &hiop->vh_rcapab); - if (rv == B_FALSE) { - DWARN(vswp, "%s: %s has no RINGS capability\n", __func__, - vswp->physname); - mutex_exit(&vswp->hw_lock); - return; - } - hiop->vh_num_shares = hiop->vh_scapab.ms_snum; + hiop->vh_num_shares = num_shares; hiop->vh_shares = kmem_zalloc((sizeof (vsw_share_t) * hiop->vh_num_shares), KM_SLEEP); for (i = 0; i < hiop->vh_num_shares; i++) { @@ -176,7 +180,6 @@ vsw_hio_init(vsw_t *vswp) D2(vswp, "%s: %s is HybridIO capable num_shares=%d\n", __func__, vswp->physname, hiop->vh_num_shares); D1(vswp, "%s:exit\n", __func__); - mutex_exit(&vswp->hw_lock); } /* @@ -187,13 +190,9 @@ vsw_hio_init(vsw_t *vswp) static vsw_share_t * vsw_hio_alloc_share(vsw_t *vswp, vsw_ldc_t *ldcp) { - vsw_hio_t *hiop = &vswp->vhio; - mac_capab_share_t *hcapab = &hiop->vh_scapab; vsw_share_t *vsharep; vsw_port_t *portp = ldcp->ldc_port; uint64_t ldc_id = ldcp->ldc_id; - uint32_t rmin, rmax; - uint64_t rmap; int rv; D1(vswp, "%s:enter\n", __func__); @@ -202,39 +201,19 @@ vsw_hio_alloc_share(vsw_t *vswp, vsw_ldc_t *ldcp) /* No free shares available */ return (NULL); } - /* - * Allocate a Share - it will come with rings/groups - * already assigned to it. - */ - rv = hcapab->ms_salloc(hcapab->ms_handle, ldc_id, - &vsharep->vs_cookie, &vsharep->vs_shdl); + + WRITE_ENTER(&portp->maccl_rwlock); + rv = mac_share_bind(portp->p_mch, ldc_id, &vsharep->vs_cookie); + RW_EXIT(&portp->maccl_rwlock); if (rv != 0) { - D2(vswp, "Alloc a share failed for ldc=0x%lx rv=%d", - ldc_id, rv); return (NULL); } - /* - * Query the RX group number to bind the port's - * MAC address to it. - */ - hcapab->ms_squery(vsharep->vs_shdl, MAC_RING_TYPE_RX, - &rmin, &rmax, &rmap, &vsharep->vs_gnum); - /* Cache some useful info */ vsharep->vs_ldcid = ldcp->ldc_id; vsharep->vs_macaddr = vnet_macaddr_strtoul( portp->p_macaddr.ether_addr_octet); vsharep->vs_portp = ldcp->ldc_port; - - /* Bind the Guest's MAC address */ - rv = vsw_hio_bind_macaddr(vsharep); - if (rv != 0) { - /* something went wrong, cleanup */ - hcapab->ms_sfree(vsharep->vs_shdl); - return (NULL); - } - vsharep->vs_state |= VSW_SHARE_ASSIGNED; D1(vswp, "%s:exit\n", __func__); @@ -242,61 +221,6 @@ vsw_hio_alloc_share(vsw_t *vswp, vsw_ldc_t *ldcp) } /* - * vsw_hio_bind_macaddr -- Remove the port's MAC address from the - * physdev and bind it to the Share's RX group. 
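[Editor's note] The locking comment above pins the order: vswp->mac_lock first, then a port's maccl_rwlock — exactly what vsw_hio_alloc_share() does when it takes the port lock around mac_share_bind() while its caller already holds mac_lock. A small pthreads model of that hierarchy, with user-level stand-ins for the kernel mutex and rwlock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mac_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t maccl_rwlock = PTHREAD_RWLOCK_INITIALIZER;

/* device-wide op that also touches one client: mac_lock, then rwlock */
static void
hio_alloc_share(void)
{
	pthread_mutex_lock(&mac_lock);		/* 1st: device-wide state */
	pthread_rwlock_wrlock(&maccl_rwlock);	/* 2nd: this client only */
	printf("bind share to mac client\n");
	pthread_rwlock_unlock(&maccl_rwlock);
	pthread_mutex_unlock(&mac_lock);
}

/* client-only op may take just the rwlock -- never the reverse order */
static void
share_unbind(void)
{
	pthread_rwlock_wrlock(&maccl_rwlock);
	printf("unbind share\n");
	pthread_rwlock_unlock(&maccl_rwlock);
}

int
main(void)
{
	hio_alloc_share();
	share_unbind();
	return (0);
}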
- */ -static int -vsw_hio_bind_macaddr(vsw_share_t *vsharep) -{ - vsw_t *vswp = vsharep->vs_vswp; - vsw_port_t *portp = vsharep->vs_portp; - mac_capab_rings_t *rcapab = &vswp->vhio.vh_rcapab; - mac_group_info_t *ginfop = &vsharep->vs_rxginfo; - int rv; - - /* Get the RX groupinfo */ - rcapab->mr_gget(rcapab->mr_handle, MAC_RING_TYPE_RX, - vsharep->vs_gnum, &vsharep->vs_rxginfo, NULL); - - /* Unset the MAC address first */ - if (portp->addr_set != VSW_ADDR_UNSET) { - (void) vsw_unset_hw(vswp, portp, VSW_VNETPORT); - } - - /* Bind the MAC address to the RX group */ - rv = ginfop->mrg_addmac(ginfop->mrg_driver, - (uint8_t *)&portp->p_macaddr.ether_addr_octet); - if (rv != 0) { - /* Restore the address back as it was */ - (void) vsw_set_hw(vswp, portp, VSW_VNETPORT); - return (rv); - } - return (0); -} - -/* - * vsw_hio_unbind_macaddr -- Unbind the port's MAC address and restore - * it back as it was before. - */ -static void -vsw_hio_unbind_macaddr(vsw_share_t *vsharep) -{ - vsw_t *vswp = vsharep->vs_vswp; - vsw_port_t *portp = vsharep->vs_portp; - mac_group_info_t *ginfop = &vsharep->vs_rxginfo; - - if (portp == NULL) { - return; - } - /* Unbind the MAC address from the RX group */ - (void) ginfop->mrg_remmac(ginfop->mrg_driver, - (uint8_t *)&portp->p_macaddr.ether_addr_octet); - - /* Program the MAC address back */ - (void) vsw_set_hw(vswp, portp, VSW_VNETPORT); -} - -/* * vsw_hio_find_free_share -- Find a free Share. */ static vsw_share_t * @@ -380,16 +304,13 @@ static void vsw_hio_free_share(vsw_share_t *vsharep) { vsw_t *vswp = vsharep->vs_vswp; - vsw_hio_t *hiop = &vswp->vhio; - mac_capab_share_t *hcapab = &hiop->vh_scapab; + vsw_port_t *portp = vsharep->vs_portp; D1(vswp, "%s:enter\n", __func__); - /* First unbind the MAC address and restore it back */ - vsw_hio_unbind_macaddr(vsharep); - - /* free share */ - hcapab->ms_sfree(vsharep->vs_shdl); + WRITE_ENTER(&portp->maccl_rwlock); + mac_share_unbind(portp->p_mch); + RW_EXIT(&portp->maccl_rwlock); vsharep->vs_state = VSW_SHARE_FREE; vsharep->vs_macaddr = 0; @@ -455,7 +376,7 @@ vsw_hio_free_all_shares(vsw_t *vswp, boolean_t reboot) * HybridIO. */ READ_ENTER(&plist->lockrw); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); /* * first clear the hio_capable flag so that no more * HybridIO operations are initiated. @@ -515,9 +436,9 @@ vsw_hio_free_all_shares(vsw_t *vswp, boolean_t reboot) * This delay is also needed for the port reset to * release the Hybrid resource. 
*/ - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); drv_usecwait(vsw_hio_cleanup_delay); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); max_retries--; } while ((free_shares < hiop->vh_num_shares) && (max_retries > 0)); @@ -532,7 +453,7 @@ vsw_hio_free_all_shares(vsw_t *vswp, boolean_t reboot) kmem_free(hiop->vh_shares, sizeof (vsw_share_t) * hiop->vh_num_shares); hiop->vh_shares = NULL; hiop->vh_num_shares = 0; - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); RW_EXIT(&plist->lockrw); D1(vswp, "%s:exit\n", __func__); } @@ -560,12 +481,12 @@ vsw_hio_start_ports(vsw_t *vswp) } reset = B_FALSE; - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); vsharep = vsw_hio_find_vshare_port(vswp, portp); if (vsharep == NULL) { reset = B_TRUE; } - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); if (reset == B_TRUE) { /* Cause a rest to trigger HybridIO setup */ @@ -586,9 +507,9 @@ vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp) int rv; D1(vswp, "%s:enter ldc=0x%lx", __func__, ldcp->ldc_id); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); if (vswp->hio_capable == B_FALSE) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D2(vswp, "%s:not HIO capable", __func__); return; } @@ -596,14 +517,14 @@ vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp) /* Verify if a share was already allocated */ vsharep = vsw_hio_find_vshare_ldcid(vswp, ldcp->ldc_id); if (vsharep != NULL) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D2(vswp, "%s:Share already allocated to ldc=0x%lx", __func__, ldcp->ldc_id); return; } vsharep = vsw_hio_alloc_share(vswp, ldcp); if (vsharep == NULL) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D2(vswp, "%s: no Share available for ldc=0x%lx", __func__, ldcp->ldc_id); return; @@ -616,12 +537,12 @@ vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp) * Failed to send a DDS message, so cleanup now. 
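[Editor's note] The cleanup loop above polls for shares to drain: it drops vswp->mac_lock, waits (drv_usecwait), re-takes the lock and re-checks with a bounded retry count, so the DDS acks that actually free the shares can make progress while it waits. A user-level model of that bounded drop-wait-recheck loop, with usleep() standing in for drv_usecwait() and a worker thread standing in for DDS processing:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t mac_lock = PTHREAD_MUTEX_INITIALIZER;
static int free_shares, num_shares = 4;

static void *
dds_worker(void *arg)	/* models DDS acks freeing shares elsewhere */
{
	(void) arg;
	for (int i = 0; i < num_shares; i++) {
		usleep(1000);
		pthread_mutex_lock(&mac_lock);
		free_shares++;
		pthread_mutex_unlock(&mac_lock);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	int max_retries = 10;

	pthread_create(&t, NULL, dds_worker, NULL);
	pthread_mutex_lock(&mac_lock);
	do {
		if (free_shares >= num_shares)
			break;
		/* drop the lock so the worker can free shares */
		pthread_mutex_unlock(&mac_lock);
		usleep(2000);	/* drv_usecwait() stand-in */
		pthread_mutex_lock(&mac_lock);
	} while (free_shares < num_shares && --max_retries > 0);
	printf("free=%d retries_left=%d\n", free_shares, max_retries);
	pthread_mutex_unlock(&mac_lock);
	pthread_join(t, NULL);
	return (0);
}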
*/ vsw_hio_free_share(vsharep); - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } vsharep->vs_state &= ~VSW_SHARE_DDS_ACKD; vsharep->vs_state |= VSW_SHARE_DDS_SENT; - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); /* DERR only to print by default */ DERR(vswp, "Share allocated for ldc_id=0x%lx Cookie=0x%lX", @@ -640,16 +561,16 @@ vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp) D1(vswp, "%s:enter ldc=0x%lx", __func__, ldcp->ldc_id); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); vsharep = vsw_hio_find_vshare_ldcid(vswp, ldcp->ldc_id); if (vsharep == NULL) { D1(vswp, "%s:no share found for ldc=0x%lx", __func__, ldcp->ldc_id); - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } vsw_hio_free_share(vsharep); - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D1(vswp, "%s:exit ldc=0x%lx", __func__, ldcp->ldc_id); } @@ -669,12 +590,12 @@ vsw_hio_send_delshare_msg(vsw_share_t *vsharep) uint64_t macaddr = vsharep->vs_macaddr; int rv; - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - mutex_exit(&vswp->hw_lock); + ASSERT(MUTEX_HELD(&vswp->mac_lock)); + mutex_exit(&vswp->mac_lock); portp = vsharep->vs_portp; if (portp == NULL) { - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); return (0); } @@ -683,7 +604,7 @@ vsw_hio_send_delshare_msg(vsw_share_t *vsharep) ldcp = ldcl->head; if ((ldcp == NULL) || (ldcp->ldc_id != vsharep->vs_ldcid)) { RW_EXIT(&ldcl->lockrw); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); return (0); } req_id = VSW_DDS_NEXT_REQID(vsharep); @@ -691,7 +612,7 @@ vsw_hio_send_delshare_msg(vsw_share_t *vsharep) cookie, macaddr, req_id); RW_EXIT(&ldcl->lockrw); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); if (rv == 0) { vsharep->vs_state &= ~VSW_SHARE_DDS_ACKD; vsharep->vs_state |= VSW_SHARE_DDS_SENT; @@ -740,14 +661,14 @@ vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg) /* discard */ return; } - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); /* * We expect to receive DDS messages only from guests that * have HybridIO started. */ vsharep = vsw_hio_find_vshare_ldcid(vswp, ldcp->ldc_id); if (vsharep == NULL) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } @@ -816,7 +737,7 @@ vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg) __func__, dmsg->dds_subclass); break; } - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D1(vswp, "%s:exit ldc=0x%lx\n", __func__, ldcp->ldc_id); } @@ -857,8 +778,12 @@ vsw_hio_port_update(vsw_port_t *portp, boolean_t hio_enabled) /* Hybrid Mode is disabled, so stop HybridIO */ vsw_hio_stop_port(portp); portp->p_hio_enabled = B_FALSE; + + vsw_port_mac_reconfig(portp, B_FALSE, 0, NULL, 0); } else { portp->p_hio_enabled = B_TRUE; + vsw_port_mac_reconfig(portp, B_FALSE, 0, NULL, 0); + /* reset the port to initiate HybridIO setup */ vsw_hio_port_reset(portp, B_FALSE); } @@ -877,16 +802,16 @@ vsw_hio_stop_port(vsw_port_t *portp) int max_retries = vsw_hio_max_cleanup_retries; D1(vswp, "%s:enter\n", __func__); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); if (vswp->hio_capable == B_FALSE) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } vsharep = vsw_hio_find_vshare_port(vswp, portp); if (vsharep == NULL) { - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); return; } @@ -925,9 +850,9 @@ vsw_hio_stop_port(vsw_port_t *portp) * messages come and get processed, that is, shares * get freed. 
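[Editor's note] vsw_hio_send_delshare_msg() above documents its contract with ASSERT(MUTEX_HELD(&vswp->mac_lock)): it is entered with the lock held, deliberately drops it across the LDC send (which can block), and re-takes it before touching vs_state again. The key discipline is that everything the send needs is copied out before the drop, and shared state is only re-examined after reacquisition — the surrounding retry loops recheck the share's port and state for exactly this reason. A compact model:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mac_lock = PTHREAD_MUTEX_INITIALIZER;

struct share { unsigned long cookie, macaddr; int state; };

static int
send_msg(unsigned long cookie, unsigned long mac)	/* may block */
{
	printf("send delshare cookie=%lx mac=%lx\n", cookie, mac);
	return (0);
}

/* caller holds mac_lock on entry and on return, as the driver asserts */
static int
send_delshare(struct share *s)
{
	/* copy out what the send needs while still protected */
	unsigned long cookie = s->cookie, mac = s->macaddr;
	int rv;

	pthread_mutex_unlock(&mac_lock);	/* don't hold across I/O */
	rv = send_msg(cookie, mac);
	pthread_mutex_lock(&mac_lock);		/* re-take before vs_state */
	if (rv == 0)
		s->state = 1;	/* e.g. VSW_SHARE_DDS_SENT */
	return (rv);
}

int
main(void)
{
	struct share s = { 0xabc, 0xdef, 0 };
	pthread_mutex_lock(&mac_lock);
	(void) send_delshare(&s);
	pthread_mutex_unlock(&mac_lock);
	printf("state=%d\n", s.state);
	return (0);
}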
*/ - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); drv_usecwait(vsw_hio_cleanup_delay); - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); /* Check if the share still assigned to this port */ if ((vsharep->vs_portp != portp) || @@ -937,7 +862,7 @@ vsw_hio_stop_port(vsw_port_t *portp) max_retries--; } while ((vsharep->vs_state != VSW_SHARE_FREE) && (max_retries > 0)); - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); D1(vswp, "%s:exit\n", __func__); } @@ -1111,7 +1036,7 @@ vsw_hio_kstats_update(kstat_t *ksp, int rw) return (0); } - mutex_enter(&vswp->hw_lock); + mutex_enter(&vswp->mac_lock); hiokp->hio_num_shares.value.ul = (uint32_t)hiop->vh_num_shares; for (i = 0; i < hiop->vh_num_shares; i++) { hiokp->share[i].assigned.value.ul = @@ -1119,7 +1044,7 @@ vsw_hio_kstats_update(kstat_t *ksp, int rw) hiokp->share[i].state.value.ul = hiop->vh_shares[i].vs_state; } - mutex_exit(&vswp->hw_lock); + mutex_exit(&vswp->mac_lock); } else { return (EACCES); } diff --git a/usr/src/uts/sun4v/io/vsw_ldc.c b/usr/src/uts/sun4v/io/vsw_ldc.c index e2273596a1..bfd6dde2fb 100644 --- a/usr/src/uts/sun4v/io/vsw_ldc.c +++ b/usr/src/uts/sun4v/io/vsw_ldc.c @@ -58,7 +58,6 @@ #include <sys/taskq.h> #include <sys/note.h> #include <sys/mach_descrip.h> -#include <sys/mac.h> #include <sys/mdeg.h> #include <sys/ldc.h> #include <sys/vsw_fdb.h> @@ -88,7 +87,7 @@ int vsw_detach_ports(vsw_t *vswp); int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node); mcst_addr_t *vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr); int vsw_port_detach(vsw_t *vswp, int p_instance); -int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count); +int vsw_portsend(vsw_port_t *port, mblk_t *mp); int vsw_port_attach(vsw_port_t *portp); vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance); void vsw_vlan_unaware_port_reset(vsw_port_t *portp); @@ -165,7 +164,6 @@ static void vsw_stop_rx_thread(vsw_ldc_t *ldcp); static void vsw_ldc_rx_worker(void *arg); /* Misc support routines */ -static caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf); static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t); static void vsw_free_ring(dring_info_t *); static void vsw_save_lmacaddr(vsw_t *vswp, uint64_t macaddr); @@ -183,8 +181,7 @@ static void display_ring(dring_info_t *); * Functions imported from other files. 
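[Editor's note] The kstat handler above follows the usual ks_update protocol: reject KSTAT_WRITE with EACCES, and for reads take a consistent snapshot of the per-share state under the same mac_lock that protects it. A minimal model of that callback shape (the KSTAT_READ/KSTAT_WRITE values here are placeholders, not the kernel's):

#include <pthread.h>
#include <stdio.h>
#include <errno.h>

#define	KSTAT_READ	0
#define	KSTAT_WRITE	1

static pthread_mutex_t mac_lock = PTHREAD_MUTEX_INITIALIZER;
static int share_state[2] = { 3, 0 };	/* guarded by mac_lock */

static int
hio_kstats_update(int *snapshot, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);	/* stats are read-only */

	pthread_mutex_lock(&mac_lock);	/* consistent snapshot */
	for (int i = 0; i < 2; i++)
		snapshot[i] = share_state[i];
	pthread_mutex_unlock(&mac_lock);
	return (0);
}

int
main(void)
{
	int snap[2];
	printf("write -> %d\n", hio_kstats_update(snap, KSTAT_WRITE));
	printf("read  -> %d (s0=%d)\n",
	    hio_kstats_update(snap, KSTAT_READ), snap[0]);
	return (0);
}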
*/ extern int vsw_set_hw(vsw_t *, vsw_port_t *, int); -extern int vsw_unset_hw(vsw_t *, vsw_port_t *, int); -extern void vsw_reconfig_hw(vsw_t *); +extern void vsw_unset_hw(vsw_t *, vsw_port_t *, int); extern int vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port); extern void vsw_del_mcst_port(vsw_port_t *port); extern int vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg); @@ -205,7 +202,10 @@ extern void vsw_hio_start(vsw_t *vswp, vsw_ldc_t *ldcp); extern void vsw_hio_stop(vsw_t *vswp, vsw_ldc_t *ldcp); extern void vsw_process_dds_msg(vsw_t *vswp, vsw_ldc_t *ldcp, void *msg); extern void vsw_hio_stop_port(vsw_port_t *portp); -extern void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr); +extern void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp); +extern int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type); +extern void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type); + #define VSW_NUM_VMPOOLS 3 /* number of vio mblk pools */ @@ -309,6 +309,7 @@ vsw_port_attach(vsw_port_t *port) int i; int nids = port->num_ldcs; uint64_t *ldcids; + int rv; D1(vswp, "%s: enter : port %d", __func__, port->p_instance); @@ -328,6 +329,7 @@ vsw_port_attach(vsw_port_t *port) mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL); + rw_init(&port->maccl_rwlock, NULL, RW_DRIVER, NULL); mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&port->state_cv, NULL, CV_DRIVER, NULL); @@ -339,29 +341,20 @@ vsw_port_attach(vsw_port_t *port) D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]); if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) { DERR(vswp, "%s: ldc_attach failed", __func__); - - rw_destroy(&port->p_ldclist.lockrw); - - cv_destroy(&port->state_cv); - mutex_destroy(&port->state_lock); - - mutex_destroy(&port->tx_lock); - mutex_destroy(&port->mca_lock); - kmem_free(port, sizeof (vsw_port_t)); - return (1); + goto exit_error; } } if (vswp->switching_setup_done == B_TRUE) { /* - * If the underlying physical device has been setup, - * program the mac address of this port in it. - * Otherwise, port macaddr will be set after the physical - * device is successfully setup by the timeout handler. + * If the underlying network device has been setup, + * then open a mac client and porgram the mac address + * for this port. 
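[Editor's note] vsw_port_attach() now funnels every failure — a failed vsw_ldc_attach() as well as a failed vsw_mac_client_init() — to a single exit_error label that destroys the locks and frees the port in reverse order of construction, replacing the inline cleanup the old ldc-attach failure path duplicated. The idiom, reduced to a compilable sketch with pthread stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct port { pthread_mutex_t tx_lock, mca_lock; pthread_rwlock_t maccl; };

static int
port_attach(int fail_step)
{
	struct port *p = calloc(1, sizeof (*p));

	pthread_mutex_init(&p->tx_lock, NULL);
	pthread_mutex_init(&p->mca_lock, NULL);
	pthread_rwlock_init(&p->maccl, NULL);

	if (fail_step == 1)	/* e.g. ldc attach failed */
		goto exit_error;
	if (fail_step == 2)	/* e.g. mac client init failed */
		goto exit_error;
	return (0);	/* on success the port lives until detach */

exit_error:
	/* one cleanup path, reverse order of construction */
	pthread_rwlock_destroy(&p->maccl);
	pthread_mutex_destroy(&p->mca_lock);
	pthread_mutex_destroy(&p->tx_lock);
	free(p);
	return (1);
}

int
main(void)
{
	printf("fail=%d\n", port_attach(2));
	return (0);
}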
*/ - mutex_enter(&vswp->hw_lock); - (void) vsw_set_hw(vswp, port, VSW_VNETPORT); - mutex_exit(&vswp->hw_lock); + rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT); + if (rv != 0) { + goto exit_error; + } } /* create the fdb entry for this port/mac address */ @@ -386,11 +379,23 @@ vsw_port_attach(vsw_port_t *port) /* announce macaddr of vnet to the physical switch */ if (vsw_publish_macaddr_count != 0) { /* enabled */ - vsw_publish_macaddr(vswp, (uint8_t *)&(port->p_macaddr)); + vsw_publish_macaddr(vswp, port); } D1(vswp, "%s: exit", __func__); return (0); + +exit_error: + rw_destroy(&port->p_ldclist.lockrw); + + cv_destroy(&port->state_cv); + mutex_destroy(&port->state_lock); + + rw_destroy(&port->maccl_rwlock); + mutex_destroy(&port->tx_lock); + mutex_destroy(&port->mca_lock); + kmem_free(port, sizeof (vsw_port_t)); + return (1); } /* @@ -427,6 +432,9 @@ vsw_port_detach(vsw_t *vswp, int p_instance) */ RW_EXIT(&plist->lockrw); + /* Cleanup and close the mac client */ + vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT); + /* Remove the fdb entry for this port/mac address */ vsw_fdbe_del(vswp, &(port->p_macaddr)); vsw_destroy_vlans(port, VSW_VNETPORT); @@ -434,23 +442,6 @@ vsw_port_detach(vsw_t *vswp, int p_instance) /* Remove any multicast addresses.. */ vsw_del_mcst_port(port); - /* Remove address if was programmed into HW. */ - mutex_enter(&vswp->hw_lock); - - /* - * Port's address may not have been set in hardware. This could - * happen if the underlying physical device is not yet available and - * vsw_setup_switching_timeout() may be in progress. - * We remove its addr from hardware only if it has been set before. - */ - if (port->addr_set != VSW_ADDR_UNSET) - (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); - - if (vswp->recfg_reqd) - vsw_reconfig_hw(vswp); - - mutex_exit(&vswp->hw_lock); - if (vsw_port_delete(port)) { return (1); } @@ -482,10 +473,8 @@ vsw_detach_ports(vsw_t *vswp) return (1); } - /* Remove address if was programmed into HW. 
*/ - mutex_enter(&vswp->hw_lock); - (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); - mutex_exit(&vswp->hw_lock); + /* Cleanup and close the mac client */ + vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT); /* Remove the fdb entry for this port/mac address */ vsw_fdbe_del(vswp, &(port->p_macaddr)); @@ -560,6 +549,7 @@ vsw_port_delete(vsw_port_t *port) rw_destroy(&port->p_ldclist.lockrw); + rw_destroy(&port->maccl_rwlock); mutex_destroy(&port->mca_lock); mutex_destroy(&port->tx_lock); @@ -570,6 +560,11 @@ vsw_port_delete(vsw_port_t *port) kmem_free(port->ldc_ids, port->num_ldcs * sizeof (uint64_t)); port->num_ldcs = 0; } + + if (port->nvids != 0) { + kmem_free(port->vids, sizeof (vsw_vlanid_t) * port->nvids); + } + kmem_free(port, sizeof (vsw_port_t)); D1(vswp, "%s: exit", __func__); @@ -4205,12 +4200,13 @@ vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t *tagp) /* transmit the packet over the given port */ int -vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count) +vsw_portsend(vsw_port_t *port, mblk_t *mp) { vsw_ldc_list_t *ldcl = &port->p_ldclist; vsw_ldc_t *ldcp; + mblk_t *mpt; + int count; int status = 0; - uint32_t n; READ_ENTER(&ldcl->lockrw); /* @@ -4224,18 +4220,13 @@ vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, uint32_t count) return (1); } - n = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt); + count = vsw_vlan_frame_untag(port, VSW_VNETPORT, &mp, &mpt); - count -= n; - if (count == 0) { - goto vsw_portsend_exit; + if (count != 0) { + status = ldcp->tx(ldcp, mp, mpt, count); } - status = ldcp->tx(ldcp, mp, mpt, count); - -vsw_portsend_exit: RW_EXIT(&ldcl->lockrw); - return (status); } @@ -5735,14 +5726,6 @@ vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg) } -static caddr_t -vsw_print_ethaddr(uint8_t *a, char *ebuf) -{ - (void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x", - a[0], a[1], a[2], a[3], a[4], a[5]); - return (ebuf); -} - /* * Reset and free all the resources associated with * the channel. diff --git a/usr/src/uts/sun4v/io/vsw_phys.c b/usr/src/uts/sun4v/io/vsw_phys.c index 962ccc1cb9..127e1635c1 100644 --- a/usr/src/uts/sun4v/io/vsw_phys.c +++ b/usr/src/uts/sun4v/io/vsw_phys.c @@ -55,7 +55,8 @@ #include <sys/machsystm.h> #include <sys/modctl.h> #include <sys/modhash.h> -#include <sys/mac.h> +#include <sys/mac_client.h> +#include <sys/mac_provider.h> #include <sys/mac_ether.h> #include <sys/taskq.h> #include <sys/note.h> @@ -63,134 +64,133 @@ #include <sys/mac.h> #include <sys/mdeg.h> #include <sys/vsw.h> +#include <sys/vlan.h> /* MAC Ring table functions. 
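[Editor's note] The reworked vsw_portsend() above computes the packet count itself from vsw_vlan_frame_untag() instead of trusting a caller-supplied count, and only calls the channel's tx routine when something survived untagging. A loose user-level model of that filter-then-send contract — a singly linked chain stands in for mblk chains, and "drop frames on vid 0" stands in for the real untagging/filtering, which this sketch does not reproduce:

#include <stdio.h>
#include <stddef.h>

struct pkt { int vid; struct pkt *next; };

/* stand-in filter: return survivors' count plus new head/tail */
static int
frame_untag(struct pkt **head, struct pkt **tail)
{
	struct pkt *p, *prev = NULL, *h = NULL;
	int count = 0;

	for (p = *head; p != NULL; p = p->next) {
		if (p->vid == 0)
			continue;	/* filtered out */
		if (h == NULL)
			h = p;
		else
			prev->next = p;
		prev = p;
		count++;
	}
	if (prev != NULL)
		prev->next = NULL;
	*head = h;
	*tail = prev;
	return (count);
}

static int
portsend(struct pkt *mp)
{
	struct pkt *mpt;
	int status = 0;
	int count = frame_untag(&mp, &mpt);

	if (count != 0)	/* everything may have been filtered away */
		status = printf("tx %d frame(s)\n", count) < 0;
	return (status);
}

int
main(void)
{
	struct pkt c = { 20, NULL }, b = { 0, &c }, a = { 10, &b };
	return (portsend(&a));
}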
*/ -static void vsw_mac_ring_tbl_init(vsw_t *vswp); -static void vsw_mac_ring_tbl_destroy(vsw_t *vswp); -static void vsw_queue_worker(vsw_mac_ring_t *rrp); -static void vsw_queue_stop(vsw_queue_t *vqp); -static vsw_queue_t *vsw_queue_create(); -static void vsw_queue_destroy(vsw_queue_t *vqp); -static void vsw_rx_queue_cb(void *, mac_resource_handle_t, mblk_t *); -static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *); +static void vsw_port_rx_cb(void *, mac_resource_handle_t, mblk_t *, + boolean_t); +static void vsw_if_rx_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t); /* MAC layer routines */ -static mac_resource_handle_t vsw_mac_ring_add_cb(void *arg, - mac_resource_t *mrp); -static int vsw_set_hw_addr(vsw_t *, mac_multi_addr_t *); -static int vsw_set_hw_promisc(vsw_t *, vsw_port_t *, int); -static int vsw_unset_hw_addr(vsw_t *, int); -static int vsw_unset_hw_promisc(vsw_t *, vsw_port_t *, int); -static int vsw_prog_if(vsw_t *); +static int vsw_set_port_hw_addr(vsw_port_t *port); +static int vsw_set_if_hw_addr(vsw_t *vswp); +static void vsw_unset_hw_addr(vsw_t *, vsw_port_t *, int); +static int vsw_maccl_open(vsw_t *vswp, vsw_port_t *port, int type); +static void vsw_maccl_close(vsw_t *vswp, vsw_port_t *port, int type); +static void vsw_mac_multicast_add_all(vsw_t *vswp, vsw_port_t *portp, int type); +static void vsw_mac_multicast_remove_all(vsw_t *vswp, + vsw_port_t *portp, int type); +static void vsw_mac_add_vlans(vsw_t *vswp, mac_client_handle_t mch, + uint8_t *macaddr, uint16_t flags, vsw_vlanid_t *vids, int nvids); +static void vsw_mac_remove_vlans(mac_client_handle_t mch, vsw_vlanid_t *vids, + int nvids); static void vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu); /* Support functions */ -static int vsw_prog_ports(vsw_t *); int vsw_set_hw(vsw_t *, vsw_port_t *, int); -int vsw_unset_hw(vsw_t *, vsw_port_t *, int); +void vsw_unset_hw(vsw_t *, vsw_port_t *, int); void vsw_reconfig_hw(vsw_t *); -int vsw_mac_attach(vsw_t *vswp); -void vsw_mac_detach(vsw_t *vswp); int vsw_mac_open(vsw_t *vswp); void vsw_mac_close(vsw_t *vswp); +int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p, + int type); +void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port, + mcst_addr_t *mcst_p, int type); +int vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type); +void vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type); +void vsw_mac_cleanup_ports(vsw_t *vswp); void vsw_unset_addrs(vsw_t *vswp); void vsw_set_addrs(vsw_t *vswp); -int vsw_get_hw_maddr(vsw_t *); -mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); -void vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr); +mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *); +void vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp); +void vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids); +void vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid, + vsw_vlanid_t *new_vids, int new_nvids); +void vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids); +/* + * Functions imported from other files. 
+ */ +extern int vsw_portsend(vsw_port_t *port, mblk_t *mp); +extern void vsw_hio_stop_port(vsw_port_t *portp); +extern void vsw_hio_port_reset(vsw_port_t *portp, boolean_t immediate); +extern uint32_t vsw_publish_macaddr_count; +extern uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, + mblk_t **npt); static char mac_mtu_propname[] = "mtu"; /* * Tunables used in this file. */ extern int vsw_mac_open_retries; -extern boolean_t vsw_multi_ring_enable; -extern int vsw_mac_rx_rings; -extern uint32_t vsw_publish_macaddr_count; -/* - * Check to see if the card supports the setting of multiple unicst - * addresses. - * - * Returns 0 if card supports the programming of multiple unicast addresses, - * otherwise returns 1. - */ -int -vsw_get_hw_maddr(vsw_t *vswp) -{ - D1(vswp, "%s: enter", __func__); - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); +#define WRITE_MACCL_ENTER(vswp, port, type) \ + (type == VSW_LOCALDEV) ? rw_enter(&vswp->maccl_rwlock, RW_WRITER) :\ + rw_enter(&port->maccl_rwlock, RW_WRITER) - if (vswp->mh == NULL) - return (1); +#define READ_MACCL_ENTER(vswp, port, type) \ + (type == VSW_LOCALDEV) ? rw_enter(&vswp->maccl_rwlock, RW_READER) :\ + rw_enter(&port->maccl_rwlock, RW_READER) - if (!mac_capab_get(vswp->mh, MAC_CAPAB_MULTIADDRESS, &vswp->maddr)) { - cmn_err(CE_NOTE, "!vsw%d: device (%s) does not support " - "programming multiple addresses", vswp->instance, - vswp->physname); - return (1); - } +#define RW_MACCL_EXIT(vswp, port, type) \ + (type == VSW_LOCALDEV) ? rw_exit(&vswp->maccl_rwlock) : \ + rw_exit(&port->maccl_rwlock) - D2(vswp, "%s: %d addrs : %d free", __func__, - vswp->maddr.maddr_naddr, vswp->maddr.maddr_naddrfree); - D1(vswp, "%s: exit", __func__); +/* + * Locking strategy in this file is explained as follows: + * - A global lock(vswp->mac_lock) is used to protect the + * MAC calls that deal with entire device. That is, the + * operations that deal with mac_handle which include + * mac_open()/close() and mac_client_open(). + * + * - A per port/interface RW lock(maccl_rwlock) is used protect + * the operations that deal with the MAC client. + * + * When both mac_lock and maccl_rwlock need to be held, the + * mac_lock need be acquired first and then maccl_rwlock. That is, + * mac_lock---->maccl_rwlock + * + * The 'mca_lock' that protects the mcast list is also acquired + * within the context of maccl_rwlock. The hierarchy for this + * one is as below: + * maccl_rwlock---->mca_lock + */ - return (0); -} /* * Program unicast and multicast addresses of vsw interface and the ports - * into the physical device. + * into the network device. */ void vsw_set_addrs(vsw_t *vswp) { vsw_port_list_t *plist = &vswp->plist; vsw_port_t *port; - mcst_addr_t *mcap; int rv; READ_ENTER(&vswp->if_lockrw); if (vswp->if_state & VSW_IF_UP) { - /* program unicst addr of vsw interface in the physdev */ - if (vswp->addr_set == VSW_ADDR_UNSET) { - mutex_enter(&vswp->hw_lock); - rv = vsw_set_hw(vswp, NULL, VSW_LOCALDEV); - mutex_exit(&vswp->hw_lock); - if (rv != 0) { - cmn_err(CE_NOTE, - "!vsw%d: failed to program interface " - "unicast address\n", vswp->instance); - } - /* - * Notify the MAC layer of the changed address. 
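[Editor's note] The WRITE/READ_MACCL_ENTER macros above select between the interface-wide lock and the per-port lock with a conditional expression, so one body of code serves both VSW_LOCALDEV and VSW_VNETPORT callers. A user-level rendition with pthread rwlocks; note that because the expansion is a bare ternary, callers must treat it as a complete statement, and the real macros compare an unparenthesized `type`, so they expect a plain value as that argument:

#include <pthread.h>
#include <stdio.h>

#define	VSW_LOCALDEV	1
#define	VSW_VNETPORT	2

struct vsw  { pthread_rwlock_t maccl_rwlock; };
struct port { pthread_rwlock_t maccl_rwlock; };

#define	WRITE_MACCL_ENTER(vswp, portp, type)			\
	((type) == VSW_LOCALDEV ?				\
	    pthread_rwlock_wrlock(&(vswp)->maccl_rwlock) :	\
	    pthread_rwlock_wrlock(&(portp)->maccl_rwlock))

#define	RW_MACCL_EXIT(vswp, portp, type)			\
	((type) == VSW_LOCALDEV ?				\
	    pthread_rwlock_unlock(&(vswp)->maccl_rwlock) :	\
	    pthread_rwlock_unlock(&(portp)->maccl_rwlock))

int
main(void)
{
	struct vsw v;
	struct port p;

	pthread_rwlock_init(&v.maccl_rwlock, NULL);
	pthread_rwlock_init(&p.maccl_rwlock, NULL);

	WRITE_MACCL_ENTER(&v, &p, VSW_VNETPORT);	/* takes p's lock */
	printf("programming port mac client\n");
	RW_MACCL_EXIT(&v, &p, VSW_VNETPORT);
	return (0);
}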
- */ - mac_unicst_update(vswp->if_mh, - (uint8_t *)&vswp->if_addr); + /* Open a mac client and program addresses */ + rv = vsw_mac_client_init(vswp, NULL, VSW_LOCALDEV); + if (rv != 0) { + cmn_err(CE_NOTE, + "!vsw%d: failed to program interface " + "unicast address\n", vswp->instance); } - /* program mcast addrs of vsw interface in the physdev */ - mutex_enter(&vswp->mca_lock); - WRITE_ENTER(&vswp->mac_rwlock); - for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) { - if (mcap->mac_added) - continue; - rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca); - if (rv == 0) { - mcap->mac_added = B_TRUE; - } else { - cmn_err(CE_NOTE, "!vsw%d: unable to add " - "multicast address: %s\n", vswp->instance, - ether_sprintf((void *)&mcap->mca)); - } + /* + * Notify the MAC layer of the changed address. + */ + if (rv == 0) { + mac_unicst_update(vswp->if_mh, + (uint8_t *)&vswp->if_addr); } - RW_EXIT(&vswp->mac_rwlock); - mutex_exit(&vswp->mca_lock); } @@ -198,43 +198,24 @@ vsw_set_addrs(vsw_t *vswp) WRITE_ENTER(&plist->lockrw); - /* program unicast address of ports in the physical device */ - mutex_enter(&vswp->hw_lock); + /* program unicast address of ports in the network device */ for (port = plist->head; port != NULL; port = port->p_next) { - if (port->addr_set != VSW_ADDR_UNSET) /* addr already set */ + if (port->addr_set) /* addr already set */ continue; - if (vsw_set_hw(vswp, port, VSW_VNETPORT)) { - cmn_err(CE_NOTE, - "!vsw%d: port:%d failed to set unicast address\n", - vswp->instance, port->p_instance); - } - } - mutex_exit(&vswp->hw_lock); - /* program multicast addresses of ports in the physdev */ - for (port = plist->head; port != NULL; port = port->p_next) { - mutex_enter(&port->mca_lock); - WRITE_ENTER(&vswp->mac_rwlock); - for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) { - if (mcap->mac_added) - continue; - rv = mac_multicst_add(vswp->mh, (uchar_t *)&mcap->mca); - if (rv == 0) { - mcap->mac_added = B_TRUE; - } else { - cmn_err(CE_NOTE, "!vsw%d: unable to add " - "multicast address: %s\n", vswp->instance, - ether_sprintf((void *)&mcap->mca)); - } + /* Open a mac client and program addresses */ + rv = vsw_mac_client_init(vswp, port, VSW_VNETPORT); + if (rv != 0) { + cmn_err(CE_NOTE, + "!vsw%d: failed to program port(%d) " + "unicast address\n", vswp->instance, + port->p_instance); } - RW_EXIT(&vswp->mac_rwlock); - mutex_exit(&port->mca_lock); } - /* announce macaddr of vnets to the physical switch */ if (vsw_publish_macaddr_count != 0) { /* enabled */ for (port = plist->head; port != NULL; port = port->p_next) { - vsw_publish_macaddr(vswp, (uint8_t *)&port->p_macaddr); + vsw_publish_macaddr(vswp, port); } } @@ -242,93 +223,37 @@ vsw_set_addrs(vsw_t *vswp) } /* - * Remove unicast and multicast addresses of vsw interface and the ports - * from the physical device. + * Remove unicast, multicast addresses and close mac clients + * for the vsw interface and all ports. 
*/ void vsw_unset_addrs(vsw_t *vswp) { - vsw_port_list_t *plist = &vswp->plist; - vsw_port_t *port; - mcst_addr_t *mcap; - READ_ENTER(&vswp->if_lockrw); - if (vswp->if_state & VSW_IF_UP) { - /* - * Remove unicast addr of vsw interfce - * from current physdev - */ - mutex_enter(&vswp->hw_lock); - (void) vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); - mutex_exit(&vswp->hw_lock); - - /* - * Remove mcast addrs of vsw interface - * from current physdev - */ - mutex_enter(&vswp->mca_lock); - WRITE_ENTER(&vswp->mac_rwlock); - for (mcap = vswp->mcap; mcap != NULL; mcap = mcap->nextp) { - if (!mcap->mac_added) - continue; - (void) mac_multicst_remove(vswp->mh, - (uchar_t *)&mcap->mca); - mcap->mac_added = B_FALSE; - } - RW_EXIT(&vswp->mac_rwlock); - mutex_exit(&vswp->mca_lock); - + /* Cleanup and close the mac client for the interface */ + vsw_mac_client_cleanup(vswp, NULL, VSW_LOCALDEV); } - RW_EXIT(&vswp->if_lockrw); - WRITE_ENTER(&plist->lockrw); - - /* - * Remove unicast address of ports from the current physical device - */ - mutex_enter(&vswp->hw_lock); - for (port = plist->head; port != NULL; port = port->p_next) { - /* Remove address if was programmed into HW. */ - if (port->addr_set == VSW_ADDR_UNSET) - continue; - (void) vsw_unset_hw(vswp, port, VSW_VNETPORT); - } - mutex_exit(&vswp->hw_lock); - - /* Remove multicast addresses of ports from the current physdev */ - for (port = plist->head; port != NULL; port = port->p_next) { - mutex_enter(&port->mca_lock); - WRITE_ENTER(&vswp->mac_rwlock); - for (mcap = port->mcap; mcap != NULL; mcap = mcap->nextp) { - if (!mcap->mac_added) - continue; - (void) mac_multicst_remove(vswp->mh, - (uchar_t *)&mcap->mca); - mcap->mac_added = B_FALSE; - } - RW_EXIT(&vswp->mac_rwlock); - mutex_exit(&port->mca_lock); - } - - RW_EXIT(&plist->lockrw); + /* Cleanup and close the mac clients for all ports */ + vsw_mac_cleanup_ports(vswp); } /* - * Open the underlying physical device for access in layer2 mode. + * Open the underlying network device for access in layer2 mode. * Returns: - * 0 on success - * EAGAIN if mac_open() fails due to the device being not available yet. - * EIO on any other failures. + * 0 on success + * EAGAIN if mac_open() fails due to the device being not available yet. + * EIO on any other failures. */ int vsw_mac_open(vsw_t *vswp) { - int rv; + int rv; - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); + ASSERT(MUTEX_HELD(&vswp->mac_lock)); if (vswp->mh != NULL) { /* already open */ @@ -352,14 +277,15 @@ vsw_mac_open(vsw_t *vswp) if (rv == ENOENT || rv == EBADF) { return (EAGAIN); } else { - cmn_err(CE_WARN, "vsw%d: device (%s) open failed rv:%x", + cmn_err(CE_WARN, "vsw%d: mac_open %s failed rv:%x", vswp->instance, vswp->physname, rv); return (EIO); } } - vswp->mac_open_retries = 0; + vsw_mac_set_mtu(vswp, vswp->mtu); + return (0); } @@ -369,1005 +295,852 @@ vsw_mac_open(vsw_t *vswp) void vsw_mac_close(vsw_t *vswp) { - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); + ASSERT(MUTEX_HELD(&vswp->mac_lock)); if (vswp->mh != NULL) { + if (vswp->mtu != vswp->mtu_physdev_orig) { + vsw_mac_set_mtu(vswp, vswp->mtu_physdev_orig); + } mac_close(vswp->mh); vswp->mh = NULL; } } /* - * Link into the MAC layer to gain access to the services provided by - * the underlying physical device driver (which should also have - * registered with the MAC layer). - * - * Only when in layer 2 mode. + * Add multicast addr. 
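[Editor's note] vsw_mac_open() above maps "device not there yet" errors (ENOENT, EBADF) to EAGAIN so the caller's timeout handler can keep retrying, while any other failure is terminal (EIO). The caller-side contract, sketched at user level with bounded retries, a stub device that appears on the third attempt, and usleep() standing in for the timeout mechanism:

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

static int attempts;

static int
mac_open_model(void)	/* stub: device shows up on the 3rd try */
{
	return (++attempts < 3 ? EAGAIN : 0);
}

static int
setup_switching(int max_retries)
{
	int rv;

	while (max_retries-- > 0) {
		rv = mac_open_model();
		if (rv == 0)
			return (0);	/* opened, continue setup */
		if (rv != EAGAIN)
			return (rv);	/* terminal, e.g. EIO */
		usleep(10000);		/* timeout handler stand-in */
	}
	return (EAGAIN);
}

int
main(void)
{
	printf("rv=%d attempts=%d\n", setup_switching(10), attempts);
	return (0);
}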
*/ int -vsw_mac_attach(vsw_t *vswp) +vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p, + int type) { - D1(vswp, "%s: enter", __func__); - - ASSERT(vswp->mrh == NULL); - ASSERT(vswp->mstarted == B_FALSE); - ASSERT(vswp->mresources == B_FALSE); - - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); - - ASSERT(vswp->mh != NULL); - - D2(vswp, "vsw_mac_attach: using device %s", vswp->physname); - - vsw_mac_set_mtu(vswp, vswp->mtu); - - if (vsw_multi_ring_enable) { - /* - * Initialize the ring table. - */ - vsw_mac_ring_tbl_init(vswp); - - /* - * Register our rx callback function. - */ - vswp->mrh = mac_rx_add(vswp->mh, - vsw_rx_queue_cb, (void *)vswp); - ASSERT(vswp->mrh != NULL); - - /* - * Register our mac resource callback. - */ - mac_resource_set(vswp->mh, vsw_mac_ring_add_cb, (void *)vswp); - vswp->mresources = B_TRUE; - - /* - * Get the ring resources available to us from - * the mac below us. - */ - mac_resources(vswp->mh); - } else { - /* - * Just register our rx callback function - */ - vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp); - ASSERT(vswp->mrh != NULL); - } - - /* Get the MAC tx fn */ - vswp->txinfo = mac_tx_get(vswp->mh); - - /* start the interface */ - if (mac_start(vswp->mh) != 0) { - cmn_err(CE_WARN, "!vsw%d: Could not start mac interface", - vswp->instance); - goto mac_fail_exit; + int ret = 0; + mac_client_handle_t mch; + + WRITE_MACCL_ENTER(vswp, port, type); + + mch = (type == VSW_LOCALDEV) ? vswp->mch : port->p_mch; + + if (mch != NULL) { + ret = mac_multicast_add(mch, mcst_p->mca.ether_addr_octet); + if (ret != 0) { + cmn_err(CE_WARN, "!vsw%d: unable to " + "program multicast address(%s) err=%d", + vswp->instance, + ether_sprintf((void *)&mcst_p->mca), ret); + RW_MACCL_EXIT(vswp, port, type); + return (ret); + } + mcst_p->mac_added = B_TRUE; } - vswp->mstarted = B_TRUE; - - D1(vswp, "%s: exit", __func__); - return (0); - -mac_fail_exit: - vsw_mac_detach(vswp); - - D1(vswp, "%s: exit", __func__); - return (1); + RW_MACCL_EXIT(vswp, port, type); + return (ret); } +/* + * Remove multicast addr. + */ void -vsw_mac_detach(vsw_t *vswp) +vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port, mcst_addr_t *mcst_p, + int type) { - D1(vswp, "vsw_mac_detach: enter"); - - ASSERT(vswp != NULL); - ASSERT(RW_LOCK_HELD(&vswp->mac_rwlock)); + mac_client_handle_t mch; - if (vsw_multi_ring_enable) { - vsw_mac_ring_tbl_destroy(vswp); - } + WRITE_MACCL_ENTER(vswp, port, type); + mch = (type == VSW_LOCALDEV) ? vswp->mch : port->p_mch; - if (vswp->mh != NULL) { - if (vswp->mstarted) - mac_stop(vswp->mh); - if (vswp->mrh != NULL) - mac_rx_remove(vswp->mh, vswp->mrh, B_TRUE); - if (vswp->mresources) - mac_resource_set(vswp->mh, NULL, NULL); - if (vswp->mtu != vswp->mtu_physdev_orig) { - vsw_mac_set_mtu(vswp, vswp->mtu_physdev_orig); - } + if (mch != NULL && mcst_p->mac_added) { + mac_multicast_remove(mch, mcst_p->mca.ether_addr_octet); + mcst_p->mac_added = B_FALSE; } - - vswp->mrh = NULL; - vswp->txinfo = NULL; - vswp->mstarted = B_FALSE; - - D1(vswp, "vsw_mac_detach: exit"); + RW_MACCL_EXIT(vswp, port, type); } + /* - * Depending on the mode specified, the capabilites and capacity - * of the underlying device setup the physical device. - * - * If in layer 3 mode, then do nothing. - * - * If in layer 2 programmed mode attempt to program the unicast address - * associated with the port into the physical device. 
If this is not - * possible due to resource exhaustion or simply because the device does - * not support multiple unicast addresses then if required fallback onto - * putting the card into promisc mode. - * - * If in promisc mode then simply set the card into promisc mode. - * - * Returns 0 success, 1 on failure. + * Add all multicast addresses of the port. */ -int -vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type) +static void +vsw_mac_multicast_add_all(vsw_t *vswp, vsw_port_t *portp, int type) { - mac_multi_addr_t mac_addr; - int err; + mcst_addr_t *mcap; + mac_client_handle_t mch; + kmutex_t *mca_lockp; + int rv; - D1(vswp, "%s: enter", __func__); - - ASSERT(MUTEX_HELD(&vswp->hw_lock)); ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - - if (vswp->smode[vswp->smode_idx] == VSW_LAYER3) - return (0); - - if (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) { - return (vsw_set_hw_promisc(vswp, port, type)); - } - - /* - * Attempt to program the unicast address into the HW. - */ - mac_addr.mma_addrlen = ETHERADDRL; - if (type == VSW_VNETPORT) { - ASSERT(port != NULL); - ether_copy(&port->p_macaddr, &mac_addr.mma_addr); + if (type == VSW_LOCALDEV) { + ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock)); + mch = vswp->mch; + mcap = vswp->mcap; + mca_lockp = &vswp->mca_lock; } else { - ether_copy(&vswp->if_addr, &mac_addr.mma_addr); + ASSERT(RW_WRITE_HELD(&portp->maccl_rwlock)); + mch = portp->p_mch; + mcap = portp->mcap; + mca_lockp = &portp->mca_lock; } - err = vsw_set_hw_addr(vswp, &mac_addr); - if (err == ENOSPC) { - /* - * Mark that attempt should be made to re-config sometime - * in future if a port is deleted. - */ - vswp->recfg_reqd = B_TRUE; - - /* - * Only 1 mode specified, nothing more to do. - */ - if (vswp->smode_num == 1) - return (err); + if (mch == NULL) + return; - /* - * If promiscuous was next mode specified try to - * set the card into that mode. - */ - if ((vswp->smode_idx <= (vswp->smode_num - 2)) && - (vswp->smode[vswp->smode_idx + 1] == - VSW_LAYER2_PROMISC)) { - vswp->smode_idx += 1; - return (vsw_set_hw_promisc(vswp, port, type)); + mutex_enter(mca_lockp); + for (mcap = mcap; mcap != NULL; mcap = mcap->nextp) { + if (mcap->mac_added) + continue; + rv = mac_multicast_add(mch, (uchar_t *)&mcap->mca); + if (rv == 0) { + mcap->mac_added = B_TRUE; + } else { + cmn_err(CE_WARN, "!vsw%d: unable to program " + "multicast address(%s) err=%d", vswp->instance, + ether_sprintf((void *)&mcap->mca), rv); } - return (err); } + mutex_exit(mca_lockp); +} - if (err != 0) - return (err); +/* + * Remove all multicast addresses of the port. 
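[Editor's note] A small wart in vsw_mac_multicast_add_all() above: the loop initializer `for (mcap = mcap; ...)` is a self-assignment, presumably left over from unifying the port and interface paths; the remove_all variant that follows already uses the cleaner empty initializer `for (; ...)`. The mac_added flag is what makes both walks idempotent, as in this reduced model:

#include <stdio.h>
#include <stdbool.h>

struct mcst { unsigned addr; bool mac_added; struct mcst *next; };

static void
multicast_add_all(struct mcst *mcap)
{
	for (; mcap != NULL; mcap = mcap->next) {	/* no self-assignment */
		if (mcap->mac_added)
			continue;	/* already programmed: skip */
		printf("program %#x\n", mcap->addr);
		mcap->mac_added = true;
	}
}

static void
multicast_remove_all(struct mcst *mcap)
{
	for (; mcap != NULL; mcap = mcap->next) {
		if (!mcap->mac_added)
			continue;	/* never programmed: skip */
		printf("remove %#x\n", mcap->addr);
		mcap->mac_added = false;
	}
}

int
main(void)
{
	struct mcst b = { 0x5e0002, false, NULL };
	struct mcst a = { 0x5e0001, true, &b };	/* already added */

	multicast_add_all(&a);		/* programs only 0x5e0002 */
	multicast_add_all(&a);		/* second call is a no-op */
	multicast_remove_all(&a);
	return (0);
}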
+ */ +static void +vsw_mac_multicast_remove_all(vsw_t *vswp, vsw_port_t *portp, int type) +{ + mac_client_handle_t mch; + mcst_addr_t *mcap; + kmutex_t *mca_lockp; - if (type == VSW_VNETPORT) { - port->addr_slot = mac_addr.mma_slot; - port->addr_set = VSW_ADDR_HW; + ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); + if (type == VSW_LOCALDEV) { + ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock)); + mch = vswp->mch; + mcap = vswp->mcap; + mca_lockp = &vswp->mca_lock; } else { - vswp->addr_slot = mac_addr.mma_slot; - vswp->addr_set = VSW_ADDR_HW; + ASSERT(RW_WRITE_HELD(&portp->maccl_rwlock)); + mch = portp->p_mch; + mcap = portp->mcap; + mca_lockp = &portp->mca_lock; } - D2(vswp, "programmed addr %s into slot %d " - "of device %s", ether_sprintf((void *)mac_addr.mma_addr), - mac_addr.mma_slot, vswp->physname); - - D1(vswp, "%s: exit", __func__); + if (mch == NULL) + return; - return (0); + mutex_enter(mca_lockp); + for (; mcap != NULL; mcap = mcap->nextp) { + if (!mcap->mac_added) + continue; + (void) mac_multicast_remove(mch, (uchar_t *)&mcap->mca); + mcap->mac_added = B_FALSE; + } + mutex_exit(mca_lockp); } /* - * If in layer 3 mode do nothing. - * - * If in layer 2 switched mode remove the address from the physical - * device. - * - * If in layer 2 promiscuous mode disable promisc mode. - * - * Returns 0 on success. + * Open a mac client and program uncast and multicast addresses + * for a port or the interface. + * Returns: + * 0 on success + * non-zero for failure. */ int -vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type) +vsw_mac_client_init(vsw_t *vswp, vsw_port_t *port, int type) { - mac_addr_slot_t slot; - int rv; - - D1(vswp, "%s: enter", __func__); - - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - - if (vswp->smode[vswp->smode_idx] == VSW_LAYER3) - return (0); + int rv; - switch (type) { - case VSW_VNETPORT: - ASSERT(port != NULL); - - if (port->addr_set == VSW_ADDR_PROMISC) { - return (vsw_unset_hw_promisc(vswp, port, type)); - - } else if (port->addr_set == VSW_ADDR_HW) { - slot = port->addr_slot; - if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0) - port->addr_set = VSW_ADDR_UNSET; - } + mutex_enter(&vswp->mac_lock); + WRITE_MACCL_ENTER(vswp, port, type); + rv = vsw_maccl_open(vswp, port, type); - break; + /* Release mac_lock now */ + mutex_exit(&vswp->mac_lock); - case VSW_LOCALDEV: - if (vswp->addr_set == VSW_ADDR_PROMISC) { - return (vsw_unset_hw_promisc(vswp, NULL, type)); - - } else if (vswp->addr_set == VSW_ADDR_HW) { - slot = vswp->addr_slot; - if ((rv = vsw_unset_hw_addr(vswp, slot)) == 0) - vswp->addr_set = VSW_ADDR_UNSET; - } - - break; - - default: - /* should never happen */ - DERR(vswp, "%s: unknown type %d", __func__, type); - ASSERT(0); - return (1); + if (rv == 0) { + (void) vsw_set_hw(vswp, port, type); + vsw_mac_multicast_add_all(vswp, port, type); } - - D1(vswp, "%s: exit", __func__); + RW_MACCL_EXIT(vswp, port, type); return (rv); } /* - * Attempt to program a unicast address into HW. + * Open a MAC client for a port or an interface. + * The flags and their purpose as below: * - * Returns 0 on sucess, 1 on failure. + * MAC_OPEN_FLAGS_NO_HWRINGS -- This flag is used by default + * for all ports/interface so that they are associated with + * default group & resources. It will not be used for the + * ports that have HybridIO is enabled so that the h/w resources + * assigned to it. + * + * MAC_OPEN_FLAGS_SHARES_DESIRED -- This flag is used to indicate + * that a port desires a Share. This will be the case with the + * the ports that have hybrid mode enabled. 
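[Editor's note] The flag choices documented above reduce to: start from "software rings, vsw does its own tagging and stripping", and for a HybridIO-enabled port swap the no-hardware-rings default for a share request. A sketch of that computation; the flag names come from the hunk, but the bit values here are invented for the model:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* names from the driver; values are placeholders for this model */
#define	MAC_OPEN_FLAGS_NO_HWRINGS	0x01
#define	MAC_OPEN_FLAGS_SHARES_DESIRED	0x02
#define	MAC_OPEN_FLAGS_TAG_DISABLE	0x04
#define	MAC_OPEN_FLAGS_STRIP_DISABLE	0x08

static uint64_t
maccl_open_flags(bool hio_enabled)
{
	uint64_t flags = MAC_OPEN_FLAGS_NO_HWRINGS |
	    MAC_OPEN_FLAGS_TAG_DISABLE |	/* vsw inserts tags itself */
	    MAC_OPEN_FLAGS_STRIP_DISABLE;	/* and strips them itself */

	if (hio_enabled) {
		/* a HybridIO port wants h/w rings and a share */
		flags &= ~MAC_OPEN_FLAGS_NO_HWRINGS;
		flags |= MAC_OPEN_FLAGS_SHARES_DESIRED;
	}
	return (flags);
}

int
main(void)
{
	printf("plain port: %#llx\n",
	    (unsigned long long)maccl_open_flags(false));
	printf("hio port:   %#llx\n",
	    (unsigned long long)maccl_open_flags(true));
	return (0);
}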
This will only cause + * MAC layer to allocate a share and corresponding resources + * ahead of time. + * + * MAC_OPEN_FLAGS_TAG_DISABLE -- This flag is used for VLAN + * support. It will cause MAC to not add any tags, but expect + * vsw to tag the packets. + * + * MAC_OPEN_FLAGS_STRIP_DISABLE -- This flag is used for VLAN + * support. It will case the MAC layer to not strip the tags. + * Vsw may have to strip the tag for pvid case. */ static int -vsw_set_hw_addr(vsw_t *vswp, mac_multi_addr_t *mac) +vsw_maccl_open(vsw_t *vswp, vsw_port_t *port, int type) { - void *mah; - int rv = EINVAL; - - D1(vswp, "%s: enter", __func__); - - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - - if (vswp->maddr.maddr_handle == NULL) - return (rv); - - mah = vswp->maddr.maddr_handle; - - rv = vswp->maddr.maddr_add(mah, mac); + int rv = 0; + int instance; + char mac_cl_name[MAXNAMELEN]; + const char *dev_name; + mac_client_handle_t *mchp; + uint64_t flags = (MAC_OPEN_FLAGS_NO_HWRINGS | + MAC_OPEN_FLAGS_TAG_DISABLE | + MAC_OPEN_FLAGS_STRIP_DISABLE); + + ASSERT(MUTEX_HELD(&vswp->mac_lock)); + if (vswp->mh == NULL) { + /* + * In case net-dev is changed (either set to nothing or + * using aggregation device), return success here as the + * timeout mechanism will handle it. + */ + return (0); + } - if (rv == 0) - return (rv); + mchp = (type == VSW_LOCALDEV) ? &vswp->mch : &port->p_mch; + if (*mchp != NULL) { + /* already open */ + return (0); + } + dev_name = ddi_driver_name(vswp->dip); + instance = ddi_get_instance(vswp->dip); + if (type == VSW_VNETPORT) { + if (port->p_hio_enabled == B_TRUE) { + flags &= ~MAC_OPEN_FLAGS_NO_HWRINGS; + flags |= MAC_OPEN_FLAGS_SHARES_DESIRED; + } + (void) snprintf(mac_cl_name, MAXNAMELEN, "%s%d%s%d", dev_name, + instance, "_port", port->p_instance); + } else { + (void) snprintf(mac_cl_name, MAXNAMELEN, "%s%s%d", + dev_name, "_if", instance); + } - /* - * Its okay for the add to fail because we have exhausted - * all the resouces in the hardware device. Any other error - * we want to flag. - */ - if (rv != ENOSPC) { - cmn_err(CE_NOTE, "!vsw%d: error programming " - "address %s into HW err (%d)", - vswp->instance, ether_sprintf((void *)mac->mma_addr), rv); + rv = mac_client_open(vswp->mh, mchp, mac_cl_name, flags); + if (rv != 0) { + cmn_err(CE_NOTE, "!vsw%d:%s mac_client_open() failed\n", + vswp->instance, mac_cl_name); } - D1(vswp, "%s: exit", __func__); return (rv); } /* - * Remove a unicast mac address which has previously been programmed - * into HW. - * - * Returns 0 on sucess, 1 on failure. + * Clean up by removing uncast, multicast addresses and + * closing the MAC client for a port or the interface. */ -static int -vsw_unset_hw_addr(vsw_t *vswp, int slot) +void +vsw_mac_client_cleanup(vsw_t *vswp, vsw_port_t *port, int type) { - void *mah; - int rv; - - D1(vswp, "%s: enter", __func__); - - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - ASSERT(slot >= 0); + WRITE_MACCL_ENTER(vswp, port, type); + vsw_unset_hw(vswp, port, type); + vsw_maccl_close(vswp, port, type); + vsw_mac_multicast_remove_all(vswp, port, type); + RW_MACCL_EXIT(vswp, port, type); +} - if (vswp->maddr.maddr_handle == NULL) - return (1); +/* + * Close a MAC client for a port or an interface. 
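[Editor's note] The snprintf() calls in vsw_maccl_open() derive a diagnostic client name from the driver name and instance numbers: with a driver name of "vsw" (a ddi_driver_name() stand-in here), port clients come out as e.g. "vsw0_port3" and the interface client as "vsw_if0", per the two format strings in the hunk. Reduced to a checkable form:

#include <stdio.h>

#define	MAXNAMELEN	256

int
main(void)
{
	char name[MAXNAMELEN];
	const char *dev_name = "vsw";	/* ddi_driver_name() stand-in */
	int instance = 0;		/* ddi_get_instance() stand-in */
	int p_instance = 3;

	/* port client: <drv><inst>_port<port-inst> */
	(void) snprintf(name, MAXNAMELEN, "%s%d%s%d", dev_name,
	    instance, "_port", p_instance);
	printf("%s\n", name);	/* vsw0_port3 */

	/* interface client: <drv>_if<inst> */
	(void) snprintf(name, MAXNAMELEN, "%s%s%d", dev_name,
	    "_if", instance);
	printf("%s\n", name);	/* vsw_if0 */
	return (0);
}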
+ */ +static void +vsw_maccl_close(vsw_t *vswp, vsw_port_t *port, int type) +{ + mac_client_handle_t *mchp; - mah = vswp->maddr.maddr_handle; + ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - rv = vswp->maddr.maddr_remove(mah, slot); - if (rv != 0) { - DWARN(vswp, "%s: unable to remove address " - "from slot %d in device %s (err %d)", - __func__, slot, vswp->physname, rv); - return (1); + mchp = (type == VSW_LOCALDEV) ? &vswp->mch : &port->p_mch; + if (*mchp != NULL) { + mac_client_close(*mchp, 0); + *mchp = NULL; } +} - D2(vswp, "removed addr from slot %d in device %s", - slot, vswp->physname); +/* + * Cleanup MAC client related stuff for all ports. + */ +void +vsw_mac_cleanup_ports(vsw_t *vswp) +{ + vsw_port_list_t *plist = &vswp->plist; + vsw_port_t *port; - D1(vswp, "%s: exit", __func__); - return (0); + READ_ENTER(&plist->lockrw); + for (port = plist->head; port != NULL; port = port->p_next) { + vsw_mac_client_cleanup(vswp, port, VSW_VNETPORT); + } + RW_EXIT(&plist->lockrw); } /* - * Set network card into promisc mode. + * Depending on the mode specified, the capabilites and capacity + * of the underlying device setup the physical device. + * + * If in layer 3 mode, then do nothing. * - * Returns 0 on success, 1 on failure. + * If in layer 2 mode, open a mac client and program the mac-address + * and vlan-ids. The MAC layer will take care of programming + * the address into h/w or set the h/w into promiscuous mode. + * + * Returns 0 success, 1 on failure. */ -static int -vsw_set_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type) +int +vsw_set_hw(vsw_t *vswp, vsw_port_t *port, int type) { + int err = 1; + D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh == NULL) { - RW_EXIT(&vswp->mac_rwlock); - return (1); - } - - if (vswp->promisc_cnt++ == 0) { - if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) { - vswp->promisc_cnt--; - RW_EXIT(&vswp->mac_rwlock); - return (1); - } - cmn_err(CE_NOTE, "!vsw%d: switching device %s into " - "promiscuous mode", vswp->instance, vswp->physname); - } - RW_EXIT(&vswp->mac_rwlock); + if (vswp->smode == VSW_LAYER3) + return (0); if (type == VSW_VNETPORT) { ASSERT(port != NULL); - port->addr_set = VSW_ADDR_PROMISC; + err = vsw_set_port_hw_addr(port); } else { - vswp->addr_set = VSW_ADDR_PROMISC; + err = vsw_set_if_hw_addr(vswp); } D1(vswp, "%s: exit", __func__); - - return (0); + return (err); } /* - * Turn off promiscuous mode on network card. + * If in layer 3 mode do nothing. * - * Returns 0 on success, 1 on failure. + * If in layer 2 switched mode remove the address from the physical + * device. + * + * If in layer 2 promiscuous mode disable promisc mode. + * + * Returns 0 on success. 
*/ -static int -vsw_unset_hw_promisc(vsw_t *vswp, vsw_port_t *port, int type) +void +vsw_unset_hw(vsw_t *vswp, vsw_port_t *port, int type) { - vsw_port_list_t *plist = &vswp->plist; - - D2(vswp, "%s: enter", __func__); + D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh == NULL) { - RW_EXIT(&vswp->mac_rwlock); - return (1); - } - - if (--vswp->promisc_cnt == 0) { - if (mac_promisc_set(vswp->mh, B_FALSE, MAC_DEVPROMISC) != 0) { - vswp->promisc_cnt++; - RW_EXIT(&vswp->mac_rwlock); - return (1); - } - - /* - * We are exiting promisc mode either because we were - * only in promisc mode because we had failed over from - * switched mode due to HW resource issues, or the user - * wanted the card in promisc mode for all the ports and - * the last port is now being deleted. Tweak the message - * accordingly. - */ - if (plist->num_ports != 0) { - cmn_err(CE_NOTE, "!vsw%d: switching device %s back to " - "programmed mode", vswp->instance, vswp->physname); - } else { - cmn_err(CE_NOTE, "!vsw%d: switching device %s out of " - "promiscuous mode", vswp->instance, vswp->physname); - } - } - RW_EXIT(&vswp->mac_rwlock); + if (vswp->smode == VSW_LAYER3) + return; if (type == VSW_VNETPORT) { ASSERT(port != NULL); - ASSERT(port->addr_set == VSW_ADDR_PROMISC); - port->addr_set = VSW_ADDR_UNSET; + vsw_unset_hw_addr(vswp, port, type); } else { - ASSERT(vswp->addr_set == VSW_ADDR_PROMISC); - vswp->addr_set = VSW_ADDR_UNSET; + vsw_unset_hw_addr(vswp, NULL, type); } D1(vswp, "%s: exit", __func__); - return (0); } /* - * Determine whether or not we are operating in our prefered - * mode and if not whether the physical resources now allow us - * to operate in it. + * Program the macaddress and vlans of a port. * - * If a port is being removed should only be invoked after port has been - * removed from the port list. + * Returns 0 on sucess, 1 on failure. */ -void -vsw_reconfig_hw(vsw_t *vswp) +static int +vsw_set_port_hw_addr(vsw_port_t *port) { - int s_idx; + vsw_t *vswp = port->p_vswp; + uint16_t mac_flags = 0; + mac_diag_t diag; + uint8_t *macaddr; + uint16_t vid = VLAN_ID_NONE; + int rv; D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); - - if (vswp->maddr.maddr_handle == NULL) { - return; - } + ASSERT(RW_WRITE_HELD(&port->maccl_rwlock)); + if (port->p_mch == NULL) + return (0); /* - * If we are in layer 2 (i.e. switched) or would like to be - * in layer 2 then check if any ports or the vswitch itself - * need to be programmed into the HW. - * - * This can happen in two cases - switched was specified as - * the prefered mode of operation but we exhausted the HW - * resources and so failed over to the next specifed mode, - * or switched was the only mode specified so after HW - * resources were exhausted there was nothing more we - * could do. + * If the port has a specific 'pvid', then + * register with that vlan-id, otherwise register + * with VLAN_ID_NONE. */ - if (vswp->smode_idx > 0) - s_idx = vswp->smode_idx - 1; - else - s_idx = vswp->smode_idx; - - if (vswp->smode[s_idx] != VSW_LAYER2) { - return; + if (port->pvid != vswp->default_vlan_id) { + vid = port->pvid; } + macaddr = (uint8_t *)port->p_macaddr.ether_addr_octet; - D2(vswp, "%s: attempting reconfig..", __func__); - - /* - * First, attempt to set the vswitch mac address into HW, - * if required. 
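[Editor's note] vsw_set_port_hw_addr() registers the unicast address at the port's pvid only when that differs from the default VLAN (otherwise at VLAN_ID_NONE), and requests a hardware classification slot (MAC_UNICAST_HW) unless the switch is configured for promiscuous layer-2 mode. That selection logic, isolated; the constant values here are placeholders for the model, not the kernel's definitions:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define	VLAN_ID_NONE	0	/* placeholder value for the model */
#define	MAC_UNICAST_HW	0x1	/* placeholder bit */

static uint16_t
pick_vid(uint16_t pvid, uint16_t default_vlan_id)
{
	/* register at the pvid only if it isn't the default vlan */
	return (pvid != default_vlan_id ? pvid : VLAN_ID_NONE);
}

static uint16_t
pick_flags(bool layer2_promisc)
{
	/* promisc mode: no h/w address slot; classification stays in s/w */
	return (layer2_promisc ? 0 : MAC_UNICAST_HW);
}

int
main(void)
{
	printf("vid=%u flags=%#x\n", pick_vid(5, 1),
	    (unsigned)pick_flags(false));
	printf("vid=%u flags=%#x\n", pick_vid(1, 1),
	    (unsigned)pick_flags(true));
	return (0);
}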
- if (vsw_prog_if(vswp)) { - return; + if (!(vswp->smode & VSW_LAYER2_PROMISC)) { + mac_flags |= MAC_UNICAST_HW; } - /* - * Next, attempt to set any ports which have not yet been - * programmed into HW. - */ - if (vsw_prog_ports(vswp)) { - return; + if (port->addr_set == B_FALSE) { + port->p_muh = NULL; + rv = mac_unicast_add(port->p_mch, macaddr, mac_flags, + &port->p_muh, vid, &diag); + + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Failed to program " + "macaddr,vid(%s, %d) err=%d", + vswp->instance, ether_sprintf((void *)macaddr), + vid, rv); + return (rv); + } + port->addr_set = B_TRUE; + + D2(vswp, "%s:programmed macaddr(%s) vid(%d) into device %s", + __func__, ether_sprintf((void *)macaddr), vid, + vswp->physname); } - /* - * By now we know that have programmed all desired ports etc - * into HW, so safe to mark reconfiguration as complete. - */ - vswp->recfg_reqd = B_FALSE; + /* Add vlans to the MAC layer */ + vsw_mac_add_vlans(vswp, port->p_mch, macaddr, + mac_flags, port->vids, port->nvids); - vswp->smode_idx = s_idx; + mac_rx_set(port->p_mch, vsw_port_rx_cb, (void *)port); D1(vswp, "%s: exit", __func__); + return (rv); } /* - * Check to see if vsw itself is plumbed, and if so whether or not - * its mac address should be written into HW. + * Program the macaddress and vlans of the vsw interface. * - * Returns 0 if could set address, or didn't have to set it. - * Returns 1 if failed to set address. + * Returns 0 on success, 1 on failure. */ static int -vsw_prog_if(vsw_t *vswp) +vsw_set_if_hw_addr(vsw_t *vswp) { - mac_multi_addr_t addr; + uint16_t mac_flags = 0; + mac_diag_t diag; + uint8_t *macaddr; + uint8_t primary_addr[ETHERADDRL]; + uint16_t vid = VLAN_ID_NONE; + int rv; D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); + ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock)); + if (vswp->mch == NULL) + return (0); - READ_ENTER(&vswp->if_lockrw); - if ((vswp->if_state & VSW_IF_UP) && - (vswp->addr_set != VSW_ADDR_HW)) { + macaddr = (uint8_t *)vswp->if_addr.ether_addr_octet; + + /* check if it is the primary macaddr of the card. */ + mac_unicast_primary_get(vswp->mh, primary_addr); + if (ether_cmp((void *)primary_addr, (void*)macaddr) == 0) { + mac_flags |= MAC_UNICAST_PRIMARY; + } + + /* + * If the interface has a specific 'pvid', then + * register with that vlan-id, otherwise register + * with VLAN_ID_NONE. + */ + if (vswp->pvid != vswp->default_vlan_id) { + vid = vswp->pvid; + } - addr.mma_addrlen = ETHERADDRL; - ether_copy(&vswp->if_addr, &addr.mma_addr); + if (!(vswp->smode & VSW_LAYER2_PROMISC)) { + mac_flags |= MAC_UNICAST_HW; + } - if (vsw_set_hw_addr(vswp, &addr) != 0) { - RW_EXIT(&vswp->if_lockrw); - return (1); + if (vswp->addr_set == B_FALSE) { + vswp->muh = NULL; + rv = mac_unicast_add(vswp->mch, macaddr, mac_flags, + &vswp->muh, vid, &diag); + + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Failed to program " + "macaddr,vid(%s, %d) err=%d", + vswp->instance, ether_sprintf((void *)macaddr), + vid, rv); + return (rv); } + vswp->addr_set = B_TRUE; - vswp->addr_slot = addr.mma_slot; + D2(vswp, "%s:programmed macaddr(%s) vid(%d) into device %s", + __func__, ether_sprintf((void *)macaddr), vid, + vswp->physname); + } - /* - * If previously when plumbed had had to place - * interface into promisc mode, now reverse that. - * - * Note that interface will only actually be set into - * non-promisc mode when last port/interface has been - * programmed into HW.
- */
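vsw_set_if_hw_addr() adds one wrinkle over the port case: if the interface address happens to be the NIC's factory address, it must be registered with MAC_UNICAST_PRIMARY. A sketch of just the flag computation, assuming the calls behave as they are used above:

	static uint16_t
	example_unicast_flags(mac_handle_t mh, uint8_t *macaddr,
	    boolean_t promisc)
	{
		uint8_t		primary[ETHERADDRL];
		uint16_t	flags = 0;

		/* fetch the factory (primary) address of the device */
		mac_unicast_primary_get(mh, primary);
		if (ether_cmp((void *)primary, (void *)macaddr) == 0)
			flags |= MAC_UNICAST_PRIMARY;

		if (!promisc)
			flags |= MAC_UNICAST_HW;

		return (flags);
	}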
- if (vswp->addr_set == VSW_ADDR_PROMISC) - (void) vsw_unset_hw_promisc(vswp, NULL, VSW_LOCALDEV); + vsw_mac_add_vlans(vswp, vswp->mch, macaddr, mac_flags, + vswp->vids, vswp->nvids); - vswp->addr_set = VSW_ADDR_HW; - } - RW_EXIT(&vswp->if_lockrw); + mac_rx_set(vswp->mch, vsw_if_rx_cb, (void *)vswp); D1(vswp, "%s: exit", __func__); - return (0); + return (rv); } /* - * Scan the port list for any ports which have not yet been set - * into HW. For those found attempt to program their mac addresses - * into the physical device. + * Remove a unicast mac address which has previously been programmed + * into HW. * - * Returns 0 if able to program all required ports (can be 0) into HW. - * Returns 1 if failed to set at least one mac address. */ -static int -vsw_prog_ports(vsw_t *vswp) +static void +vsw_unset_hw_addr(vsw_t *vswp, vsw_port_t *port, int type) { - mac_multi_addr_t addr; - vsw_port_list_t *plist = &vswp->plist; - vsw_port_t *tp; - int rv = 0; + vsw_vlanid_t *vids; + int nvids; + mac_client_handle_t mch = NULL; D1(vswp, "%s: enter", __func__); - ASSERT(MUTEX_HELD(&vswp->hw_lock)); + ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); - READ_ENTER(&plist->lockrw); - for (tp = plist->head; tp != NULL; tp = tp->p_next) { - if (tp->addr_set != VSW_ADDR_HW) { - addr.mma_addrlen = ETHERADDRL; - ether_copy(&tp->p_macaddr, &addr.mma_addr); - - if (vsw_set_hw_addr(vswp, &addr) != 0) { - rv = 1; - break; - } - - tp->addr_slot = addr.mma_slot; - - /* - * If when this port had first attached we had - * had to place the interface into promisc mode, - * then now reverse that. - * - * Note that the interface will not actually - * change to non-promisc mode until all ports - * have been programmed. - */ - if (tp->addr_set == VSW_ADDR_PROMISC) - (void) vsw_unset_hw_promisc(vswp, - tp, VSW_VNETPORT); - - tp->addr_set = VSW_ADDR_HW; - } + if (type == VSW_VNETPORT) { + ASSERT(port != NULL); + ASSERT(RW_WRITE_HELD(&port->maccl_rwlock)); + vids = port->vids; + nvids = port->nvids; + } else { + ASSERT(RW_WRITE_HELD(&vswp->maccl_rwlock)); + vids = vswp->vids; + nvids = vswp->nvids; } - RW_EXIT(&plist->lockrw); - D1(vswp, "%s: exit", __func__); - return (rv); -} + /* First clear the callback */ + if (type == VSW_LOCALDEV) { + mch = vswp->mch; + } else if (type == VSW_VNETPORT) { + mch = port->p_mch; + } -static void -vsw_mac_ring_tbl_entry_init(vsw_t *vswp, vsw_mac_ring_t *ringp) -{ - ringp->ring_state = VSW_MAC_RING_FREE; - ringp->ring_arg = NULL; - ringp->ring_blank = NULL; - ringp->ring_vqp = NULL; - ringp->ring_vswp = vswp; -} -static void -vsw_mac_ring_tbl_init(vsw_t *vswp) -{ - int i; + if (mch == NULL) { + return; + } - mutex_init(&vswp->mac_ring_lock, NULL, MUTEX_DRIVER, NULL); + mac_rx_clear(mch); - vswp->mac_ring_tbl_sz = vsw_mac_rx_rings; - vswp->mac_ring_tbl = - kmem_alloc(vsw_mac_rx_rings * sizeof (vsw_mac_ring_t), KM_SLEEP); + /* Remove vlans */ + vsw_mac_remove_vlans(mch, vids, nvids); - for (i = 0; i < vswp->mac_ring_tbl_sz; i++) - vsw_mac_ring_tbl_entry_init(vswp, &vswp->mac_ring_tbl[i]); -} + if ((type == VSW_LOCALDEV) && (vswp->addr_set == B_TRUE)) { + (void) mac_unicast_remove(vswp->mch, vswp->muh); + vswp->muh = NULL; + D2(vswp, "removed vsw interface mac-addr from " + "the device %s", vswp->physname); + vswp->addr_set = B_FALSE; -static void -vsw_mac_ring_tbl_destroy(vsw_t *vswp) -{ - int i; - vsw_mac_ring_t *ringp; - - mutex_enter(&vswp->mac_ring_lock); - for (i = 0; i < vswp->mac_ring_tbl_sz; i++) { - ringp = &vswp->mac_ring_tbl[i]; - -
if (ringp->ring_state != VSW_MAC_RING_FREE) { - /* - * Destroy the queue. - */ - vsw_queue_stop(ringp->ring_vqp); - vsw_queue_destroy(ringp->ring_vqp); - - /* - * Re-initialize the structure. - */ - vsw_mac_ring_tbl_entry_init(vswp, ringp); - } + } else if ((type == VSW_VNETPORT) && (port->addr_set == B_TRUE)) { + (void) mac_unicast_remove(port->p_mch, port->p_muh); + port->p_muh = NULL; + D2(vswp, "removed port(0x%p) mac-addr from " + "the device %s", port, vswp->physname); + port->addr_set = B_FALSE; } - mutex_exit(&vswp->mac_ring_lock); - mutex_destroy(&vswp->mac_ring_lock); - kmem_free(vswp->mac_ring_tbl, - vswp->mac_ring_tbl_sz * sizeof (vsw_mac_ring_t)); - vswp->mac_ring_tbl_sz = 0; + D1(vswp, "%s: exit", __func__); } /* - * Handle resource add callbacks from the driver below. + * receive callback routine for vsw interface. Invoked by MAC layer when there + * are pkts being passed up from physical device for this vsw interface. */ -static mac_resource_handle_t -vsw_mac_ring_add_cb(void *arg, mac_resource_t *mrp) +/* ARGSUSED */ +static void +vsw_if_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { + _NOTE(ARGUNUSED(mrh)) + vsw_t *vswp = (vsw_t *)arg; - mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp; - vsw_mac_ring_t *ringp; - vsw_queue_t *vqp; - int i; + mblk_t *mpt; + int count; ASSERT(vswp != NULL); - ASSERT(mrp != NULL); - ASSERT(vswp->mac_ring_tbl != NULL); D1(vswp, "%s: enter", __func__); - /* - * Check to make sure we have the correct resource type. - */ - if (mrp->mr_type != MAC_RX_FIFO) - return (NULL); - - /* - * Find a open entry in the ring table. - */ - mutex_enter(&vswp->mac_ring_lock); - for (i = 0; i < vswp->mac_ring_tbl_sz; i++) { - ringp = &vswp->mac_ring_tbl[i]; - - /* - * Check for an empty slot, if found, then setup queue - * and thread. - */ - if (ringp->ring_state == VSW_MAC_RING_FREE) { - /* - * Create the queue for this ring. - */ - vqp = vsw_queue_create(); - - /* - * Initialize the ring data structure. - */ - ringp->ring_vqp = vqp; - ringp->ring_arg = mrfp->mrf_arg; - ringp->ring_blank = mrfp->mrf_blank; - ringp->ring_state = VSW_MAC_RING_INUSE; - - /* - * Create the worker thread. - */ - vqp->vq_worker = thread_create(NULL, 0, - vsw_queue_worker, ringp, 0, &p0, - TS_RUN, minclsyspri); - if (vqp->vq_worker == NULL) { - vsw_queue_destroy(vqp); - vsw_mac_ring_tbl_entry_init(vswp, ringp); - ringp = NULL; - } - - if (ringp != NULL) { - /* - * Make sure thread get's running state for - * this ring. - */ - mutex_enter(&vqp->vq_lock); - while ((vqp->vq_state != VSW_QUEUE_RUNNING) && - (vqp->vq_state != VSW_QUEUE_DRAINED)) { - cv_wait(&vqp->vq_cv, &vqp->vq_lock); - } - - /* - * If the thread is not running, cleanup. - */ - if (vqp->vq_state == VSW_QUEUE_DRAINED) { - vsw_queue_destroy(vqp); - vsw_mac_ring_tbl_entry_init(vswp, - ringp); - ringp = NULL; - } - mutex_exit(&vqp->vq_lock); - } - - mutex_exit(&vswp->mac_ring_lock); - D1(vswp, "%s: exit", __func__); - return ((mac_resource_handle_t)ringp); + READ_ENTER(&vswp->if_lockrw); + if (vswp->if_state & VSW_IF_UP) { + RW_EXIT(&vswp->if_lockrw); + count = vsw_vlan_frame_untag(vswp, VSW_LOCALDEV, &mp, &mpt); + if (count != 0) { + mac_rx(vswp->if_mh, NULL, mp); } + } else { + RW_EXIT(&vswp->if_lockrw); + freemsgchain(mp); } - mutex_exit(&vswp->mac_ring_lock); - /* - * No slots in the ring table available. - */ D1(vswp, "%s: exit", __func__); - return (NULL); } +/* + * receive callback routine for port. 
Invoked by MAC layer when there + * are pkts being passed up from physical device for this port. + */ +/* ARGSUSED */ static void -vsw_queue_stop(vsw_queue_t *vqp) +vsw_port_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) { - mutex_enter(&vqp->vq_lock); + _NOTE(ARGUNUSED(mrh)) - if (vqp->vq_state == VSW_QUEUE_RUNNING) { - vqp->vq_state = VSW_QUEUE_STOP; - cv_signal(&vqp->vq_cv); + vsw_t *vswp; + vsw_port_t *port = arg; - while (vqp->vq_state != VSW_QUEUE_DRAINED) - cv_wait(&vqp->vq_cv, &vqp->vq_lock); - } + ASSERT(port != NULL); + + vswp = port->p_vswp; - vqp->vq_state = VSW_QUEUE_STOPPED; + D1(vswp, "vsw_port_rx_cb: enter"); - mutex_exit(&vqp->vq_lock); + /* + * Send the packets to the peer directly. + */ + (void) vsw_portsend(port, mp); + + D1(vswp, "vsw_port_rx_cb: exit"); } -static vsw_queue_t * -vsw_queue_create() +/* + * Send a message out over the physical device + * via the MAC layer. + * + * Returns any mblks that it was unable to transmit. + */ +mblk_t * +vsw_tx_msg(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port) { - vsw_queue_t *vqp; + mac_client_handle_t mch; + mac_unicast_handle_t muh; - vqp = kmem_zalloc(sizeof (vsw_queue_t), KM_SLEEP); + READ_MACCL_ENTER(vswp, port, caller); - mutex_init(&vqp->vq_lock, NULL, MUTEX_DRIVER, NULL); - cv_init(&vqp->vq_cv, NULL, CV_DRIVER, NULL); - vqp->vq_first = NULL; - vqp->vq_last = NULL; - vqp->vq_state = VSW_QUEUE_STOPPED; + mch = (caller == VSW_LOCALDEV) ? vswp->mch : port->p_mch; + muh = (caller == VSW_LOCALDEV) ? vswp->muh : port->p_muh; - return (vqp); -} + if ((mch != NULL) && (muh != NULL)) { + /* packets are sent or dropped */ + (void) mac_tx(mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL); + } -static void -vsw_queue_destroy(vsw_queue_t *vqp) -{ - cv_destroy(&vqp->vq_cv); - mutex_destroy(&vqp->vq_lock); - kmem_free(vqp, sizeof (vsw_queue_t)); + RW_MACCL_EXIT(vswp, port, caller); + return (NULL); } -static void -vsw_queue_worker(vsw_mac_ring_t *rrp) +/* + * vsw_port_mac_reconfig -- Cleanup and close the MAC client + * and reopen and re-configure the MAC client with new flags etc. + * This function is useful for two different purposes: + * 1) To update the MAC client with new vlan-ids. This is done + * by freeing the existing vlan-ids and reopening with the new + * vlan-ids. + * + * 2) If the Hybrid mode status of a port changes, then the + * MAC client needs to be closed and re-opened, otherwise, + * Share related resources may not be freed (hybrid mode disabled) + * or assigned (hybrid mode enabled). To accomplish this, + * this function simply closes and reopens the MAC client. + * The reopen will result in using the flags based on the + * new hybrid mode of the port. + */ +void +vsw_port_mac_reconfig(vsw_port_t *portp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids) { - mblk_t *mp; - vsw_queue_t *vqp = rrp->ring_vqp; - vsw_t *vswp = rrp->ring_vswp; - - mutex_enter(&vqp->vq_lock); - - ASSERT(vqp->vq_state == VSW_QUEUE_STOPPED); + vsw_t *vswp = portp->p_vswp; + int rv; + D1(vswp, "%s: enter", __func__); /* - * Set the state to running, since the thread is now active. + * Remove the multi-cast addresses, unicast address + * and close the mac-client. */ - vqp->vq_state = VSW_QUEUE_RUNNING; - cv_signal(&vqp->vq_cv); - - while (vqp->vq_state == VSW_QUEUE_RUNNING) { - /* - * Wait for work to do or the state has changed - * to not running.
- */
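The new vsw_tx_msg() relies on one property of mac_tx(): with MAC_DROP_ON_NO_DESC the MAC layer either transmits the chain or frees it itself, so the old return-and-requeue path disappears. A minimal sketch of that contract (the hint of 0 and NULL return pointer mirror the call above):

	static void
	example_tx(mac_client_handle_t mch, mblk_t *mp)
	{
		/*
		 * Packets are sent or dropped; nothing comes back to
		 * requeue, so the caller can always return NULL.
		 */
		(void) mac_tx(mch, mp, 0, MAC_DROP_ON_NO_DESC, NULL);
	}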
- while ((vqp->vq_state == VSW_QUEUE_RUNNING) && - (vqp->vq_first == NULL)) { - cv_wait(&vqp->vq_cv, &vqp->vq_lock); - } - - /* - * Process packets that we received from the interface. - */ - if (vqp->vq_first != NULL) { - mp = vqp->vq_first; - - vqp->vq_first = NULL; - vqp->vq_last = NULL; - - mutex_exit(&vqp->vq_lock); - - /* switch the chain of packets received */ - vswp->vsw_switch_frame(vswp, mp, - VSW_PHYSDEV, NULL, NULL); - - mutex_enter(&vqp->vq_lock); + mutex_enter(&vswp->mac_lock); + WRITE_ENTER(&portp->maccl_rwlock); + vsw_mac_multicast_remove_all(vswp, portp, VSW_VNETPORT); + vsw_unset_hw(vswp, portp, VSW_VNETPORT); + vsw_maccl_close(vswp, portp, VSW_VNETPORT); + + if (update_vlans == B_TRUE) { + if (portp->nvids != 0) { + kmem_free(portp->vids, + sizeof (vsw_vlanid_t) * portp->nvids); + portp->vids = NULL; + portp->nvids = 0; } + portp->vids = new_vids; + portp->nvids = new_nvids; + portp->pvid = new_pvid; } /* - * We are drained and signal we are done. + * Now re-open the mac-client and + * configure unicast addr and multicast addrs. */ - vqp->vq_state = VSW_QUEUE_DRAINED; - cv_signal(&vqp->vq_cv); + rv = vsw_maccl_open(vswp, portp, VSW_VNETPORT); + if (rv != 0) { + goto recret; + } - /* - * Exit lock and drain the remaining packets. - */ - mutex_exit(&vqp->vq_lock); + if (vsw_set_hw(vswp, portp, VSW_VNETPORT)) { + cmn_err(CE_NOTE, "!vsw%d: port:%d failed to " + "set unicast address\n", vswp->instance, portp->p_instance); + goto recret; + } - /* - * Exit the thread - */ - thread_exit(); + vsw_mac_multicast_add_all(vswp, portp, VSW_VNETPORT); + +recret: + RW_EXIT(&portp->maccl_rwlock); + mutex_exit(&vswp->mac_lock); + D1(vswp, "%s: exit", __func__); } /* - * static void - * vsw_rx_queue_cb() - Receive callback routine when - * vsw_multi_ring_enable is non-zero. Queue the packets - * to a packet queue for a worker thread to process. + * vsw_if_mac_reconfig -- Reconfigure the vsw interface's mac-client + * by closing and re-opening it. This function is used to handle the + * following two cases: + * + * 1) Handle the MAC address change for the interface. + * 2) Handle vlan update. */ -static void -vsw_rx_queue_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +void +vsw_if_mac_reconfig(vsw_t *vswp, boolean_t update_vlans, + uint16_t new_pvid, vsw_vlanid_t *new_vids, int new_nvids) { - vsw_mac_ring_t *ringp = (vsw_mac_ring_t *)mrh; - vsw_t *vswp = (vsw_t *)arg; - vsw_queue_t *vqp; - mblk_t *bp, *last; - - ASSERT(mrh != NULL); - ASSERT(vswp != NULL); - ASSERT(mp != NULL); + int rv; D1(vswp, "%s: enter", __func__); - /* - * Find the last element in the mblk chain. + * Remove the multi-cast addresses, unicast address + * and close the mac-client. */ - bp = mp; - do { - last = bp; - bp = bp->b_next; - } while (bp != NULL); - - /* Get the queue for the packets */ - vqp = ringp->ring_vqp; - - /* - * Grab the lock such we can queue the packets.
- */
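Both reconfiguration routines follow the same teardown/bring-up bracket around the closed MAC client. A condensed sketch of the ordering, built from the vsw helpers shown in this diff (locking elided):

	static void
	example_reconfig(vsw_t *vswp)
	{
		/* teardown: multicast addrs, unicast addr, then the client */
		vsw_mac_multicast_remove_all(vswp, NULL, VSW_LOCALDEV);
		vsw_unset_hw(vswp, NULL, VSW_LOCALDEV);
		vsw_maccl_close(vswp, NULL, VSW_LOCALDEV);

		/* ... swap in a new pvid/vid list here if required ... */

		/* bring-up: reopen, reprogram, re-add multicast addrs */
		if (vsw_maccl_open(vswp, NULL, VSW_LOCALDEV) != 0)
			return;
		if (vsw_set_hw(vswp, NULL, VSW_LOCALDEV) != 0)
			return;
		vsw_mac_multicast_add_all(vswp, NULL, VSW_LOCALDEV);
	}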
- mutex_enter(&vqp->vq_lock); - - if (vqp->vq_state != VSW_QUEUE_RUNNING) { - freemsgchain(mp); - mutex_exit(&vqp->vq_lock); - goto vsw_rx_queue_cb_exit; + mutex_enter(&vswp->mac_lock); + WRITE_ENTER(&vswp->maccl_rwlock); + vsw_mac_multicast_remove_all(vswp, NULL, VSW_LOCALDEV); + vsw_unset_hw(vswp, NULL, VSW_LOCALDEV); + vsw_maccl_close(vswp, NULL, VSW_LOCALDEV); + + if (update_vlans == B_TRUE) { + if (vswp->nvids != 0) { + kmem_free(vswp->vids, + sizeof (vsw_vlanid_t) * vswp->nvids); + vswp->vids = NULL; + vswp->nvids = 0; + } + vswp->vids = new_vids; + vswp->nvids = new_nvids; + vswp->pvid = new_pvid; } /* - * Add the mblk chain to the queue. If there - * is some mblks in the queue, then add the new - * chain to the end. + * Now re-open the mac-client and + * configure unicast addr and multicast addrs. */ - if (vqp->vq_first == NULL) - vqp->vq_first = mp; - else - vqp->vq_last->b_next = mp; - - vqp->vq_last = last; + rv = vsw_maccl_open(vswp, NULL, VSW_LOCALDEV); + if (rv != 0) { + goto ifrecret; + } - /* - * Signal the worker thread that there is work to - * do. - */ - cv_signal(&vqp->vq_cv); + if (vsw_set_hw(vswp, NULL, VSW_LOCALDEV)) { + cmn_err(CE_NOTE, "!vsw%d:failed to set unicast address\n", + vswp->instance); + goto ifrecret; + } - /* - * Let go of the lock and exit. - */ - mutex_exit(&vqp->vq_lock); + vsw_mac_multicast_add_all(vswp, NULL, VSW_LOCALDEV); -vsw_rx_queue_cb_exit: +ifrecret: + RW_EXIT(&vswp->maccl_rwlock); + mutex_exit(&vswp->mac_lock); D1(vswp, "%s: exit", __func__); } /* - * receive callback routine. Invoked by MAC layer when there - * are pkts being passed up from physical device. + * vsw_mac_port_reconfig_vlans -- Reconfigure a port to handle + * vlan configuration update. As the removal of the last unicast-address,vid + * from the MAC client results in releasing all resources, it expects + * no Shares to be associated with such a MAC client. * - * PERF: It may be more efficient when the card is in promisc - * mode to check the dest address of the pkts here (against - * the FDB) rather than checking later. Needs to be investigated. + * To handle a vlan configuration update for a port that already has + * a Share bound, we need to free that Share prior to reconfiguration. + * Initiate the HybridIO setup again after the completion of reconfiguration. */ -static void -vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp) +void +vsw_mac_port_reconfig_vlans(vsw_port_t *portp, uint16_t new_pvid, + vsw_vlanid_t *new_vids, int new_nvids) { - _NOTE(ARGUNUSED(mrh)) - - vsw_t *vswp = (vsw_t *)arg; + /* + * As the reconfiguration involves the close of + * mac client, cleanup HybridIO and later restart + * HybridIO setup again. + */ + if (portp->p_hio_enabled == B_TRUE) { + vsw_hio_stop_port(portp); + } + vsw_port_mac_reconfig(portp, B_TRUE, new_pvid, new_vids, new_nvids); + if (portp->p_hio_enabled == B_TRUE) { + /* reset to setup the HybridIO again. */ + vsw_hio_port_reset(portp, B_FALSE); + } +}
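The update_vlans path in both routines swaps the vid array wholesale while the client is closed. A sketch of that step alone; it assumes the caller hands over a freshly allocated vsw_vlanid_t array whose vl_set fields are B_FALSE, so the reopen programs every vid:

	static void
	example_update_vids(vsw_vlanid_t **vidsp, int *nvidsp,
	    vsw_vlanid_t *new_vids, int new_nvids)
	{
		/* unicast handles were already removed at close time */
		if (*nvidsp != 0)
			kmem_free(*vidsp, sizeof (vsw_vlanid_t) * *nvidsp);

		/* adopt the new array; vsw_mac_add_vlans() fills vl_muh */
		*vidsp = new_vids;
		*nvidsp = new_nvids;
	}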
- ASSERT(vswp != NULL); +/* Add vlans to MAC client */ +static void +vsw_mac_add_vlans(vsw_t *vswp, mac_client_handle_t mch, uint8_t *macaddr, + uint16_t flags, vsw_vlanid_t *vids, int nvids) +{ + vsw_vlanid_t *vidp; + mac_diag_t diag; + int rv; + int i; - D1(vswp, "vsw_rx_cb: enter"); + /* Add vlans to the MAC layer */ + for (i = 0; i < nvids; i++) { + vidp = &vids[i]; - /* switch the chain of packets received */ - vswp->vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL); + if (vidp->vl_set == B_TRUE) { + continue; + } - D1(vswp, "vsw_rx_cb: exit"); + rv = mac_unicast_add(mch, macaddr, flags, + &vidp->vl_muh, vidp->vl_vid, &diag); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Failed to program " + "macaddr,vid(%s, %d) err=%d", + vswp->instance, ether_sprintf((void *)macaddr), + vidp->vl_vid, rv); + } else { + vidp->vl_set = B_TRUE; + D2(vswp, "%s:programmed macaddr(%s) vid(%d) " + "into device %s", __func__, + ether_sprintf((void *)macaddr), + vidp->vl_vid, vswp->physname); + } + } } -/* - * Send a message out over the physical device via the MAC layer. - * - * Returns any mblks that it was unable to transmit. - */ -mblk_t * -vsw_tx_msg(vsw_t *vswp, mblk_t *mp) +/* Remove vlans from the MAC client */ +static void +vsw_mac_remove_vlans(mac_client_handle_t mch, vsw_vlanid_t *vids, int nvids) { - const mac_txinfo_t *mtp; + int i; + vsw_vlanid_t *vidp; - READ_ENTER(&vswp->mac_rwlock); - if ((vswp->mh == NULL) || (vswp->mstarted == B_FALSE)) { - - DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail"); - RW_EXIT(&vswp->mac_rwlock); - return (mp); - } else { - mtp = vswp->txinfo; - mp = mtp->mt_fn(mtp->mt_arg, mp); + for (i = 0; i < nvids; i++) { + vidp = &vids[i]; + if (vidp->vl_set == B_FALSE) { + continue; + } + mac_unicast_remove(mch, vidp->vl_muh); + vidp->vl_set = B_FALSE; } - RW_EXIT(&vswp->mac_rwlock); - - return (mp); } #define ARH_FIXED_LEN 8 /* Length of fixed part of ARP header(see arp.h) */ @@ -1386,7 +1159,7 @@ vsw_tx_msg(vsw_t *vswp, mblk_t *mp) * vsw_publish_macaddr_count to zero in /etc/system.
 */ void -vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr) +vsw_publish_macaddr(vsw_t *vswp, vsw_port_t *portp) { mblk_t *mp; mblk_t *bp; @@ -1404,7 +1177,7 @@ vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr) /* Initialize eth header */ ehp = (struct ether_header *)mp->b_rptr; bcopy(&etherbroadcastaddr, &ehp->ether_dhost, ETHERADDRL); - bcopy(addr, &ehp->ether_shost, ETHERADDRL); + bcopy(&portp->p_macaddr, &ehp->ether_shost, ETHERADDRL); ehp->ether_type = htons(ETHERTYPE_REVARP); /* Initialize arp packet */ @@ -1420,13 +1193,13 @@ vsw_publish_macaddr(vsw_t *vswp, uint8_t *addr) cp += ARH_FIXED_LEN; /* Sender's hardware address and protocol address */ - bcopy(addr, cp, ETHERADDRL); + bcopy(&portp->p_macaddr, cp, ETHERADDRL); cp += ETHERADDRL; bzero(cp, plen); /* INADDR_ANY */ cp += plen; /* Target hardware address and protocol address */ - bcopy(addr, cp, ETHERADDRL); + bcopy(&portp->p_macaddr, cp, ETHERADDRL); cp += ETHERADDRL; bzero(cp, plen); /* INADDR_ANY */ cp += plen; @@ -1441,7 +1214,7 @@ } /* transmit the packet */ - bp = vsw_tx_msg(vswp, bp); + bp = vsw_tx_msg(vswp, bp, VSW_VNETPORT, portp); if (bp != NULL) { freemsg(bp); } @@ -1453,50 +1226,18 @@ static void vsw_mac_set_mtu(vsw_t *vswp, uint32_t mtu) { - mac_prop_t mp; - uint32_t val; - int rv; - uint_t perm_flags = MAC_PROP_PERM_RW; - mp.mp_id = MAC_PROP_MTU; - mp.mp_name = mac_mtu_propname; - mp.mp_flags = 0; - - /* Get the mtu of the physical device */ - rv = mac_get_prop(vswp->mh, &mp, (void *)&val, sizeof (uint32_t), - &perm_flags); - if (rv != 0) { - cmn_err(CE_NOTE, - "!vsw%d: Unable to get the mtu of the physical device:%s\n", - vswp->instance, vswp->physname); - return; - } - - /* Return if the mtu is read-only */ - if (perm_flags != MAC_PROP_PERM_RW) { - cmn_err(CE_NOTE, - "!vsw%d: Read-only mtu of the physical device:%s\n", - vswp->instance, vswp->physname); - return; - } - - /* save the original mtu of physdev to reset it back later if needed */ - vswp->mtu_physdev_orig = val; - - if (val == mtu) { - /* no need to set, as the device already has the right mtu */ - return; - } - - mp.mp_id = MAC_PROP_MTU; - mp.mp_name = mac_mtu_propname; - mp.mp_flags = 0; + uint_t mtu_orig; + int rv; - /* Set the mtu in the physical device */ - rv = mac_set_prop(vswp->mh, &mp, &mtu, sizeof (uint32_t)); + rv = mac_set_mtu(vswp->mh, mtu, &mtu_orig); if (rv != 0) { cmn_err(CE_NOTE, "!vsw%d: Unable to set the mtu:%d, in the " "physical device:%s\n", vswp->instance, mtu, vswp->physname); + return; } + + /* save the original mtu of physdev to reset it back later if needed */ + vswp->mtu_physdev_orig = mtu_orig; } diff --git a/usr/src/uts/sun4v/io/vsw_switching.c b/usr/src/uts/sun4v/io/vsw_switching.c index 8c4ad6d4d0..5033f0665c 100644 --- a/usr/src/uts/sun4v/io/vsw_switching.c +++ b/usr/src/uts/sun4v/io/vsw_switching.c @@ -58,7 +58,6 @@ #include <sys/taskq.h> #include <sys/note.h> #include <sys/mach_descrip.h> -#include <sys/mac.h> #include <sys/mdeg.h> #include <sys/ldc.h> #include <sys/vsw_fdb.h> @@ -82,6 +81,8 @@ static int vsw_setup_layer2(vsw_t *); static int vsw_setup_layer3(vsw_t *); /* Switching/data transmit routines */ +static void vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller, + vsw_port_t *port, mac_resource_handle_t); static void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port, mac_resource_handle_t); static void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, @@ -117,26 +118,26 @@
void vsw_del_mcst_vsw(vsw_t *); /* Support functions */ static mblk_t *vsw_dupmsgchain(mblk_t *mp); -static uint32_t vsw_get_same_dest_list(struct ether_header *ehp, - mblk_t **rhead, mblk_t **rtail, mblk_t **mpp); +static mblk_t *vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp); /* * Functions imported from other files. */ -extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *); +extern mblk_t *vsw_tx_msg(vsw_t *, mblk_t *, int, vsw_port_t *); extern mcst_addr_t *vsw_del_addr(uint8_t, void *, uint64_t); extern int vsw_mac_open(vsw_t *vswp); extern void vsw_mac_close(vsw_t *vswp); extern void vsw_mac_rx(vsw_t *vswp, mac_resource_handle_t mrh, mblk_t *mp, vsw_macrx_flags_t flags); extern void vsw_set_addrs(vsw_t *vswp); -extern int vsw_get_hw_maddr(vsw_t *); -extern int vsw_mac_attach(vsw_t *vswp); -extern int vsw_portsend(vsw_port_t *port, mblk_t *mp, mblk_t *mpt, - uint32_t count); +extern int vsw_portsend(vsw_port_t *port, mblk_t *mp); extern void vsw_hio_init(vsw_t *vswp); extern void vsw_hio_start_ports(vsw_t *vswp); +extern int vsw_mac_multicast_add(vsw_t *vswp, vsw_port_t *port, + mcst_addr_t *mcst_p, int type); +extern void vsw_mac_multicast_remove(vsw_t *vswp, vsw_port_t *port, + mcst_addr_t *mcst_p, int type); /* * Tunables used in this file. @@ -226,9 +227,9 @@ vsw_stop_switching_timeout(vsw_t *vswp) (void) atomic_swap_32(&vswp->switching_setup_done, B_FALSE); - WRITE_ENTER(&vswp->mac_rwlock); + mutex_enter(&vswp->mac_lock); vswp->mac_open_retries = 0; - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); } /* @@ -246,39 +247,24 @@ vsw_stop_switching_timeout(vsw_t *vswp) int vsw_setup_switching(vsw_t *vswp) { - int i, rv = 1; + int rv = 1; D1(vswp, "%s: enter", __func__); /* * Select best switching mode. - * Note that we start from the saved smode_idx. This is done as - * this routine can be called from the timeout handler to retry - * setting up a specific mode. Currently only the function which - * sets up layer2/promisc mode returns EAGAIN if the underlying - * physical device is not available yet, causing retries. + * This is done as this routine can be called from the timeout + * handler to retry setting up a specific mode. Currently only + * the function which sets up layer2/promisc mode returns EAGAIN + * if the underlying network device is not available yet, causing + * retries. */ - for (i = vswp->smode_idx; i < vswp->smode_num; i++) { - vswp->smode_idx = i; - switch (vswp->smode[i]) { - case VSW_LAYER2: - case VSW_LAYER2_PROMISC: - rv = vsw_setup_layer2(vswp); - break; - - case VSW_LAYER3: - rv = vsw_setup_layer3(vswp); - break; - - default: - DERR(vswp, "unknown switch mode"); - break; - } - - if ((rv == 0) || (rv == EAGAIN)) - break; - - /* all other errors(rv != 0): continue & select the next mode */ + if (vswp->smode & VSW_LAYER2) { + rv = vsw_setup_layer2(vswp); + } else if (vswp->smode & VSW_LAYER3) { + rv = vsw_setup_layer3(vswp); + } else { + DERR(vswp, "unknown switch mode"); rv = 1; } @@ -290,7 +276,7 @@ vsw_setup_switching(vsw_t *vswp) } D2(vswp, "%s: Operating in mode %d", __func__, - vswp->smode[vswp->smode_idx]); + vswp->smode); D1(vswp, "%s: exit", __func__); @@ -312,7 +298,12 @@ vsw_setup_layer2(vsw_t *vswp) D1(vswp, "%s: enter", __func__); + /* + * Until the network device is successfully opened, + * set the switching to use vsw_switch_l2_frame. 
+ */ vswp->vsw_switch_frame = vsw_switch_l2_frame; + vswp->mac_cl_switching = B_FALSE; rv = strlen(vswp->physname); if (rv == 0) { @@ -320,61 +311,42 @@ * Physical device name is NULL, which is * required for layer 2. */ - cmn_err(CE_WARN, "!vsw%d: no physical device name specified", + cmn_err(CE_WARN, "!vsw%d: no network device name specified", vswp->instance); return (EIO); } - WRITE_ENTER(&vswp->mac_rwlock); + mutex_enter(&vswp->mac_lock); rv = vsw_mac_open(vswp); if (rv != 0) { if (rv != EAGAIN) { - cmn_err(CE_WARN, "!vsw%d: Unable to open physical " + cmn_err(CE_WARN, "!vsw%d: Unable to open network " "device: %s\n", vswp->instance, vswp->physname); } - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); return (rv); } - if (vswp->smode[vswp->smode_idx] == VSW_LAYER2) { - /* - * Verify that underlying device can support multiple - * unicast mac addresses. - */ - rv = vsw_get_hw_maddr(vswp); - if (rv != 0) { - goto exit_error; - } - } - /* - * Attempt to link into the MAC layer so we can get - * and send packets out over the physical adapter. + * Now we can use the mac client switching, so set the switching + * function to use vsw_switch_l2_frame_mac_client(), which simply + * sends the packets to MAC layer for switching. */ - rv = vsw_mac_attach(vswp); - if (rv != 0) { - /* - * Registration with the MAC layer has failed, - * so return error so that can fall back to next - * prefered switching method. - */ - cmn_err(CE_WARN, "!vsw%d: Unable to setup physical device: " - "%s\n", vswp->instance, vswp->physname); - goto exit_error; - } + vswp->vsw_switch_frame = vsw_switch_l2_frame_mac_client; + vswp->mac_cl_switching = B_TRUE; D1(vswp, "%s: exit", __func__); - RW_EXIT(&vswp->mac_rwlock); - /* Initialize HybridIO related stuff */ vsw_hio_init(vswp); + + mutex_exit(&vswp->mac_lock); return (0); exit_error: vsw_mac_close(vswp); - RW_EXIT(&vswp->mac_rwlock); + mutex_exit(&vswp->mac_lock); return (EIO); } @@ -400,6 +372,31 @@ vsw_switch_frame_nop(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *port, } /* + * Use the mac client for layer 2 switching. + */ +static void +vsw_switch_l2_frame_mac_client(vsw_t *vswp, mblk_t *mp, int caller, + vsw_port_t *port, mac_resource_handle_t mrh) +{ + _NOTE(ARGUNUSED(mrh)) + + mblk_t *ret_m; + + /* + * This switching function is expected to be called by + * the ports or the interface only. Packets from the + * physical interface are already switched. + */ + ASSERT((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV)); + + if ((ret_m = vsw_tx_msg(vswp, mp, caller, port)) != NULL) { + DERR(vswp, "%s: drop mblks to " + "phys dev", __func__); + freemsgchain(ret_m); + } +} + +/* * Switch the given ethernet frame when operating in layer 2 mode.
* * vswp: pointer to the vsw instance @@ -419,8 +416,6 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, { struct ether_header *ehp; mblk_t *bp, *ret_m; - mblk_t *mpt = NULL; - uint32_t count; vsw_fdbe_t *fp; D1(vswp, "%s: enter (caller %d)", __func__, caller); @@ -435,8 +430,8 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, bp = mp; while (bp) { ehp = (struct ether_header *)bp->b_rptr; - count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); - ASSERT(count != 0); + mp = vsw_get_same_dest_list(ehp, &bp); + ASSERT(mp != NULL); D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", __func__, MBLKSIZE(mp), MBLKL(mp)); @@ -476,7 +471,7 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, * vsw_port (connected to a vnet device - * VSW_VNETPORT) */ - (void) vsw_portsend(fp->portp, mp, mpt, count); + (void) vsw_portsend(fp->portp, mp); /* Release the reference on the fdb entry */ VSW_FDBE_REFRELE(fp); @@ -517,8 +512,8 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, VSW_MACRX_PROMISC | VSW_MACRX_COPYMSG); - if ((ret_m = vsw_tx_msg(vswp, mp)) - != NULL) { + if ((ret_m = vsw_tx_msg(vswp, mp, + caller, arg)) != NULL) { DERR(vswp, "%s: drop mblks to " "phys dev", __func__); freemsgchain(ret_m); @@ -539,8 +534,8 @@ vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller, * Pkt came down the stack, send out * over physical device. */ - if ((ret_m = vsw_tx_msg(vswp, mp)) - != NULL) { + if ((ret_m = vsw_tx_msg(vswp, mp, + caller, NULL)) != NULL) { DERR(vswp, "%s: drop mblks to " "phys dev", __func__); freemsgchain(ret_m); @@ -566,8 +561,6 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, { struct ether_header *ehp; mblk_t *bp = NULL; - mblk_t *mpt; - uint32_t count; vsw_fdbe_t *fp; D1(vswp, "%s: enter (caller %d)", __func__, caller); @@ -587,8 +580,8 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, bp = mp; while (bp) { ehp = (struct ether_header *)bp->b_rptr; - count = vsw_get_same_dest_list(ehp, &mp, &mpt, &bp); - ASSERT(count != 0); + mp = vsw_get_same_dest_list(ehp, &bp); + ASSERT(mp != NULL); D2(vswp, "%s: mblk data buffer %lld : actual data size %lld", __func__, MBLKSIZE(mp), MBLKL(mp)); @@ -601,7 +594,7 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, if (fp != NULL) { D2(vswp, "%s: sending to target port", __func__); - (void) vsw_portsend(fp->portp, mp, mpt, count); + (void) vsw_portsend(fp->portp, mp); /* Release the reference on the fdb entry */ VSW_FDBE_REFRELE(fp); @@ -644,8 +637,7 @@ vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller, void vsw_setup_layer2_post_process(vsw_t *vswp) { - if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || - (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) { + if (vswp->smode & VSW_LAYER2) { /* * Program unicst, mcst addrs of vsw * interface and ports in the physdev. @@ -676,13 +668,13 @@ vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) * Broadcast message from inside ldoms so send to outside * world if in either of layer 2 modes. 
*/ - if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || - (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && + if ((vswp->smode & VSW_LAYER2) && ((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) { nmp = vsw_dupmsgchain(mp); if (nmp) { - if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { + if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg)) + != NULL) { DERR(vswp, "%s: dropping pkt(s) " "consisting of %ld bytes of data for" " physical device", __func__, MBLKL(ret_m)); @@ -716,20 +708,12 @@ vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) } else { nmp = vsw_dupmsgchain(mp); if (nmp) { - mblk_t *mpt = nmp; - uint32_t count = 1; - - /* Find tail */ - while (mpt->b_next != NULL) { - mpt = mpt->b_next; - count++; - } /* * The plist->lockrw is protecting the * portp from getting destroyed here. * So, no ref_cnt is incremented here. */ - (void) vsw_portsend(portp, nmp, mpt, count); + (void) vsw_portsend(portp, nmp); } else { DERR(vswp, "vsw_forward_all: nmp NULL"); } @@ -772,12 +756,12 @@ vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) * over the physical adapter, and then check to see if any other * vnets are interested in it. */ - if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) || - (vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) && + if ((vswp->smode & VSW_LAYER2) && ((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) { nmp = vsw_dupmsgchain(mp); if (nmp) { - if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) { + if ((ret_m = vsw_tx_msg(vswp, nmp, caller, arg)) + != NULL) { DERR(vswp, "%s: dropping pkt(s) consisting of " "%ld bytes of data for physical device", __func__, MBLKL(ret_m)); @@ -819,21 +803,12 @@ vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg) nmp = vsw_dupmsgchain(mp); if (nmp) { - mblk_t *mpt = nmp; - uint32_t count = 1; - - /* Find tail */ - while (mpt->b_next != NULL) { - mpt = mpt->b_next; - count++; - } /* * The vswp->mfdbrw is protecting the * portp from getting destroyed here. * So, no ref_cnt is incremented here. 
 */ - (void) vsw_portsend(port, nmp, mpt, - count); + (void) vsw_portsend(port, nmp); } } else { vsw_mac_rx(vswp, NULL, @@ -970,32 +945,46 @@ vsw_vlan_add_ids rv = mod_hash_insert(vswp->vlan_hashp, (mod_hash_key_t)VLAN_ID_KEY(vswp->pvid), (mod_hash_val_t)B_TRUE); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for " + "the interface", vswp->instance, vswp->pvid); + } for (i = 0; i < vswp->nvids; i++) { rv = mod_hash_insert(vswp->vlan_hashp, - (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]), + (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i].vl_vid), (mod_hash_val_t)B_TRUE); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)" + " for the interface", vswp->instance, + vswp->vids[i].vl_vid); + } } } else if (type == VSW_VNETPORT) { vsw_port_t *portp = (vsw_port_t *)arg; + vsw_t *vswp = portp->p_vswp; rv = mod_hash_insert(portp->vlan_hashp, (mod_hash_key_t)VLAN_ID_KEY(portp->pvid), (mod_hash_val_t)B_TRUE); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d) for " + "the port(%d)", vswp->instance, portp->pvid, + portp->p_instance); + } for (i = 0; i < portp->nvids; i++) { rv = mod_hash_insert(portp->vlan_hashp, - (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]), + (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i].vl_vid), (mod_hash_val_t)B_TRUE); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate vlan-id(%d)" + " for the port(%d)", vswp->instance, + portp->vids[i].vl_vid, portp->p_instance); + } } - } else { - return; } } @@ -1021,10 +1010,12 @@ } for (i = 0; i < vswp->nvids; i++) { - rv = vsw_vlan_lookup(vswp->vlan_hashp, vswp->vids[i]); + rv = vsw_vlan_lookup(vswp->vlan_hashp, + vswp->vids[i].vl_vid); if (rv == B_TRUE) { rv = mod_hash_remove(vswp->vlan_hashp, - (mod_hash_key_t)VLAN_ID_KEY(vswp->vids[i]), + (mod_hash_key_t)VLAN_ID_KEY( + vswp->vids[i].vl_vid), (mod_hash_val_t *)&vp); ASSERT(rv == 0); } @@ -1043,10 +1034,12 @@ } for (i = 0; i < portp->nvids; i++) { - rv = vsw_vlan_lookup(portp->vlan_hashp, portp->vids[i]); + rv = vsw_vlan_lookup(portp->vlan_hashp, + portp->vids[i].vl_vid); if (rv == B_TRUE) { rv = mod_hash_remove(portp->vlan_hashp, - (mod_hash_key_t)VLAN_ID_KEY(portp->vids[i]), + (mod_hash_key_t)VLAN_ID_KEY( + portp->vids[i].vl_vid), (mod_hash_val_t *)&vp); ASSERT(rv == 0); } @@ -1097,7 +1090,11 @@ vsw_fdbe_add(vsw_t *vswp, void *port) */ rv = mod_hash_insert(vswp->fdb_hashp, (mod_hash_key_t)addr, (mod_hash_val_t)fp); - ASSERT(rv == 0); + if (rv != 0) { + cmn_err(CE_WARN, "vsw%d: Duplicate mac-address(%s) for " + "the port(%d)", vswp->instance, + ether_sprintf(&portp->p_macaddr), portp->p_instance); + } } /* @@ -1264,7 +1261,7 @@ vsw_vlan_frame_pretag * Returns: * np: head of updated chain of packets * npt: tail of updated chain of packets - * rv: count of any packets dropped + * rv: count of the packets in the returned list */ uint32_t vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt) @@ -1285,6 +1282,7 @@ vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt) ASSERT((type == VSW_LOCALDEV) || (type == VSW_VNETPORT)); + if (type == VSW_LOCALDEV) { vswp = (vsw_t *)arg; pvid = vswp->pvid; @@ -1298,6 +1296,27 @@ pvid = portp->pvid; } + /* + * If MAC layer switching is in place, then + * untagging is required only if the pvid is not + * the same as default_vlan_id. This is because + * the MAC layer will send packets for the + * registered vlans only. + */ + if ((vswp->mac_cl_switching == B_TRUE) && + (pvid == vswp->default_vlan_id)) { + /* simply count and set the tail */ + count = 1; + bp = *np; + ASSERT(bp != NULL); + while (bp->b_next != NULL) { + bp = bp->b_next; + count++; + } + *npt = bp; + return (count); + } + bpn = bph = bpt = NULL; count = 0; @@ -1313,45 +1332,67 @@ is_tagged = vsw_frame_lookup_vid(arg, type, ehp, &vlan_id); /* - * Check if the destination is in the same vlan. + * If MAC layer switching is in place, then we + * need to untag only if the tagged packet has + * a vlan-id same as the pvid. */ - rv = vsw_vlan_lookup(vlan_hashp, vlan_id); - if (rv == B_FALSE) { - /* drop the packet */ - freemsg(bp); - count++; - continue; - } + if (vswp->mac_cl_switching == B_TRUE) { - /* - * Check the frame header if tag/untag is needed. - */ - if (is_tagged == B_FALSE) { - /* - * Untagged frame. We shouldn't have an untagged - * packet at this point, unless the destination's - * vlan id is default-vlan-id; if it is not the - * default-vlan-id, we drop the packet. - */ - if (vlan_id != vswp->default_vlan_id) { - /* drop the packet */ - freemsg(bp); - count++; - continue; - } - } else { - /* - * Tagged frame, untag if it's the destination's pvid. - */ + /* only tagged packets expected here */ + ASSERT(is_tagged == B_TRUE); if (vlan_id == pvid) { - bp = vnet_vlan_remove_tag(bp); if (bp == NULL) { /* packet dropped */ - count++; continue; } } + } else { /* No MAC layer switching */ + + /* + * Check the frame header if tag/untag is needed. + */ + if (is_tagged == B_FALSE) { + /* + * Untagged frame. We shouldn't have an + * untagged packet at this point, unless + * the destination's vlan id is + * default-vlan-id; if it is not the + * default-vlan-id, we drop the packet. + */ + if (vlan_id != vswp->default_vlan_id) { + /* drop the packet */ + freemsg(bp); + continue; + } + } else { /* Tagged */ + /* + * Tagged frame, untag if it's the + * destination's pvid. + */ + if (vlan_id == pvid) { + + bp = vnet_vlan_remove_tag(bp); + if (bp == NULL) { + /* packet dropped */ + continue; + } + } else { + + /* + * Check if the destination is in the
+ */ + rv = vsw_vlan_lookup(vlan_hashp, + vlan_id); + if (rv == B_FALSE) { + /* drop the packet */ + freemsg(bp); + continue; + } + } + + } } /* build a chain of processed packets */ @@ -1361,12 +1402,11 @@ vsw_vlan_frame_untag(void *arg, int type, mblk_t **np, mblk_t **npt) bpt->b_next = bp; bpt = bp; } - + count++; } *np = bph; *npt = bpt; - return (count); } @@ -1476,26 +1516,13 @@ vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) * just increments a ref counter (which is * used when the address is being deleted) */ - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL) { - if (mac_multicst_add(vswp->mh, - (uchar_t *)&mcst_pkt->mca[i])) { - RW_EXIT(&vswp->mac_rwlock); - cmn_err(CE_WARN, "!vsw%d: " - "unable to add multicast " - "address: %s\n", - vswp->instance, - ether_sprintf((void *) - &mcst_p->mca)); - (void) vsw_del_mcst(vswp, - VSW_VNETPORT, addr, port); - kmem_free(mcst_p, - sizeof (*mcst_p)); - return (1); - } - mcst_p->mac_added = B_TRUE; + if (vsw_mac_multicast_add(vswp, port, mcst_p, + VSW_VNETPORT)) { + (void) vsw_del_mcst(vswp, + VSW_VNETPORT, addr, port); + kmem_free(mcst_p, sizeof (*mcst_p)); + return (1); } - RW_EXIT(&vswp->mac_rwlock); mutex_enter(&port->mca_lock); mcst_p->nextp = port->mcap; @@ -1530,24 +1557,8 @@ vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port) * if other ports are interested in this * address. */ - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL && mcst_p->mac_added) { - if (mac_multicst_remove(vswp->mh, - (uchar_t *)&mcst_pkt->mca[i])) { - RW_EXIT(&vswp->mac_rwlock); - cmn_err(CE_WARN, "!vsw%d: " - "unable to remove mcast " - "address: %s\n", - vswp->instance, - ether_sprintf((void *) - &mcst_p->mca)); - kmem_free(mcst_p, - sizeof (*mcst_p)); - return (1); - } - mcst_p->mac_added = B_FALSE; - } - RW_EXIT(&vswp->mac_rwlock); + vsw_mac_multicast_remove(vswp, port, mcst_p, + VSW_VNETPORT); kmem_free(mcst_p, sizeof (*mcst_p)); } else { @@ -1780,13 +1791,7 @@ vsw_del_mcst_port(vsw_port_t *port) * if other ports are interested in this * address. */ - WRITE_ENTER(&vswp->mac_rwlock); - if (vswp->mh != NULL && mcap->mac_added) { - (void) mac_multicst_remove(vswp->mh, - (uchar_t *)&mcap->mca); - } - RW_EXIT(&vswp->mac_rwlock); - + vsw_mac_multicast_remove(vswp, port, mcap, VSW_VNETPORT); kmem_free(mcap, sizeof (*mcap)); mutex_enter(&port->mca_lock); @@ -1829,11 +1834,9 @@ vsw_del_mcst_vsw(vsw_t *vswp) D1(vswp, "%s: exit", __func__); } -static uint32_t -vsw_get_same_dest_list(struct ether_header *ehp, - mblk_t **rhead, mblk_t **rtail, mblk_t **mpp) +mblk_t * +vsw_get_same_dest_list(struct ether_header *ehp, mblk_t **mpp) { - uint32_t count = 0; mblk_t *bp; mblk_t *nbp; mblk_t *head = NULL; @@ -1860,16 +1863,12 @@ vsw_get_same_dest_list(struct ether_header *ehp, tail->b_next = bp; tail = bp; } - count++; } else { prev = bp; } bp = nbp; } - *rhead = head; - *rtail = tail; - DTRACE_PROBE1(vsw_same_dest, int, count); - return (count); + return (head); } static mblk_t * diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c index 694930fe28..df698ebe69 100644 --- a/usr/src/uts/sun4v/os/mach_startup.c +++ b/usr/src/uts/sun4v/os/mach_startup.c @@ -308,18 +308,18 @@ mach_hw_copy_limit(void) } /* - * We need to enable soft ring functionality on Niagara platform since - * one strand can't handle interrupts for a 1Gb NIC. Set the tunable - * ip_squeue_soft_ring by default on this platform. We can also set - * ip_threads_per_cpu to track number of threads per core. 
The variables - * themselves are defined in space.c and used by IP module + * We need to enable soft ring functionality on Niagara platforms since + * one strand can't handle interrupts for a 1Gb NIC. So set the tunable + * mac_soft_ring_enable by default on this platform. + * mac_soft_ring_enable variable is defined in space.c and used by MAC + * module. This tunable in concert with mac_soft_ring_count (declared + * in mac.h) will configure the number of fanout soft rings for a link. */ -extern uint_t ip_threads_per_cpu; -extern boolean_t ip_squeue_soft_ring; +extern boolean_t mac_soft_ring_enable; void startup_platform(void) { - ip_squeue_soft_ring = B_TRUE; + mac_soft_ring_enable = B_TRUE; if (clock_tick_threshold == 0) clock_tick_threshold = SUN4V_CLOCK_TICK_THRESHOLD; if (clock_tick_ncpus == 0) diff --git a/usr/src/uts/sun4v/sys/vnet_res.h b/usr/src/uts/sun4v/sys/vnet_res.h index 035ad1328c..b5cd4472fb 100644 --- a/usr/src/uts/sun4v/sys/vnet_res.h +++ b/usr/src/uts/sun4v/sys/vnet_res.h @@ -27,12 +27,12 @@ #ifndef _VNET_RES_H #define _VNET_RES_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif +#include <sys/mac_provider.h> + /* * Vio network resource types. * VIO_NET_RES_LDC_SERVICE: diff --git a/usr/src/uts/sun4v/sys/vsw.h b/usr/src/uts/sun4v/sys/vsw.h index 069e26d60a..456480f909 100644 --- a/usr/src/uts/sun4v/sys/vsw.h +++ b/usr/src/uts/sun4v/sys/vsw.h @@ -40,6 +40,7 @@ extern "C" { #include <sys/vio_mailbox.h> #include <sys/vnet_common.h> #include <sys/ethernet.h> +#include <sys/mac_client.h> #include <sys/vio_util.h> #include <sys/vgen_stats.h> #include <sys/vsw_ldc.h> @@ -59,57 +60,6 @@ extern "C" { #define VSW_LOCALDEV 4 /* vsw configured as an eth interface */ /* - * Vsw queue -- largely modeled after squeue - * - * VSW_QUEUE_RUNNING, vqueue thread for queue is running. - * VSW_QUEUE_DRAINED, vqueue thread has drained current work and is exiting. - * VSW_QUEUE_STOP, request for the vqueue thread to stop. - * VSW_QUEUE_STOPPED, vqueue thread is not running. - */ -#define VSW_QUEUE_RUNNING 0x01 -#define VSW_QUEUE_DRAINED 0x02 -#define VSW_QUEUE_STOP 0x04 -#define VSW_QUEUE_STOPPED 0x08 - -typedef struct vsw_queue_s { - kmutex_t vq_lock; /* Lock, before using any member. */ - kcondvar_t vq_cv; /* Async threads block on. */ - uint32_t vq_state; /* State flags. */ - - mblk_t *vq_first; /* First mblk chain or NULL. */ - mblk_t *vq_last; /* Last mblk chain. */ - - processorid_t vq_bind; /* Process to bind to */ - kthread_t *vq_worker; /* Queue's thread */ -} vsw_queue_t; - -/* - * VSW MAC Ring Resources. - * MAC Ring resource is composed of this state structure and - * a kernel thread to perform the processing of the ring. - */ -typedef struct vsw_mac_ring_s { - uint32_t ring_state; - - mac_blank_t ring_blank; - void *ring_arg; - - vsw_queue_t *ring_vqp; - struct vsw *ring_vswp; -} vsw_mac_ring_t; - -/* - * Maximum Ring Resources. - */ -#define VSW_MAC_RX_RINGS 0x40 - -/* - * States for entry in ring table. - */ -#define VSW_MAC_RING_FREE 1 -#define VSW_MAC_RING_INUSE 2 - -/* * Number of hash chains in the multicast forwarding database. */ #define VSW_NCHAINS 8 @@ -139,6 +89,15 @@ typedef struct vsw_mac_ring_s { #define VSW_PRI_ETH_DEFINED(vswp) ((vswp)->pri_num_types != 0) /* + * vlan-id information. + */ +typedef struct vsw_vlanid { + uint16_t vl_vid; /* vlan-id */ + mac_unicast_handle_t vl_muh; /* mac unicast handle */ + boolean_t vl_set; /* set? */ +} vsw_vlanid_t; + +/* * vsw instance state information. 
*/ typedef struct vsw { @@ -147,9 +106,7 @@ typedef struct vsw { uint64_t regprop; /* "reg" property */ struct vsw *next; /* next in list */ char physname[LIFNAMSIZ]; /* phys-dev */ - uint8_t smode[NUM_SMODES]; /* switching mode */ - int smode_idx; /* curr pos in smode array */ - int smode_num; /* # of modes specified */ + uint8_t smode; /* switching mode */ kmutex_t swtmout_lock; /* setup switching tmout lock */ boolean_t swtmout_enabled; /* setup switching tmout on */ timeout_id_t swtmout_id; /* setup switching tmout id */ @@ -174,24 +131,16 @@ typedef struct vsw { vsw_port_t *, mac_resource_handle_t); /* mac layer */ - krwlock_t mac_rwlock; /* protect fields below */ + kmutex_t mac_lock; /* protect mh */ mac_handle_t mh; - mac_rx_handle_t mrh; - multiaddress_capab_t maddr; /* Multiple uni addr capable */ - const mac_txinfo_t *txinfo; /* MAC tx routine */ - boolean_t mstarted; /* Mac Started? */ - boolean_t mresources; /* Mac Resources cb? */ - - /* - * MAC Ring Resources. - */ - kmutex_t mac_ring_lock; /* Lock for the table. */ - uint32_t mac_ring_tbl_sz; - vsw_mac_ring_t *mac_ring_tbl; /* Mac ring table. */ - - kmutex_t hw_lock; /* sync access to HW */ + krwlock_t maccl_rwlock; /* protect fields below */ + mac_client_handle_t mch; /* mac client handle */ + mac_unicast_handle_t muh; /* mac unicast handle */ + boolean_t recfg_reqd; /* Reconfig of addrs needed */ - int promisc_cnt; + + /* mac layer switching flag */ + boolean_t mac_cl_switching; /* Machine Description updates */ mdeg_node_spec_t *inst_spec; @@ -204,8 +153,7 @@ typedef struct vsw { krwlock_t if_lockrw; uint8_t if_state; /* interface state */ - mac_addr_slot_t addr_slot; /* Unicast address slot */ - int addr_set; /* Addr set where */ + boolean_t addr_set; /* is addr set to HW */ /* multicast addresses when configured as eth interface */ kmutex_t mca_lock; /* multicast lock */ @@ -216,7 +164,7 @@ typedef struct vsw { vio_mblk_pool_t *pri_tx_vmp; /* tx priority mblk pool */ uint16_t default_vlan_id; /* default vlan id */ uint16_t pvid; /* port vlan id (untagged) */ - uint16_t *vids; /* vlan ids (tagged) */ + vsw_vlanid_t *vids; /* vlan ids (tagged) */ uint16_t nvids; /* # of vids */ uint32_t vids_size; /* size alloc'd for vids list */ diff --git a/usr/src/uts/sun4v/sys/vsw_hio.h b/usr/src/uts/sun4v/sys/vsw_hio.h index 70b79ea04e..1521d6cff9 100644 --- a/usr/src/uts/sun4v/sys/vsw_hio.h +++ b/usr/src/uts/sun4v/sys/vsw_hio.h @@ -55,10 +55,6 @@ typedef struct vsw_share { uint64_t vs_macaddr; /* Associated MAC addr */ uint64_t vs_cookie; /* Share Cookie from alloc_share */ - /* physdev's share related info */ - mac_share_handle_t vs_shdl; /* HIO share handle */ - mac_group_info_t vs_rxginfo; /* RX group info */ - uint64_t vs_gnum; /* RX group number */ } vsw_share_t; #define VSW_SHARE_FREE 0x0 @@ -68,11 +64,8 @@ typedef struct vsw_share { /* Hybrid related info */ typedef struct vsw_hio { - mac_capab_rings_t vh_rcapab; /* Rings capability data */ - mac_capab_share_t vh_scapab; /* Share capability data */ - vsw_share_t *vh_shares; /* Array of Shares */ uint32_t vh_num_shares; /* Number of shares available */ - + vsw_share_t *vh_shares; /* Array of Shares */ uint32_t vh_kstat_size; /* size for the whole kstats */ vsw_hio_kstats_t *vh_kstatsp; /* stats for vsw hio */ kstat_t *vh_ksp; /* kstats */ diff --git a/usr/src/uts/sun4v/sys/vsw_ldc.h b/usr/src/uts/sun4v/sys/vsw_ldc.h index 31344465f5..46d04fac10 100644 --- a/usr/src/uts/sun4v/sys/vsw_ldc.h +++ b/usr/src/uts/sun4v/sys/vsw_ldc.h @@ -362,10 +362,6 @@ typedef struct mcst_addr { 
#define VSW_PORT_DETACHING 0x2 /* In process of being detached */ #define VSW_PORT_DETACHABLE 0x4 /* Safe to detach */ -#define VSW_ADDR_UNSET 0x0 /* Addr not set */ -#define VSW_ADDR_HW 0x1 /* Addr programmed in HW */ -#define VSW_ADDR_PROMISC 0x2 /* Card in promisc to see addr */ - /* port information associated with a vsw */ typedef struct vsw_port { int p_instance; /* port instance */ @@ -382,20 +378,22 @@ typedef struct vsw_port { kmutex_t state_lock; kcondvar_t state_cv; + krwlock_t maccl_rwlock; /* protect fields below */ + mac_client_handle_t p_mch; /* mac client handle */ + mac_unicast_handle_t p_muh; /* mac unicast handle */ + kmutex_t mca_lock; /* multicast lock */ mcst_addr_t *mcap; /* list of multicast addrs */ - mac_addr_slot_t addr_slot; /* Unicast address slot */ - int addr_set; /* Addr set where */ + boolean_t addr_set; /* is addr set to HW */ /* * mac address of the port & connected device */ struct ether_addr p_macaddr; uint16_t pvid; /* port vlan id (untagged) */ - uint16_t *vids; /* vlan ids (tagged) */ + struct vsw_vlanid *vids; /* vlan ids (tagged) */ uint16_t nvids; /* # of vids */ - uint32_t vids_size; /* size alloc'd for vids list */ mod_hash_t *vlan_hashp; /* vlan hash table */ uint32_t vlan_nchains; /* # of vlan hash chains */ @@ -444,7 +442,7 @@ static struct ether_addr etherbroadcastaddr = { }; #define IS_BROADCAST(ehp) \ - (ether_cmp(&ehp->ether_dhost, &etherbroadcastaddr) == 0) + (bcmp(&ehp->ether_dhost, &etherbroadcastaddr, ETHERADDRL) == 0) #define IS_MULTICAST(ehp) \ ((ehp->ether_dhost.ether_addr_octet[0] & 01) == 1)
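IS_BROADCAST() now compares the raw six bytes with bcmp() instead of going through ether_cmp(), which keeps the header independent of mac-layer helpers. A small usage sketch of the two classification macros:

	/* classify a frame for tracing; returns a static label */
	static const char *
	example_classify(struct ether_header *ehp)
	{
		if (IS_BROADCAST(ehp))
			return ("broadcast");
		if (IS_MULTICAST(ehp))
			return ("multicast");
		return ("unicast");
	}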