diff options
| author | Venugopal Iyer <Venu.Iyer@Sun.COM> | 2009-02-17 01:31:30 -0800 |
|---|---|---|
| committer | Venugopal Iyer <Venu.Iyer@Sun.COM> | 2009-02-17 01:31:30 -0800 |
| commit | ae6aa22afeb444ae208c287e7227a4a7c877f17a (patch) | |
| tree | 744dffd8856e6a2a16544575ca8773771051dada | |
| parent | d02310705313ee2fcefee164a4b26d1fa85e9d22 (diff) | |
| download | illumos-gate-ae6aa22afeb444ae208c287e7227a4a7c877f17a.tar.gz | |
PSARC/2009/099 dladm show-usage modifications
6726676 flow should not be seen by flowadm show-usage after the flow has been removed by flowadm remove-flow
6766669 "dladm show-vnic -o" can't accept MACADDRESS
6773854 Per Tx ring flow control for UDP
6777547 mac_tx() should compute the hash if the passed hint is zero
6778557 nxge m_tx() should fanout to multiple rings for vnet scalability
6779356 sometimes packets are not classified to the correct flow
6783011 pre-existing subflows not initialized on a non-dls client when brought up
6786734 acctadm dladm_start_usagelog() calls need some work
6789760 mac perimeter deadlock due to dls_devnet_stat_update()
6789883 dladm show-link -s is adrift again.
6791099 mac_tx() frees the message but returns non-NULL cookie which causes panic
6791109 maxbw set on a link should not apply if this link is the underlying port of an aggregation
6791118 panic in mac_bcast_delete() unplumbing an IP interface
6791456 deleting last vnic interface causes bge interface to stop working
6791678 xvm guests don't communicate through vnics configured on vlan
6792164 race between mac_tx_is_flow_blocked() and mac_srs_group_teardown() could cause panic
6792546 panicked in bge_ring_tx()/freemsg() due to mp->b_next == NULL && mp->b_prev == NULL
6792555 panicked in mac_flow_walk_nolock() due to assertion failed: cnt == ft->ft_flow_count
6792871 multiple VLANs per MAC client cause hang in mac_flow_wait()
6792942 60% regression for Guest-to-Guest network throughput on snv106
6793278 the multicast addresses are not added to the aggregation port in certain scenarios
6793436 panic in mac_fini_macaddr() on mac_register() failure
6796850 SUNWcnetr postinstall script spews errors due to bad interface matching
6803378 need support for dls_bypass and rx fanout on non-ethernet media
48 files changed, 1933 insertions, 1202 deletions
diff --git a/usr/src/cmd/acctadm/main.c b/usr/src/cmd/acctadm/main.c index 580012ccc8..2c610bdc10 100644 --- a/usr/src/cmd/acctadm/main.c +++ b/usr/src/cmd/acctadm/main.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,6 +41,8 @@ #include "aconf.h" #include "res.h" +#define ACCTADM_NET_LOG_INTERVAL 20 + static const char USAGE[] = "\ Usage:\n\ acctadm [ {process | task | flow | net} ]\n\ @@ -126,6 +128,7 @@ main(int argc, char *argv[]) int optcnt = 0; int state; const char *fmri; /* FMRI for this instance */ + int err = 0; setup_privs(); @@ -309,10 +312,14 @@ main(int argc, char *argv[]) if (type & AC_NET) { (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG, NULL); - (void) dladm_stop_usagelog(dld_handle, + err = dladm_stop_usagelog(dld_handle, DLADM_LOGTYPE_FLOW); (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG, NULL); + if (err != DLADM_STATUS_OK) { + die(gettext("failed to stop logging network " + "information, error %d\n"), errno); + } } state = AC_OFF; @@ -356,18 +363,44 @@ main(int argc, char *argv[]) if (type & AC_NET) { (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG, NULL); - (void) dladm_stop_usagelog(dld_handle, - strncmp(disabled, "basic", strlen("basic")) - == 0 ? DLADM_LOGTYPE_LINK : - DLADM_LOGTYPE_FLOW); + err = dladm_stop_usagelog(dld_handle, + strcmp(disabled, "basic") == 0 ? + DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW); (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG, NULL); + if (err != DLADM_STATUS_OK) { + die(gettext("failed to stop logging " + "network information, error %d\n"), + errno); + } } str2buf(buf, disabled, AC_OFF, type); } - if (enabled) + if (enabled) { + /* + * Lets us get network logging started. + */ + if (type & AC_NET) { + /* + * Default logging interval for AC_NET is + * ACCTADM_NET_LOG_INTERVAL. 
+ */ + (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, + PRIV_SYS_DL_CONFIG, NULL); + err = dladm_start_usagelog(dld_handle, + strcmp(enabled, "basic") == 0 ? + DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW, + ACCTADM_NET_LOG_INTERVAL); + (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, + PRIV_SYS_DL_CONFIG, NULL); + if (err != DLADM_STATUS_OK) { + die(gettext("failed to start logging " + "network information, error %d\n"), + errno); + } + } str2buf(buf, enabled, AC_ON, type); - + } (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL); if (acctctl(type | AC_RES_SET, buf, AC_BUFSIZE) == -1) { free(buf); @@ -384,24 +417,6 @@ main(int argc, char *argv[]) if (aconf_set_string(AC_PROP_UNTRACKED, untracked) == -1) die(gettext("cannot update %s property\n"), AC_PROP_UNTRACKED); - /* - * We will enable net logging after turning it on so that - * it can immediately start writing log. - */ - if (type & AC_NET && enabled != NULL) { - /* - * Default logging interval for AC_NET is 20. - * XXX need to find the right place to - * configure it. - */ - (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, - PRIV_SYS_DL_CONFIG, NULL); - (void) dladm_start_usagelog(dld_handle, - strncmp(enabled, "basic", strlen("basic")) == 0 ? - DLADM_LOGTYPE_LINK : DLADM_LOGTYPE_FLOW, 20); - (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, - PRIV_SYS_DL_CONFIG, NULL); - } free(tracked); free(untracked); free(buf); @@ -445,10 +460,14 @@ main(int argc, char *argv[]) if (type & AC_NET) { (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG, NULL); - (void) dladm_stop_usagelog(dld_handle, + err = dladm_stop_usagelog(dld_handle, DLADM_LOGTYPE_FLOW); (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, PRIV_SYS_DL_CONFIG, NULL); + if (err != DLADM_STATUS_OK) { + die(gettext("failed to stop logging " + "network information, error %d\n"), errno); + } } state = AC_OFF; @@ -468,6 +487,26 @@ main(int argc, char *argv[]) /* * Enable accounting */ + + /* + * Let's get network logging started. 
+ */ + if (type & AC_NET) { + /* + * Default logging interval for AC_NET is + * ACCTADM_NET_LOG_INTERVAL. + */ + (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, + PRIV_SYS_DL_CONFIG, NULL); + err = dladm_start_usagelog(dld_handle, + DLADM_LOGTYPE_FLOW, ACCTADM_NET_LOG_INTERVAL); + (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, + PRIV_SYS_DL_CONFIG, NULL); + if (err != DLADM_STATUS_OK) { + die(gettext("failed to start logging " + "network information, error %d\n"), errno); + } + } state = AC_ON; (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, PRIV_SYS_ACCT, NULL); @@ -480,18 +519,6 @@ main(int argc, char *argv[]) die(gettext("cannot update %s property\n"), AC_PROP_STATE); modified++; - if (type & AC_NET) { - /* - * Default logging interval for AC_NET is 20, - * XXX need to find the right place to configure it. - */ - (void) priv_set(PRIV_ON, PRIV_EFFECTIVE, - PRIV_SYS_DL_CONFIG, NULL); - (void) dladm_start_usagelog(dld_handle, - DLADM_LOGTYPE_FLOW, 20); - (void) priv_set(PRIV_OFF, PRIV_EFFECTIVE, - PRIV_SYS_DL_CONFIG, NULL); - } } (void) priv_set(PRIV_OFF, PRIV_PERMITTED, PRIV_SYS_ACCT, NULL); diff --git a/usr/src/cmd/dladm/dladm.c b/usr/src/cmd/dladm/dladm.c index f122f02716..7bba0420bd 100644 --- a/usr/src/cmd/dladm/dladm.c +++ b/usr/src/cmd/dladm/dladm.c @@ -252,6 +252,7 @@ typedef struct show_usage_state_s { boolean_t us_parseable; boolean_t us_printheader; boolean_t us_first; + boolean_t us_showall; print_state_t us_print; } show_usage_state_t; @@ -392,7 +393,7 @@ static cmd_t cmds[] = { { "show-etherstub", do_show_etherstub, " show-etherstub [-t] [<link>]\n" }, { "show-usage", do_show_usage, - " show-usage [-d|-p -F <format>] " + " show-usage [-a] [-d | -F <format>] " "[-s <DD/MM/YYYY,HH:MM:SS>]\n" "\t\t [-e <DD/MM/YYYY,HH:MM:SS>] -f <logfile> [<link>]" } }; @@ -480,6 +481,14 @@ static const struct option etherstub_lopts[] = { { 0, 0, 0, 0 } }; +static const struct option usage_opts[] = { + {"file", required_argument, 0, 'f' }, + {"format", required_argument, 0, 'F' }, + 
{"start", required_argument, 0, 's' }, + {"stop", required_argument, 0, 'e' }, + { 0, 0, 0, 0 } +}; + /* * structures for 'dladm show-ether' */ @@ -527,26 +536,33 @@ typedef struct print_ether_state { * structures for 'dladm show-link -s' (print statistics) */ typedef enum { - DEVS_LINK, - DEVS_IPKTS, - DEVS_RBYTES, - DEVS_IERRORS, - DEVS_OPKTS, - DEVS_OBYTES, - DEVS_OERRORS -} devs_field_index_t; - -static print_field_t devs_fields[] = { + LINK_S_LINK, + LINK_S_IPKTS, + LINK_S_RBYTES, + LINK_S_IERRORS, + LINK_S_OPKTS, + LINK_S_OBYTES, + LINK_S_OERRORS +} link_s_field_index_t; + +static print_field_t link_s_fields[] = { /* name, header, field width, index, cmdtype */ -{ "link", "LINK", 15, DEVS_LINK, CMD_TYPE_ANY}, -{ "ipackets", "IPACKETS", 10, DEVS_IPKTS, CMD_TYPE_ANY}, -{ "rbytes", "RBYTES", 8, DEVS_RBYTES, CMD_TYPE_ANY}, -{ "ierrors", "IERRORS", 10, DEVS_IERRORS, CMD_TYPE_ANY}, -{ "opackets", "OPACKETS", 12, DEVS_OPKTS, CMD_TYPE_ANY}, -{ "obytes", "OBYTES", 12, DEVS_OBYTES, CMD_TYPE_ANY}, -{ "oerrors", "OERRORS", 8, DEVS_OERRORS, CMD_TYPE_ANY}} +{ "link", "LINK", 15, LINK_S_LINK, CMD_TYPE_ANY}, +{ "ipackets", "IPACKETS", 10, LINK_S_IPKTS, CMD_TYPE_ANY}, +{ "rbytes", "RBYTES", 8, LINK_S_RBYTES, CMD_TYPE_ANY}, +{ "ierrors", "IERRORS", 10, LINK_S_IERRORS, CMD_TYPE_ANY}, +{ "opackets", "OPACKETS", 12, LINK_S_OPKTS, CMD_TYPE_ANY}, +{ "obytes", "OBYTES", 12, LINK_S_OBYTES, CMD_TYPE_ANY}, +{ "oerrors", "OERRORS", 8, LINK_S_OERRORS, CMD_TYPE_ANY}} ; -#define DEVS_MAX_FIELDS (sizeof (devs_fields) / sizeof (print_field_t)) +#define LINK_S_MAX_FIELDS \ + (sizeof (link_s_fields) / sizeof (print_field_t)) + +typedef struct link_args_s { + char *link_s_link; + pktsum_t *link_s_psum; +} link_args_t; +static char *print_link_stats(print_field_t *, void *); /* * buffer used by print functions for show-{link,phys,vlan} commands. 
@@ -925,7 +941,7 @@ static print_field_t vnic_fields[] = { offsetof(vnic_fields_buf_t, vnic_over), CMD_TYPE_ANY}, { "speed", "SPEED", 6, offsetof(vnic_fields_buf_t, vnic_speed), CMD_TYPE_ANY}, -{ "macaddr", "MACADDRESS", 20, +{ "macaddress", "MACADDRESS", 20, offsetof(vnic_fields_buf_t, vnic_macaddr), CMD_TYPE_ANY}, { "macaddrtype", "MACADDRTYPE", 19, offsetof(vnic_fields_buf_t, vnic_macaddrtype), CMD_TYPE_ANY}, @@ -1077,9 +1093,24 @@ main(int argc, char *argv[]) static int show_usage_date(dladm_usage_t *usage, void *arg) { + show_usage_state_t *state = (show_usage_state_t *)arg; + time_t stime; + char timebuf[20]; + dladm_status_t status; + uint32_t flags; - time_t stime; - char timebuf[20]; + /* + * Only show usage information for existing links unless '-a' + * is specified. + */ + if (!state->us_showall) { + if ((status = dladm_name2info(handle, usage->du_name, + NULL, &flags, NULL, NULL)) != DLADM_STATUS_OK) { + return (status); + } + if ((flags & DLADM_OPT_ACTIVE) == 0) + return (DLADM_STATUS_LINKINVAL); + } stime = usage->du_stime; (void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y", @@ -1097,6 +1128,21 @@ show_usage_time(dladm_usage_t *usage, void *arg) usage_l_fields_buf_t ubuf; time_t time; double bw; + dladm_status_t status; + uint32_t flags; + + /* + * Only show usage information for existing links unless '-a' + * is specified. + */ + if (!state->us_showall) { + if ((status = dladm_name2info(handle, usage->du_name, + NULL, &flags, NULL, NULL)) != DLADM_STATUS_OK) { + return (status); + } + if ((flags & DLADM_OPT_ACTIVE) == 0) + return (DLADM_STATUS_LINKINVAL); + } if (state->us_plot) { if (!state->us_printheader) { @@ -1164,6 +1210,21 @@ show_usage_res(dladm_usage_t *usage, void *arg) show_usage_state_t *state = (show_usage_state_t *)arg; char buf[DLADM_STRSIZE]; usage_fields_buf_t ubuf; + dladm_status_t status; + uint32_t flags; + + /* + * Only show usage information for existing links unless '-a' + * is specified. 
+ */ + if (!state->us_showall) { + if ((status = dladm_name2info(handle, usage->du_name, + NULL, &flags, NULL, NULL)) != DLADM_STATUS_OK) { + return (status); + } + if ((flags & DLADM_OPT_ACTIVE) == 0) + return (DLADM_STATUS_LINKINVAL); + } bzero(&ubuf, sizeof (ubuf)); @@ -1210,7 +1271,6 @@ do_show_usage(int argc, char *argv[], const char *use) int opt; dladm_status_t status; boolean_t d_arg = B_FALSE; - boolean_t p_arg = B_FALSE; char *stime = NULL; char *etime = NULL; char *resource = NULL; @@ -1232,13 +1292,14 @@ do_show_usage(int argc, char *argv[], const char *use) state.us_plot = B_FALSE; state.us_first = B_TRUE; - while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) { + while ((opt = getopt_long(argc, argv, "das:e:o:f:F:", + usage_opts, NULL)) != -1) { switch (opt) { case 'd': d_arg = B_TRUE; break; - case 'p': - state.us_plot = p_arg = B_TRUE; + case 'a': + state.us_showall = B_TRUE; break; case 'f': file = optarg; @@ -1254,7 +1315,7 @@ do_show_usage(int argc, char *argv[], const char *use) fields_str = optarg; break; case 'F': - F_arg = B_TRUE; + state.us_plot = F_arg = B_TRUE; formatspec_str = optarg; break; default: @@ -1267,7 +1328,15 @@ do_show_usage(int argc, char *argv[], const char *use) die("show-usage requires a file"); if (optind == (argc-1)) { + uint32_t flags; + resource = argv[optind]; + if (!state.us_showall && + (((status = dladm_name2info(handle, resource, NULL, &flags, + NULL, NULL)) != DLADM_STATUS_OK) || + ((flags & DLADM_OPT_ACTIVE) == 0))) { + die("invalid link: '%s'", resource); + } } if (resource == NULL && stime == NULL && etime == NULL) { @@ -1289,11 +1358,8 @@ do_show_usage(int argc, char *argv[], const char *use) state.us_print.ps_fields = fields; state.us_print.ps_nfields = nfields; - if (p_arg && d_arg) - die("plot and date options are incompatible"); - - if (p_arg && !F_arg) - die("specify format speicifier: -F <format>"); + if (F_arg && d_arg) + die("incompatible -d and -F options"); if (F_arg && 
valid_formatspec(formatspec_str) == B_FALSE) die("Format specifier %s not supported", formatspec_str); @@ -1303,7 +1369,7 @@ do_show_usage(int argc, char *argv[], const char *use) status = dladm_usage_dates(show_usage_date, DLADM_LOGTYPE_LINK, file, resource, &state); } else if (resource == NULL && stime == NULL && etime == NULL && - !p_arg) { + !F_arg) { /* Print summary */ status = dladm_usage_summary(show_usage_res, DLADM_LOGTYPE_LINK, file, &state); @@ -2320,6 +2386,48 @@ done: return (DLADM_WALK_CONTINUE); } +static char * +print_link_stats(print_field_t *pf, void *arg) +{ + link_args_t *largs = arg; + pktsum_t *diff_stats = largs->link_s_psum; + static char buf[DLADM_STRSIZE]; + + switch (pf->pf_index) { + case LINK_S_LINK: + (void) snprintf(buf, sizeof (buf), "%s", largs->link_s_link); + break; + case LINK_S_IPKTS: + (void) snprintf(buf, sizeof (buf), "%llu", + diff_stats->ipackets); + break; + case LINK_S_RBYTES: + (void) snprintf(buf, sizeof (buf), "%llu", + diff_stats->rbytes); + break; + case LINK_S_IERRORS: + (void) snprintf(buf, sizeof (buf), "%u", + diff_stats->ierrors); + break; + case LINK_S_OPKTS: + (void) snprintf(buf, sizeof (buf), "%llu", + diff_stats->opackets); + break; + case LINK_S_OBYTES: + (void) snprintf(buf, sizeof (buf), "%llu", + diff_stats->obytes); + break; + case LINK_S_OERRORS: + (void) snprintf(buf, sizeof (buf), "%u", + diff_stats->oerrors); + break; + default: + die("invalid input"); + break; + } + return (buf); +} + static int show_link_stats(dladm_handle_t dh, datalink_id_t linkid, void *arg) { @@ -2328,6 +2436,7 @@ show_link_stats(dladm_handle_t dh, datalink_id_t linkid, void *arg) show_state_t *state = (show_state_t *)arg; pktsum_t stats, diff_stats; dladm_phys_attr_t dpa; + link_args_t largs; if (state->ls_firstonly) { if (state->ls_donefirst) @@ -2356,13 +2465,10 @@ show_link_stats(dladm_handle_t dh, datalink_id_t linkid, void *arg) } dladm_stats_diff(&diff_stats, &stats, &state->ls_prevstats); - (void) printf("%-12s", 
link); - (void) printf("%-10llu", diff_stats.ipackets); - (void) printf("%-12llu", diff_stats.rbytes); - (void) printf("%-8llu", diff_stats.ierrors); - (void) printf("%-10llu", diff_stats.opackets); - (void) printf("%-12llu", diff_stats.obytes); - (void) printf("%-8llu\n", diff_stats.oerrors); + largs.link_s_link = link; + largs.link_s_psum = &diff_stats; + dladm_print_output(&state->ls_print, state->ls_parseable, + print_link_stats, &largs); state->ls_prevstats = stats; return (DLADM_WALK_CONTINUE); @@ -4200,7 +4306,7 @@ do_show_vnic_common(int argc, char *argv[], const char *use, int pfmax; uint_t nfields; char *all_fields = - "link,over,speed,macaddr,macaddrtype,vid"; + "link,over,speed,macaddress,macaddrtype,vid"; char *all_e_fields = "link"; @@ -4408,8 +4514,8 @@ link_stats(datalink_id_t linkid, uint_t interval, char *fields_str, print_field_t **fields; uint_t nfields; - fields = parse_output_fields(fields_str, devs_fields, DEVS_MAX_FIELDS, - CMD_TYPE_ANY, &nfields); + fields = parse_output_fields(fields_str, link_s_fields, + LINK_S_MAX_FIELDS, CMD_TYPE_ANY, &nfields); if (fields == NULL) { die("invalid field(s) specified"); return; diff --git a/usr/src/cmd/flowadm/flowadm.c b/usr/src/cmd/flowadm/flowadm.c index 091b175869..900ceab40d 100644 --- a/usr/src/cmd/flowadm/flowadm.c +++ b/usr/src/cmd/flowadm/flowadm.c @@ -83,6 +83,7 @@ typedef struct show_usage_state_s { boolean_t us_parseable; boolean_t us_printheader; boolean_t us_first; + boolean_t us_showall; print_state_t us_print; } show_usage_state_t; @@ -105,6 +106,22 @@ static char *flowadm_print_field(print_field_t *, void *); #define MAX_FIELD_LEN 32 +typedef struct show_flow_state { + boolean_t fs_firstonly; + boolean_t fs_donefirst; + pktsum_t fs_prevstats; + uint32_t fs_flags; + dladm_status_t fs_status; + print_state_t fs_print; + const char *fs_flow; + const char *fs_link; + boolean_t fs_parseable; + boolean_t fs_printheader; + boolean_t fs_persist; + boolean_t fs_stats; + uint64_t fs_mask; +} 
show_flow_state_t; + typedef void cmdfunc_t(int, char **); static cmdfunc_t do_add_flow, do_remove_flow, do_init_flow, do_show_flow; @@ -114,7 +131,8 @@ static cmdfunc_t do_show_usage; static int show_flow(dladm_flow_attr_t *, void *); static int show_flows_onelink(dladm_handle_t, datalink_id_t, void *); -static void flow_stats(const char *, datalink_id_t, uint_t); +static void flow_stats(const char *, datalink_id_t, uint_t, char *, + show_flow_state_t *); static void get_flow_stats(const char *, pktsum_t *); static int show_flow_stats(dladm_flow_attr_t *, void *); static int show_link_flow_stats(dladm_handle_t, datalink_id_t, void *); @@ -168,26 +186,6 @@ static const struct option prop_longopts[] = { }; /* - * structures for 'flowadm show-flow' - */ - -typedef struct show_flow_state { - boolean_t fs_firstonly; - boolean_t fs_donefirst; - pktsum_t fs_prevstats; - uint32_t fs_flags; - dladm_status_t fs_status; - print_state_t fs_print; - const char *fs_flow; - const char *fs_link; - boolean_t fs_parseable; - boolean_t fs_printheader; - boolean_t fs_persist; - boolean_t fs_stats; - uint64_t fs_mask; -} show_flow_state_t; - -/* * structures for 'flowadm remove-flow' */ @@ -197,15 +195,6 @@ typedef struct remove_flow_state { dladm_status_t fs_status; } remove_flow_state_t; -typedef struct flow_args_s { - const char *fa_link; - int fa_attrno; /* -1 indicates flow itself */ - uint64_t fa_mask; - dladm_flow_attr_t *fa_finfop; - dladm_status_t *fa_status; - boolean_t fa_parseable; -} flow_args_t; - #define PROTO_MAXSTR_LEN 7 #define PORT_MAXSTR_LEN 6 #define DSFIELD_MAXSTR_LEN 10 @@ -288,9 +277,40 @@ typedef struct flowprop_args_s { char *fs_propname; char *fs_flowname; } flowprop_args_t; +/* + * structures for 'flowadm show-flow -s' (print statistics) + */ +typedef enum { + FLOW_S_FLOW, + FLOW_S_IPKTS, + FLOW_S_RBYTES, + FLOW_S_IERRORS, + FLOW_S_OPKTS, + FLOW_S_OBYTES, + FLOW_S_OERRORS +} flow_s_field_index_t; + +static print_field_t flow_s_fields[] = { +/* name, header, 
field width, index, cmdtype */ +{ "flow", "FLOW", 15, FLOW_S_FLOW, CMD_TYPE_ANY}, +{ "ipackets", "IPACKETS", 10, FLOW_S_IPKTS, CMD_TYPE_ANY}, +{ "rbytes", "RBYTES", 8, FLOW_S_RBYTES, CMD_TYPE_ANY}, +{ "ierrors", "IERRORS", 10, FLOW_S_IERRORS, CMD_TYPE_ANY}, +{ "opackets", "OPACKETS", 12, FLOW_S_OPKTS, CMD_TYPE_ANY}, +{ "obytes", "OBYTES", 12, FLOW_S_OBYTES, CMD_TYPE_ANY}, +{ "oerrors", "OERRORS", 8, FLOW_S_OERRORS, CMD_TYPE_ANY}} +; +#define FLOW_S_MAX_FIELDS \ + (sizeof (flow_s_fields) / sizeof (print_field_t)) + +typedef struct flow_args_s { + char *flow_s_flow; + pktsum_t *flow_s_psum; +} flow_args_t; +static char *print_flow_stats(print_field_t *, void *); /* - * structures for 'flow show-usage' + * structures for 'flowadm show-usage' */ typedef struct usage_fields_buf_s { @@ -392,7 +412,7 @@ usage(void) " reset-flowprop [-t] [-p <prop>,...] <flow>\n" " show-flowprop [-cP] [-l <link>] [-p <prop>,...] " "[<flow>]\n\n" - " show-usage [-d|-p -F <format>] " + " show-usage [-a] [-d | -F <format>] " "[-s <DD/MM/YYYY,HH:MM:SS>]\n" "\t\t [-e <DD/MM/YYYY,HH:MM:SS>] -f <logfile> [<flow>]\n")); @@ -476,9 +496,20 @@ do_init_flow(int argc, char *argv[]) static int show_usage_date(dladm_usage_t *usage, void *arg) { + show_usage_state_t *state = (show_usage_state_t *)arg; + time_t stime; + char timebuf[20]; + dladm_flow_attr_t attr; + dladm_status_t status; - time_t stime; - char timebuf[20]; + /* + * Only show usage information for existing flows unless '-a' + * is specified. + */ + if (!state->us_showall && ((status = dladm_flow_info(handle, + usage->du_name, &attr)) != DLADM_STATUS_OK)) { + return (status); + } stime = usage->du_stime; (void) strftime(timebuf, sizeof (timebuf), "%m/%d/%Y", @@ -496,6 +527,17 @@ show_usage_time(dladm_usage_t *usage, void *arg) usage_l_fields_buf_t ubuf; time_t time; double bw; + dladm_flow_attr_t attr; + dladm_status_t status; + + /* + * Only show usage information for existing flows unless '-a' + * is specified. 
+ */ + if (!state->us_showall && ((status = dladm_flow_info(handle, + usage->du_name, &attr)) != DLADM_STATUS_OK)) { + return (status); + } if (state->us_plot) { if (!state->us_printheader) { @@ -563,6 +605,17 @@ show_usage_res(dladm_usage_t *usage, void *arg) show_usage_state_t *state = (show_usage_state_t *)arg; char buf[DLADM_STRSIZE]; usage_fields_buf_t ubuf; + dladm_flow_attr_t attr; + dladm_status_t status; + + /* + * Only show usage information for existing flows unless '-a' + * is specified. + */ + if (!state->us_showall && ((status = dladm_flow_info(handle, + usage->du_name, &attr)) != DLADM_STATUS_OK)) { + return (status); + } bzero(&ubuf, sizeof (ubuf)); @@ -608,7 +661,6 @@ do_show_usage(int argc, char *argv[]) int opt; dladm_status_t status; boolean_t d_arg = B_FALSE; - boolean_t p_arg = B_FALSE; char *stime = NULL; char *etime = NULL; char *resource = NULL; @@ -630,13 +682,13 @@ do_show_usage(int argc, char *argv[]) state.us_plot = B_FALSE; state.us_first = B_TRUE; - while ((opt = getopt(argc, argv, "dps:e:o:f:F:")) != -1) { + while ((opt = getopt(argc, argv, "das:e:o:f:F:")) != -1) { switch (opt) { case 'd': d_arg = B_TRUE; break; - case 'p': - state.us_plot = p_arg = B_TRUE; + case 'a': + state.us_showall = B_TRUE; break; case 'f': file = optarg; @@ -652,7 +704,7 @@ do_show_usage(int argc, char *argv[]) fields_str = optarg; break; case 'F': - F_arg = B_TRUE; + state.us_plot = F_arg = B_TRUE; formatspec_str = optarg; break; default: @@ -664,6 +716,13 @@ do_show_usage(int argc, char *argv[]) die("show-usage requires a file"); if (optind == (argc-1)) { + dladm_flow_attr_t attr; + + if (!state.us_showall && + dladm_flow_info(handle, resource, &attr) != + DLADM_STATUS_OK) { + die("invalid flow: '%s'", resource); + } resource = argv[optind]; } @@ -686,11 +745,8 @@ do_show_usage(int argc, char *argv[]) state.us_print.ps_fields = fields; state.us_print.ps_nfields = nfields; - if (p_arg && d_arg) - die("plot and date options are incompatible"); - - if (p_arg 
&& !F_arg) - die("specify format speicifier: -F <format>"); + if (F_arg && d_arg) + die("incompatible -d and -F options"); if (F_arg && valid_formatspec(formatspec_str) == B_FALSE) die("Format specifier %s not supported", formatspec_str); @@ -700,7 +756,7 @@ do_show_usage(int argc, char *argv[]) status = dladm_usage_dates(show_usage_date, DLADM_LOGTYPE_FLOW, file, resource, &state); } else if (resource == NULL && stime == NULL && etime == NULL && - !p_arg) { + !F_arg) { /* Print summary */ status = dladm_usage_summary(show_usage_res, DLADM_LOGTYPE_FLOW, file, &state); @@ -997,13 +1053,56 @@ get_flow_stats(const char *flowname, pktsum_t *stats) (void) kstat_close(kcp); } + +static char * +print_flow_stats(print_field_t *pf, void *arg) +{ + flow_args_t *fargs = arg; + pktsum_t *diff_stats = fargs->flow_s_psum; + static char buf[DLADM_STRSIZE]; + + switch (pf->pf_index) { + case FLOW_S_FLOW: + (void) snprintf(buf, sizeof (buf), "%s", fargs->flow_s_flow); + break; + case FLOW_S_IPKTS: + (void) snprintf(buf, sizeof (buf), "%llu", + diff_stats->ipackets); + break; + case FLOW_S_RBYTES: + (void) snprintf(buf, sizeof (buf), "%llu", + diff_stats->rbytes); + break; + case FLOW_S_IERRORS: + (void) snprintf(buf, sizeof (buf), "%u", + diff_stats->ierrors); + break; + case FLOW_S_OPKTS: + (void) snprintf(buf, sizeof (buf), "%llu", + diff_stats->opackets); + break; + case FLOW_S_OBYTES: + (void) snprintf(buf, sizeof (buf), "%llu", + diff_stats->obytes); + break; + case FLOW_S_OERRORS: + (void) snprintf(buf, sizeof (buf), "%u", + diff_stats->oerrors); + break; + default: + die("invalid input"); + break; + } + return (buf); +} /* ARGSUSED */ static int show_flow_stats(dladm_flow_attr_t *attr, void *arg) { - show_flow_state_t *state = (show_flow_state_t *)arg; - const char *name = attr->fa_flowname; - pktsum_t stats, diff_stats; + show_flow_state_t *state = (show_flow_state_t *)arg; + char *name = attr->fa_flowname; + pktsum_t stats, diff_stats; + flow_args_t fargs; if 
(state->fs_firstonly) { if (state->fs_donefirst) @@ -1016,13 +1115,10 @@ show_flow_stats(dladm_flow_attr_t *attr, void *arg) get_flow_stats(name, &stats); dladm_stats_diff(&diff_stats, &stats, &state->fs_prevstats); - (void) printf("%-12s", name); - (void) printf("%-10llu", diff_stats.ipackets); - (void) printf("%-12llu", diff_stats.rbytes); - (void) printf("%-8llu", diff_stats.ierrors); - (void) printf("%-10llu", diff_stats.opackets); - (void) printf("%-12llu", diff_stats.obytes); - (void) printf("%-8llu\n", diff_stats.oerrors); + fargs.flow_s_flow = name; + fargs.flow_s_psum = &diff_stats; + flowadm_print_output(&state->fs_print, state->fs_parseable, + print_flow_stats, &fargs); state->fs_prevstats = stats; @@ -1046,45 +1142,52 @@ show_link_flow_stats(dladm_handle_t dh, datalink_id_t linkid, void * arg) /* ARGSUSED */ static void -flow_stats(const char *flow, datalink_id_t linkid, uint_t interval) +flow_stats(const char *flow, datalink_id_t linkid, uint_t interval, + char *fields_str, show_flow_state_t *state) { - show_flow_state_t state; dladm_flow_attr_t attr; + print_field_t **fields; + uint_t nfields; + + fields = parse_output_fields(fields_str, flow_s_fields, + FLOW_S_MAX_FIELDS, CMD_TYPE_ANY, &nfields); + if (fields == NULL) { + die("invalid field(s) specified"); + return; + } + + state->fs_print.ps_fields = fields; + state->fs_print.ps_nfields = nfields; if (flow != NULL && dladm_flow_info(handle, flow, &attr) != DLADM_STATUS_OK) die("invalid flow %s", flow); - bzero(&state, sizeof (state)); - /* * If an interval is specified, continuously show the stats * for only the first flow. 
*/ - state.fs_firstonly = (interval != 0); + state->fs_firstonly = (interval != 0); + if (!state->fs_parseable) + print_header(&state->fs_print); for (;;) { - if (!state.fs_donefirst) - (void) printf("%-12s%-10s%-12s%-8s%-10s%-12s%-8s\n", - "FLOW", "IPACKETS", "RBYTES", "IERRORS", - "OPACKETS", "OBYTES", "OERRORS"); - - state.fs_donefirst = B_FALSE; + state->fs_donefirst = B_FALSE; /* Show stats for named flow */ if (flow != NULL) { - state.fs_flow = flow; - (void) show_flow_stats(&attr, &state); + state->fs_flow = flow; + (void) show_flow_stats(&attr, state); /* Show all stats on a link */ } else if (linkid != DATALINK_INVALID_LINKID) { (void) dladm_walk_flow(show_flow_stats, handle, linkid, - &state, B_FALSE); + state, B_FALSE); /* Show all stats by datalink */ } else { (void) dladm_walk_datalink_id(show_link_flow_stats, - handle, &state, DATALINK_CLASS_ALL, + handle, state, DATALINK_CLASS_ALL, DATALINK_ANY_MEDIATYPE, DLADM_OPT_ACTIVE); } @@ -1115,6 +1218,8 @@ do_show_flow(int argc, char *argv[]) uint_t nfields; char *all_fields = "flow,link,ipaddr,proto,port,dsfld"; + char *allstat_fields = + "flow,ipackets,rbytes,ierrors,opackets,obytes,oerrors"; bzero(&state, sizeof (state)); @@ -1173,11 +1278,6 @@ do_show_flow(int argc, char *argv[]) break; } } - if (state.fs_parseable && !o_arg) - die("-p requires -o"); - - if (state.fs_parseable && strcasecmp(fields_str, "all") == 0) - die("\"-o all\" is invalid with -p"); if (i_arg && !(s_arg || S_arg)) die("the -i option can be used only with -s or -S"); @@ -1193,19 +1293,23 @@ do_show_flow(int argc, char *argv[]) state.fs_flow = flowname; } - if (s_arg) { - flow_stats(state.fs_flow, linkid, interval); - return; - } - if (S_arg) { dladm_continuous(handle, linkid, state.fs_flow, interval, FLOW_REPORT); return; } - if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) - fields_str = all_fields; + if (!o_arg || (o_arg && strcasecmp(fields_str, "all") == 0)) { + if (s_arg) + fields_str = allstat_fields; + else + 
fields_str = all_fields; + } + + if (s_arg) { + flow_stats(state.fs_flow, linkid, interval, fields_str, &state); + return; + } fields = parse_output_fields(fields_str, flow_fields, FLOW_MAX_FIELDS, CMD_TYPE_ANY, &nfields); diff --git a/usr/src/lib/libdladm/common/usage.c b/usr/src/lib/libdladm/common/usage.c index 07ef7bbb22..cc241e6077 100644 --- a/usr/src/lib/libdladm/common/usage.c +++ b/usr/src/lib/libdladm/common/usage.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -580,12 +580,8 @@ update_pe(net_plot_entry_t *pe, net_stat_t *nns, int nentries, int count; for (count = 0; count < nentries; count++) { - if ((strlen(nns->net_stat_name) == - strlen(pe[count].net_pe_name)) && - (strncmp(pe[count].net_pe_name, nns->net_stat_name, - strlen(nns->net_stat_name)) == 0)) { + if (strcmp(pe[count].net_pe_name, nns->net_stat_name) == 0) break; - } } if (count == nentries) return; @@ -638,10 +634,8 @@ get_ne_from_table(net_table_t *net_table, char *name) for (count = 0; count < net_table->net_entries; count++) { nd = ne->net_entry_desc; - if ((strlen(name) == strlen(nd->net_desc_name)) && - (strncmp(name, nd->net_desc_name, strlen(name)) == 0)) { + if (strcmp(name, nd->net_desc_name) == 0) return (ne); - } ne = ne->net_entry_next; } return (NULL); @@ -657,13 +651,8 @@ get_ndesc(net_table_t *net_table, net_desc_t *nd) for (count = 0; count < net_table->net_entries; count++) { nd1 = ne->net_entry_desc; - if (strlen(nd1->net_desc_name) == strlen(nd->net_desc_name) && - strlen(nd1->net_desc_devname) == - strlen(nd->net_desc_devname) && - strncmp(nd1->net_desc_name, nd->net_desc_name, - strlen(nd1->net_desc_name)) == 0 && - strncmp(nd1->net_desc_devname, nd->net_desc_devname, - strlen(nd1->net_desc_devname)) == 0 && + if (strcmp(nd1->net_desc_name, nd->net_desc_name) == 0 && + strcmp(nd1->net_desc_devname, nd->net_desc_devname) == 0 && 
bcmp(nd1->net_desc_ehost, nd->net_desc_ehost, ETHERADDRL) == 0 && bcmp(nd1->net_desc_edest, nd->net_desc_edest, @@ -841,10 +830,8 @@ addto_time_list(net_table_t *net_table, net_time_entry_t *nt, NET_DATE_GREATER) { break; } - if ((strlen(ns1->net_stat_name) == - strlen(ns->net_stat_name)) && - (strncmp(ns1->net_stat_name, ns->net_stat_name, - strlen(ns1->net_stat_name)) == 0)) { + if (strcmp(ns1->net_stat_name, ns->net_stat_name) == + 0) { ntc->net_time_entry_next = end->net_time_entry_next; if (end->net_time_entry_next != NULL) { @@ -1084,9 +1071,7 @@ dladm_walk_usage_res(int (*fn)(dladm_usage_t *, void *), int logtype, nns = start->my_time_stat; /* Get to the resource we are interested in */ - if ((strlen(resource) != strlen(nns->net_stat_name)) || - (strncmp(resource, nns->net_stat_name, - strlen(nns->net_stat_name)) != 0)) { + if (strcmp(resource, nns->net_stat_name) != 0) { start = start->net_time_entry_next; continue; } @@ -1400,9 +1385,7 @@ dladm_usage_dates(int (*fn)(dladm_usage_t *, void *), int logtype, /* get to the resource we are interested in */ if (resource != NULL) { - if ((strlen(resource) != strlen(nns->net_stat_name)) || - (strncmp(resource, nns->net_stat_name, - strlen(nns->net_stat_name)) != 0)) { + if (strcmp(resource, nns->net_stat_name) != 0) { start = start->net_time_entry_next; continue; } @@ -1422,6 +1405,8 @@ dladm_usage_dates(int (*fn)(dladm_usage_t *, void *), int logtype, compare_date(&nns->net_stat_time, lasttime) == NET_DATE_GREATER) { bzero(&usage, sizeof (dladm_usage_t)); + (void) strlcpy(usage.du_name, nns->net_stat_name, + sizeof (usage.du_name)); bcopy(&nns->net_stat_ctime, &usage.du_stime, sizeof (usage.du_stime)); fn(&usage, arg); diff --git a/usr/src/pkgdefs/SUNWcnetr/postinstall b/usr/src/pkgdefs/SUNWcnetr/postinstall index 20d09c70ee..1cfa89902f 100644 --- a/usr/src/pkgdefs/SUNWcnetr/postinstall +++ b/usr/src/pkgdefs/SUNWcnetr/postinstall @@ -18,7 +18,7 @@ # # CDDL HEADER END # -# Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # @@ -109,9 +109,31 @@ if [ -f "${ORIG}" ]; then removef -f $PKGINST > /dev/null 2>&1 fi -# Convert hostname.xxx and zonecfg vlan entries -host_ifs=`ls -1 $rootprefix/etc | egrep -e '^hostname.|^hostname6.|^dhcp.'| \ - cut -d . -f2 | sort -u` +# Convert (hostname|hostname6|dhcp).xxx and zonecfg vlan entries + +for iftype in hostname hostname6 dhcp +do + interface_names="`echo $rootprefix/etc/$iftype.*[0-9] 2>/dev/null`" + if [ "$interface_names" != "$rootprefix/etc/$iftype.*[0-9]" ]; then + ORIGIFS="$IFS" + IFS="$IFS." + set -- $interface_names + IFS="$ORIGIFS" + while [ $# -ge 2 ]; do + shift + if [ $# -gt 1 -a \ + "$2" != "$rootprefix/etc/$iftype" ]; then + while [ $# -gt 1 -a \ + "$1" != "$rootprefix/etc/$iftype" ]; do + shift + done + else + host_ifs="$host_ifs $1" + shift + fi + done + fi +done zones=`zoneadm list -c | grep -v global` for zone in $zones diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index ffd13b6a26..debf200d36 100644 --- a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -4708,8 +4708,31 @@ then if [[ ! -f $root/sbin/flowadm ]] && \ archive_file_exists generic.sbin "sbin/flowadm"; then flowadm_status="new" - host_ifs=`ls -1 $rootprefix/etc | egrep -e \ - '^hostname.|^hostname6.|^dhcp.'| cut -d . -f2 | sort -u` + + for iftype in hostname hostname6 dhcp + do + interface_names="`echo /etc/$iftype.*[0-9] 2>/dev/null`" + if [ "$interface_names" != "/etc/iftype.*[0-9]" ]; then + ORIGIFS="$IFS" + IFS="$IFS." 
+ set -- $interface_names + IFS="$ORIGIFS" + while [ $# -ge 2 ]; do + shift + if [ $# -gt 1 -a \ + "$2" != "/etc/$iftype" ]; then + while [ $# -gt 1 -a \ + "$1" != "/etc/$iftype" ]; do + shift + done + else + host_ifs="$host_ifs $1" + shift + fi + done + fi + done + zones=`zoneadm list -c | grep -v global` for zone in $zones do diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index ff820814bf..a18c3d0f4c 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -1128,7 +1128,7 @@ typedef struct iulp_s { extern const iulp_t ire_uinfo_null; /* - * The conn drain list structure. + * The conn drain list structure (idl_t). * The list is protected by idl_lock. Each conn_t inserted in the list * points back at this idl_t using conn_idl. IP primes the draining of the * conns queued in these lists, by qenabling the 1st conn of each list. This @@ -1137,8 +1137,27 @@ extern const iulp_t ire_uinfo_null; * idl_lock protects all other members of idl_t and conn_drain_next * and conn_drain_prev of conn_t. The conn_lock protects IPCF_DRAIN_DISABLED * flag of the conn_t and conn_idl. + * + * The conn drain list, idl_t, itself is part of tx cookie list structure. + * A tx cookie list points to a blocked Tx ring and contains the list of + * all conn's that are blocked due to the flow-controlled Tx ring (via + * the idl drain list). Note that a link can have multiple Tx rings. The + * drain list will store the conn's blocked due to Tx ring being flow + * controlled. 
*/ -typedef struct idl_s { + +typedef uintptr_t ip_mac_tx_cookie_t; +typedef struct idl_s idl_t; +typedef struct idl_tx_list_s idl_tx_list_t; + +struct idl_tx_list_s { + ip_mac_tx_cookie_t txl_cookie; + kmutex_t txl_lock; /* Lock for this list */ + idl_t *txl_drain_list; + int txl_drain_index; +}; + +struct idl_s { conn_t *idl_conn; /* Head of drain list */ kmutex_t idl_lock; /* Lock for this list */ conn_t *idl_conn_draining; /* conn that is draining */ @@ -1146,7 +1165,8 @@ typedef struct idl_s { idl_repeat : 1, /* Last conn must re-enable */ /* drain list again */ idl_unused : 31; -} idl_t; + idl_tx_list_t *idl_itl; +}; #define CONN_DRAIN_LIST_LOCK(connp) (&((connp)->conn_idl->idl_lock)) /* @@ -3336,8 +3356,8 @@ extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *); extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *); extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *); -extern void conn_drain_insert(conn_t *connp); -extern int conn_ipsec_length(conn_t *connp); +extern void conn_drain_insert(conn_t *, idl_tx_list_t *); +extern int conn_ipsec_length(conn_t *); extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *, ire_t *); extern ipaddr_t ip_get_dst(ipha_t *); @@ -3587,13 +3607,16 @@ typedef enum { * we need to duplicate the definitions here because we cannot * include mac/dls header files here. 
*/ -typedef void *ip_mac_tx_cookie_t; -typedef void (*ip_mac_intr_disable_t)(void *); -typedef void (*ip_mac_intr_enable_t)(void *); -typedef void *(*ip_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); -typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); -typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *); -typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t); +typedef void (*ip_mac_intr_disable_t)(void *); +typedef void (*ip_mac_intr_enable_t)(void *); +typedef ip_mac_tx_cookie_t (*ip_dld_tx_t)(void *, mblk_t *, + uint64_t, uint16_t); +typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); +typedef void *(*ip_dld_callb_t)(void *, + ip_flow_enable_t, void *); +typedef boolean_t (*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t); +typedef int (*ip_capab_func_t)(void *, uint_t, + void *, uint_t); /* * POLLING README @@ -3640,6 +3663,8 @@ typedef struct ill_dld_direct_s { /* DLD provided driver Tx */ void *idd_tx_dh; /* dld_str_t *dsp */ ip_dld_callb_t idd_tx_cb_df; /* mac_tx_srs_notify */ void *idd_tx_cb_dh; /* mac_client_handle_t *mch */ + ip_dld_fctl_t idd_tx_fctl_df; /* mac_tx_is_flow_blocked */ + void *idd_tx_fctl_dh; /* mac_client_handle */ } ill_dld_direct_t; /* IP - DLD polling capability */ diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index b040d36c8a..116ae8ccec 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -451,29 +451,115 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * policy change may affect them. * * IP Flow control notes: + * --------------------- + * Non-TCP streams are flow controlled by IP. The way this is accomplished + * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When + * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into + * GLDv3. Otherwise packets are sent down to lower layers using STREAMS + * functions. * - * Non-TCP streams are flow controlled by IP. 
On the send side, if the packet - * cannot be sent down to the driver by IP, because of a canput failure, IP - * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq. - * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained - * when the flowcontrol condition subsides. Ultimately STREAMS backenables the - * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the - * first conn in the list of conn's to be drained. ip_wsrv on this conn drains - * the queued messages, and removes the conn from the drain list, if all - * messages were drained. It also qenables the next conn in the drain list to - * continue the drain process. + * Per Tx ring udp flow control: + * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in + * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true). + * + * The underlying link can expose multiple Tx rings to the GLDv3 mac layer. + * To achieve best performance, outgoing traffic needs to be fanned out among + * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send + * traffic out of the NIC and it takes a fanout hint. UDP connections pass + * the address of connp as fanout hint to mac_tx(). Under flow controlled + * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This + * cookie points to a specific Tx ring that is blocked. The cookie is used to + * hash into an entry of the ips_idl_tx_list[] array. Each idl_tx_list_t + * points to drain_lists (idl_t's). These drain lists will store the blocked UDP + * connp's. The drain list is not a single list but a configurable number of + * lists. + * + * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t + * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE + * which is equal to 128. This array in turn contains a pointer to idl_t[], + * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8).
The drain + * list will point to the list of connp's that are flow controlled. + * + * --------------- ------- ------- ------- + * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|--> + * | --------------- ------- ------- ------- + * | --------------- ------- ------- ------- + * |->|drain_list[1]|-->|connp|-->|connp|-->|connp|--> + * ---------------- | --------------- ------- ------- ------- + * |idl_tx_list[0]|->| --------------- ------- ------- ------- + * ---------------- |->|drain_list[2]|-->|connp|-->|connp|-->|connp|--> + * | --------------- ------- ------- ------- + * . . . . . + * | --------------- ------- ------- ------- + * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|--> + * --------------- ------- ------- ------- + * --------------- ------- ------- ------- + * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|--> + * | --------------- ------- ------- ------- + * | --------------- ------- ------- ------- + * ---------------- |->|drain_list[1]|-->|connp|-->|connp|-->|connp|--> + * |idl_tx_list[1]|->| --------------- ------- ------- ------- + * ---------------- | . . . . + * | --------------- ------- ------- ------- + * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|--> + * --------------- ------- ------- ------- + * ..... + * ---------------- + * |idl_tx_list[n]|-> ... + * ---------------- + * + * When mac_tx() returns a cookie, the cookie is used to hash into an + * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is + * called passing idl_tx_list. The connp gets inserted in a drain list + * pointed to by idl_tx_list. conn_drain_insert() asserts flow control for + * the sockets (non stream based) and sets QFULL condition for conn_wq. + * connp->conn_direct_blocked will be set to indicate the blocked + * condition. + * + * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved. + * A cookie is passed in the call to ill_flow_enable() that identifies the + * blocked Tx ring.
This cookie is used to get to the idl_tx_list that + * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t + * and goes through each of the drain lists (q)enabling the conn_wq of the + * first conn in each of the drain lists. This causes ip_wsrv to run for the + * conn. ip_wsrv drains the queued messages, and removes the conn from the + * drain list, if all messages were drained. It also qenables the next conn + * in the drain list to continue the drain process. * * In reality the drain list is not a single list, but a configurable number - * of lists. The ip_wsrv on the IP module, qenables the first conn in each - * list. If the ip_wsrv of the next qenabled conn does not run, because the + * of lists. conn_walk_drain() in the IP module qenables the first conn in + * each list. If the ip_wsrv of the next qenabled conn does not run, because + * the stream closes, ip_close takes responsibility to qenable the next conn + * in the drain list. conn_drain_insert and conn_drain_tail are the only + * functions that manipulate this drain list. conn_drain_insert is called in + * ip_wput context itself (as opposed to from ip_wsrv context for STREAMS + * case -- see below). The synchronization between drain insertion and flow + * control wakeup is handled by using idl_txl->txl_lock. + * + * Flow control using STREAMS: + * When ILL_DIRECT_CAPABLE() is not TRUE, the STREAMS flow control mechanism + * is used. On the send side, if the packet cannot be sent down to the + * driver by IP, because of a canput failure, IP does a putq on the conn_wq. + * This will cause ip_wsrv to run on the conn_wq. ip_wsrv in turn, inserts + * the conn in a list of conn's that need to be drained when the flow + * control condition subsides. The blocked connps are put in the first member + * of the ips_idl_tx_list[] array. Ultimately STREAMS backenables the ip_wsrv + * on the IP module. It calls conn_walk_drain() passing ips_idl_tx_list[0].
+ * ips_idl_tx_list[0] contains the drain lists of blocked conns. The + * conn_wq of the first conn in the drain lists is (q)enabled to run. + * ip_wsrv on this conn drains the queued messages, and removes the conn + * from the drain list, if all messages were drained. It also qenables the + * next conn in the drain list to continue the drain process. + * + * If the ip_wsrv of the next qenabled conn does not run, because the * stream closes, ip_close takes responsibility to qenable the next conn in * the drain list. The directly called ip_wput path always does a putq, if * it cannot putnext. Thus synchronization problems are handled between * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only * functions that manipulate this drain list. Furthermore conn_drain_insert - * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv - * running on a queue at any time. conn_drain_tail can be simultaneously called - * from both ip_wsrv and ip_close. + * is called only from ip_wsrv for the STREAMS case, and there can be only 1 + * instance of ip_wsrv running on a queue at any time. conn_drain_tail can + * be simultaneously called from both ip_wsrv and ip_close. 
* * IPQOS notes: * @@ -732,9 +818,11 @@ static void conn_drain_init(ip_stack_t *); static void conn_drain_fini(ip_stack_t *); static void conn_drain_tail(conn_t *connp, boolean_t closing); -static void conn_walk_drain(ip_stack_t *); +static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *); static void conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *, zoneid_t); +static void conn_setqfull(conn_t *); +static void conn_clrqfull(conn_t *); static void *ip_stack_init(netstackid_t stackid, netstack_t *ns); static void ip_stack_shutdown(netstackid_t stackid, void *arg); @@ -5372,6 +5460,7 @@ ip_modclose(ill_t *ill) ipif_t *ipif; queue_t *q = ill->ill_rq; ip_stack_t *ipst = ill->ill_ipst; + int i; /* * The punlink prior to this may have initiated a capability @@ -5463,7 +5552,9 @@ ip_modclose(ill_t *ill) * get unblocked. */ ip1dbg(("ip_wsrv: walking\n")); - conn_walk_drain(ipst); + for (i = 0; i < TX_FANOUT_SIZE; i++) { + conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]); + } mutex_enter(&ipst->ips_ip_mi_lock); mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill); @@ -13908,8 +13999,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES, ill, IPV4_VERSION, hlen, ipst); } - - ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC); + ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC, NULL); } return (ire); @@ -22341,8 +22431,13 @@ another:; if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { queue_t *dev_q = stq->q_next; - /* flow controlled */ - if (DEV_Q_FLOW_BLOCKED(dev_q)) + /* + * For DIRECT_CAPABLE, we do flow control at + * the time of sending the packet. See + * ILL_SEND_TX(). 
+ */ + if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && + (DEV_Q_FLOW_BLOCKED(dev_q))) goto blocked; if ((PROTO == IPPROTO_UDP) && @@ -22765,7 +22860,8 @@ broadcast: } else { queue_t *dev_q = stq->q_next; - if (DEV_Q_FLOW_BLOCKED(dev_q)) { + if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && + (DEV_Q_FLOW_BLOCKED(dev_q))) { blocked: ipha->ipha_ident = ip_hdr_included; /* @@ -22780,10 +22876,15 @@ blocked: connp != NULL && caller != IRE_SEND) { if (caller == IP_WSRV) { + idl_tx_list_t *idl_txl; + + idl_txl = + &ipst->ips_idl_tx_list[0]; connp->conn_did_putbq = 1; (void) putbq(connp->conn_wq, first_mp); - conn_drain_insert(connp); + conn_drain_insert(connp, + idl_txl); /* * This is the service thread, * and the queue is already @@ -24401,7 +24502,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, ipha_t *, ipha, ip6_t *, NULL, int, 0); - ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0); + ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0, connp); BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); UPDATE_MIB(out_ill->ill_ip_mib, @@ -24708,7 +24809,8 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, __dtrace_ipsr_ill_t *, out_ill, ipha_t *, ipha, ip6_t *, NULL, int, 0); - ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0); + ILL_SEND_TX(out_ill, ire, connp, + xmit_mp, 0, connp); BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); @@ -27921,7 +28023,8 @@ bad_src_route: static void conn_drain_init(ip_stack_t *ipst) { - int i; + int i, j; + idl_tx_list_t *itl_tx; ipst->ips_conn_drain_list_cnt = conn_drain_nthreads; @@ -27937,12 +28040,19 @@ conn_drain_init(ip_stack_t *ipst) ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8); } - ipst->ips_conn_drain_list = kmem_zalloc(ipst->ips_conn_drain_list_cnt * - sizeof (idl_t), KM_SLEEP); - - for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { - mutex_init(&ipst->ips_conn_drain_list[i].idl_lock, NULL, - MUTEX_DEFAULT, NULL); + 
ipst->ips_idl_tx_list = + kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP); + for (i = 0; i < TX_FANOUT_SIZE; i++) { + itl_tx = &ipst->ips_idl_tx_list[i]; + itl_tx->txl_drain_list = + kmem_zalloc(ipst->ips_conn_drain_list_cnt * + sizeof (idl_t), KM_SLEEP); + mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL); + for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) { + mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL, + MUTEX_DEFAULT, NULL); + itl_tx->txl_drain_list[j].idl_itl = itl_tx; + } } } @@ -27950,12 +28060,16 @@ static void conn_drain_fini(ip_stack_t *ipst) { int i; + idl_tx_list_t *itl_tx; - for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) - mutex_destroy(&ipst->ips_conn_drain_list[i].idl_lock); - kmem_free(ipst->ips_conn_drain_list, - ipst->ips_conn_drain_list_cnt * sizeof (idl_t)); - ipst->ips_conn_drain_list = NULL; + for (i = 0; i < TX_FANOUT_SIZE; i++) { + itl_tx = &ipst->ips_idl_tx_list[i]; + kmem_free(itl_tx->txl_drain_list, + ipst->ips_conn_drain_list_cnt * sizeof (idl_t)); + } + kmem_free(ipst->ips_idl_tx_list, + TX_FANOUT_SIZE * sizeof (idl_tx_list_t)); + ipst->ips_idl_tx_list = NULL; } /* @@ -27968,16 +28082,11 @@ conn_drain_fini(ip_stack_t *ipst) * the first conn in each of these drain lists. Each of these qenabled conns * in turn enables the next in the list, after it runs, or when it closes, * thus sustaining the drain process. - * - * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput -> - * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert - * running at any time, on a given conn, since there can be only 1 service proc - * running on a queue at any time. 
*/ void -conn_drain_insert(conn_t *connp) +conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list) { - idl_t *idl; + idl_t *idl = tx_list->txl_drain_list; uint_t index; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; @@ -27996,13 +28105,13 @@ conn_drain_insert(conn_t *connp) * Atomicity of load/stores is enough to make sure that * conn_drain_list_index is always within bounds. */ - index = ipst->ips_conn_drain_list_index; + index = tx_list->txl_drain_index; ASSERT(index < ipst->ips_conn_drain_list_cnt); - connp->conn_idl = &ipst->ips_conn_drain_list[index]; + connp->conn_idl = &tx_list->txl_drain_list[index]; index++; if (index == ipst->ips_conn_drain_list_cnt) index = 0; - ipst->ips_conn_drain_list_index = index; + tx_list->txl_drain_index = index; } mutex_exit(&connp->conn_lock); @@ -28044,8 +28153,12 @@ conn_drain_insert(conn_t *connp) * For non streams based sockets assert flow control. */ if (IPCL_IS_NONSTR(connp)) { + DTRACE_PROBE1(su__txq__full, conn_t *, connp); (*connp->conn_upcalls->su_txq_full) (connp->conn_upper_handle, B_TRUE); + } else { + conn_setqfull(connp); + noenable(connp->conn_wq); } mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); } @@ -28167,6 +28280,9 @@ conn_drain_tail(conn_t *connp, boolean_t closing) if (IPCL_IS_NONSTR(connp)) { (*connp->conn_upcalls->su_txq_full) (connp->conn_upper_handle, B_FALSE); + } else { + conn_clrqfull(connp); + enableok(connp->conn_wq); } } @@ -28194,6 +28310,8 @@ ip_wsrv(queue_t *q) if (q->q_next) { ill = (ill_t *)q->q_ptr; if (ill->ill_state_flags == 0) { + ip_stack_t *ipst = ill->ill_ipst; + /* * The device flow control has opened up. * Walk through conn drain lists and qenable the @@ -28202,7 +28320,7 @@ ip_wsrv(queue_t *q) * Hence the if check above. */ ip1dbg(("ip_wsrv: walking\n")); - conn_walk_drain(ill->ill_ipst); + conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]); } return; } @@ -28229,12 +28347,14 @@ ip_wsrv(queue_t *q) * (causing an infinite loop). 
*/ ASSERT(!connp->conn_did_putbq); + while ((q->q_first != NULL) && !connp->conn_did_putbq) { connp->conn_draining = 1; noenable(q); while ((mp = getq(q)) != NULL) { ASSERT(CONN_Q(q)); + DTRACE_PROBE1(ip__wsrv__ip__output, conn_t *, connp); ip_output(Q_TO_CONN(q), mp, q, IP_WSRV); if (connp->conn_did_putbq) { /* ip_wput did a putbq */ @@ -28253,12 +28373,23 @@ ip_wsrv(queue_t *q) */ connp->conn_draining = 0; enableok(q); - } /* Enable the next conn for draining */ conn_drain_tail(connp, B_FALSE); + /* + * conn_direct_blocked is used to indicate blocked + * condition for direct path (ILL_DIRECT_CAPABLE()). + * This is the only place where it is set without + * checking for ILL_DIRECT_CAPABLE() and setting it + * to 0 is ok even if it is not ILL_DIRECT_CAPABLE(). + */ + if (!connp->conn_did_putbq && connp->conn_direct_blocked) { + DTRACE_PROBE1(ip__wsrv__direct__blocked, conn_t *, connp); + connp->conn_direct_blocked = B_FALSE; + } + connp->conn_did_putbq = 0; } @@ -28274,11 +28405,18 @@ ip_wsrv(queue_t *q) * function and wakes up corresponding mac worker threads, which in turn * calls this callback function, and disables flow control. */ -/* ARGSUSED */ void -ill_flow_enable(void *ill, ip_mac_tx_cookie_t cookie) +ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie) { - qenable(((ill_t *)ill)->ill_wq); + ill_t *ill = (ill_t *)arg; + ip_stack_t *ipst = ill->ill_ipst; + idl_tx_list_t *idl_txl; + + idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; + mutex_enter(&idl_txl->txl_lock); + /* add code to set a flag to indicate idl_txl is enabled */ + conn_walk_drain(ipst, idl_txl); + mutex_exit(&idl_txl->txl_lock); } /* @@ -28315,7 +28453,7 @@ conn_walk_fanout(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) * in turn qenable the next conn, when it is done/blocked/closing.
*/ static void -conn_walk_drain(ip_stack_t *ipst) +conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list) { int i; idl_t *idl; @@ -28323,7 +28461,7 @@ conn_walk_drain(ip_stack_t *ipst) IP_STAT(ipst, ip_conn_walk_drain); for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { - idl = &ipst->ips_conn_drain_list[i]; + idl = &tx_list->txl_drain_list[i]; mutex_enter(&idl->idl_lock); if (idl->idl_conn == NULL) { mutex_exit(&idl->idl_lock); @@ -28521,6 +28659,41 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, return (found); } +static void +conn_setqfull(conn_t *connp) +{ + queue_t *q = connp->conn_wq; + + if (!(q->q_flag & QFULL)) { + mutex_enter(QLOCK(q)); + if (!(q->q_flag & QFULL)) { + /* still need to set QFULL */ + q->q_flag |= QFULL; + mutex_exit(QLOCK(q)); + } else { + mutex_exit(QLOCK(q)); + } + } +} + +static void +conn_clrqfull(conn_t *connp) +{ + queue_t *q = connp->conn_wq; + + if (q->q_flag & QFULL) { + mutex_enter(QLOCK(q)); + if (q->q_flag & QFULL) { + q->q_flag &= ~QFULL; + mutex_exit(QLOCK(q)); + if (q->q_flag & QWANTW) + qbackenable(q, 0); + } else { + mutex_exit(QLOCK(q)); + } + } +} + /* * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp. 
*/ @@ -29666,7 +29839,7 @@ ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, 0); ILL_SEND_TX(out_ill, - ire, connp, first_mp, 0); + ire, connp, first_mp, 0, connp); } else { BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutDiscards); diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 686e2ad94e..fe10ea8110 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -10807,9 +10807,12 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, if (ipst->ips_ip_output_queue && connp != NULL && !mctl_present && caller != IRE_SEND) { if (caller == IP_WSRV) { + idl_tx_list_t *idl_txl; + + idl_txl = &ipst->ips_idl_tx_list[0]; connp->conn_did_putbq = 1; (void) putbq(connp->conn_wq, mp); - conn_drain_insert(connp); + conn_drain_insert(connp, idl_txl); /* * caller == IP_WSRV implies we are * the service thread, and the diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 64f9789fe9..3628dd4f56 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -3083,6 +3083,8 @@ ill_capability_direct_enable(ill_t *ill) idd->idd_tx_dh = direct.di_tx_dh; idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; idd->idd_tx_cb_dh = direct.di_tx_cb_dh; + idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; + idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; /* * One time registration of flow enable callback function */ diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index 369ba60005..7711a2fedc 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -503,24 +503,72 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; #define ILL_DIRECT_CAPABLE(ill) \ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) -#define ILL_SEND_TX(ill, ire, hint, mp, flag) { \ - if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \ - ill_dld_direct_t *idd; \ - \ - idd = 
&(ill)->ill_dld_capab->idc_direct; \ - /* \ - * Send the packet directly to DLD, where it \ - * may be queued depending on the availability \ - * of transmit resources at the media layer. \ - * Ignore the returned value for the time being \ - * In future, we may want to take this into \ - * account and flow control the TCP. \ - */ \ - (void) idd->idd_tx_df(idd->idd_tx_dh, mp, \ - (uintptr_t)(hint), flag); \ - } else { \ - putnext((ire)->ire_stq, mp); \ - } \ +#define ILL_SEND_TX(ill, ire, hint, mp, flag, connp) { \ + if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \ + ill_dld_direct_t *idd; \ + uintptr_t cookie; \ + conn_t *udp_connp = (conn_t *)connp; \ + \ + idd = &(ill)->ill_dld_capab->idc_direct; \ + /* \ + * Send the packet directly to DLD, where it \ + * may be queued depending on the availability \ + * of transmit resources at the media layer. \ + * Ignore the returned value for the time being \ + * In future, we may want to take this into \ + * account and flow control the TCP. \ + */ \ + cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, \ + (uintptr_t)(hint), flag); \ + \ + /* \ + * non-NULL cookie indicates flow control situation \ + * and the cookie itself identifies this specific \ + * Tx ring that is blocked. This cookie is used to \ + * block the UDP conn that is sending packets over \ + * this specific Tx ring. \ + */ \ + if ((cookie != NULL) && (udp_connp != NULL) && \ + (udp_connp->conn_ulp == IPPROTO_UDP)) { \ + idl_tx_list_t *idl_txl; \ + ip_stack_t *ipst; \ + \ + /* \ + * Flow controlled. 
\ + */ \ + DTRACE_PROBE2(ill__send__tx__cookie, \ + uintptr_t, cookie, conn_t *, udp_connp); \ + ipst = udp_connp->conn_netstack->netstack_ip; \ + idl_txl = \ + &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];\ + mutex_enter(&idl_txl->txl_lock); \ + if (udp_connp->conn_direct_blocked || \ + (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, \ + cookie) == 0)) { \ + DTRACE_PROBE1(ill__tx__not__blocked, \ + boolean, \ + udp_connp->conn_direct_blocked); \ + } else if (idl_txl->txl_cookie != NULL && \ + idl_txl->txl_cookie != cookie) { \ + udp_t *udp = udp_connp->conn_udp; \ + udp_stack_t *us = udp->udp_us; \ + \ + DTRACE_PROBE2(ill__send__tx__collision, \ + uintptr_t, cookie, \ + uintptr_t, idl_txl->txl_cookie); \ + UDP_STAT(us, udp_cookie_coll); \ + } else { \ + udp_connp->conn_direct_blocked = B_TRUE;\ + idl_txl->txl_cookie = cookie; \ + conn_drain_insert(udp_connp, idl_txl); \ + DTRACE_PROBE1(ill__send__tx__insert, \ + conn_t *, udp_connp); \ + } \ + mutex_exit(&idl_txl->txl_lock); \ + } \ + } else { \ + putnext((ire)->ire_stq, mp); \ + } \ } #define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \ diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index 750378f587..d6f0b033ff 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -131,6 +131,9 @@ typedef struct ire_stats { uint64_t ire_stats_deleted; /* # of ires deleted from the bucket */ } ire_stats_t; +#define TX_FANOUT_SIZE 128 +#define IDLHASHINDEX(X) \ + ((((uintptr_t)(X) >> 2) + ((uintptr_t)(X) >> 9)) & (TX_FANOUT_SIZE - 1)) /* * IP stack instances @@ -348,9 +351,9 @@ struct ip_stack { kstat_t *ips_loopback_ksp; - struct idl_s *ips_conn_drain_list; /* Array of conn drain lists */ + /* Array of conn drain lists */ + struct idl_tx_list_s *ips_idl_tx_list; uint_t ips_conn_drain_list_cnt; /* Count of conn_drain_list */ - int ips_conn_drain_list_index; /* Next drain_list */ /* * ID used to assign next free one. 
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 2ecc445a56..716689989f 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -245,6 +245,7 @@ struct conn_s { unsigned int conn_lso_ok : 1; /* LSO is usable */ + boolean_t conn_direct_blocked; /* conn is flow-controlled */ squeue_t *conn_initial_sqp; /* Squeue at open time */ squeue_t *conn_final_sqp; /* Squeue after connect */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 95b2551008..1b0df0f335 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -86,6 +86,7 @@ #include <inet/kstatcom.h> #include <inet/tcp.h> #include <inet/tcp_impl.h> +#include <inet/udp_impl.h> #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> #include <inet/ipdrop.h> @@ -19431,7 +19432,7 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } - ILL_SEND_TX(ill, ire, connp, mp, 0); + ILL_SEND_TX(ill, ire, connp, mp, 0, NULL); } IRE_REFRELE(ire); @@ -21418,7 +21419,7 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } - ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0); + ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0, NULL); } } diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 30c876b45f..c4f7be3b93 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -5604,6 +5604,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) udp_stack_t *us = udp->udp_us; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; boolean_t ll_multicast = B_FALSE; + boolean_t direct_send; dev_q = ire->ire_stq->q_next; ASSERT(dev_q != NULL); @@ -5611,16 +5612,24 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) ill = ire_to_ill(ire); 
ASSERT(ill != NULL); + /* + * For the direct send case, if resetting of conn_direct_blocked + * was missed, it is still ok because the putq() would enable + * the queue and write service will drain it out. + */ + direct_send = ILL_DIRECT_CAPABLE(ill); + /* is queue flow controlled? */ - if (q->q_first != NULL || connp->conn_draining || - DEV_Q_FLOW_BLOCKED(dev_q)) { + if ((!direct_send) && (q->q_first != NULL || connp->conn_draining || + DEV_Q_FLOW_BLOCKED(dev_q))) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - - if (ipst->ips_ip_output_queue) + if (ipst->ips_ip_output_queue) { + DTRACE_PROBE1(udp__xmit__putq, conn_t *, connp); (void) putq(connp->conn_wq, mp); - else + } else { freemsg(mp); + } ire_refrele(ire); return; } @@ -5718,20 +5727,60 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } - if (mp != NULL) { - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, - ipha_t *, ipha, ip6_t *, NULL, int, 0); + if (mp == NULL) + goto bail; - if (ILL_DIRECT_CAPABLE(ill)) { - ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct; + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, + void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, + ipha_t *, ipha, ip6_t *, NULL, int, 0); - (void) idd->idd_tx_df(idd->idd_tx_dh, mp, - (uintptr_t)connp, 0); - } else { - putnext(ire->ire_stq, mp); + if (direct_send) { + uintptr_t cookie; + ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct; + + cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, + (uintptr_t)connp, 0); + if (cookie != NULL) { + idl_tx_list_t *idl_txl; + + /* + * Flow controlled. 
+ */ + DTRACE_PROBE2(non__null__cookie, uintptr_t, + cookie, conn_t *, connp); + idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; + mutex_enter(&idl_txl->txl_lock); + /* + * Check again after holding txl_lock to see if Tx + * ring is still blocked and only then insert the + * connp into the drain list. + */ + if (connp->conn_direct_blocked || + (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, + cookie) == 0)) { + mutex_exit(&idl_txl->txl_lock); + goto bail; + } + if (idl_txl->txl_cookie != NULL && + idl_txl->txl_cookie != cookie) { + DTRACE_PROBE2(udp__xmit__collision, + uintptr_t, cookie, + uintptr_t, idl_txl->txl_cookie); + UDP_STAT(us, udp_cookie_coll); + } else { + connp->conn_direct_blocked = B_TRUE; + idl_txl->txl_cookie = cookie; + conn_drain_insert(connp, idl_txl); + DTRACE_PROBE1(udp__xmit__insert, + conn_t *, connp); + } + mutex_exit(&idl_txl->txl_lock); } + } else { + DTRACE_PROBE1(udp__xmit__putnext, mblk_t *, mp); + putnext(ire->ire_stq, mp); } +bail: IRE_REFRELE(ire); } diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 34b38e67bd..96f84e43bc 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -192,10 +192,7 @@ typedef struct udp_stat { /* Class "net" kstats */ kstat_named_t udp_in_recvtclass; kstat_named_t udp_in_timestamp; kstat_named_t udp_ip_rcvpktinfo; - kstat_named_t udp_direct_send; - kstat_named_t udp_bwsq_send; - kstat_named_t udp_connected_direct_send; - kstat_named_t udp_connected_bwsq_send; + kstat_named_t udp_cookie_coll; #ifdef DEBUG kstat_named_t udp_data_conn; kstat_named_t udp_data_notconn; diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c index 04dc974198..59eb75f9d5 100644 --- a/usr/src/uts/common/io/aggr/aggr_grp.c +++ b/usr/src/uts/common/io/aggr/aggr_grp.c @@ -313,13 +313,13 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) link_state_changed = B_TRUE; } - aggr_grp_multicst_port(port, B_TRUE); - /* * Update port's state. */ port->lp_state = AGGR_PORT_STATE_ATTACHED; + aggr_grp_multicst_port(port, B_TRUE); + /* * Set port's receive callback */ @@ -2028,8 +2028,10 @@ aggr_remmac(void *arg, const uint8_t *mac_addr) /* * Add or remove the multicast addresses that are defined for the group * to or from the specified port. - * This function is called before stopping a port, before a port - * is detached from a group, and when attaching a port to a group. + * + * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port + * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is + * called when the port is either stopped or detached. 
*/ void aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) @@ -2039,7 +2041,7 @@ aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) ASSERT(MAC_PERIM_HELD(port->lp_mh)); ASSERT(MAC_PERIM_HELD(grp->lg_mh)); - if (!port->lp_started) + if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) return; mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); @@ -2055,8 +2057,10 @@ aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) mac_perim_enter_by_mh(grp->lg_mh, &mph); for (port = grp->lg_ports; port != NULL; port = port->lp_next) { - if (port->lp_state != AGGR_PORT_STATE_ATTACHED) + if (port->lp_state != AGGR_PORT_STATE_ATTACHED || + !port->lp_started) { continue; + } cerr = aggr_port_multicst(port, add, addrp); if (cerr != 0 && err == 0) err = cerr; diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index 0323b622f1..4097ba335e 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -493,9 +493,11 @@ aggr_port_start(aggr_port_t *port) { ASSERT(MAC_PERIM_HELD(port->lp_mh)); - if (!port->lp_started) - port->lp_started = B_TRUE; + if (port->lp_started) + return (0); + port->lp_started = B_TRUE; + aggr_grp_multicst_port(port, B_TRUE); return (0); } @@ -507,8 +509,7 @@ aggr_port_stop(aggr_port_t *port) if (!port->lp_started) return; - if (port->lp_state == AGGR_PORT_STATE_ATTACHED) - aggr_grp_multicst_port(port, B_FALSE); + aggr_grp_multicst_port(port, B_FALSE); /* update the port state */ port->lp_started = B_FALSE; diff --git a/usr/src/uts/common/io/aggr/aggr_send.c b/usr/src/uts/common/io/aggr/aggr_send.c index 9b4ad24621..bc0a19368d 100644 --- a/usr/src/uts/common/io/aggr/aggr_send.c +++ b/usr/src/uts/common/io/aggr/aggr_send.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -35,6 +35,7 @@ #include <sys/vlan.h> #include <sys/strsun.h> #include <sys/strsubr.h> +#include <sys/dlpi.h> #include <inet/common.h> #include <inet/led.h> @@ -42,184 +43,29 @@ #include <inet/ip6.h> #include <inet/tcp.h> #include <netinet/udp.h> -#include <inet/ipsec_impl.h> -#include <inet/sadb.h> -#include <inet/ipsecesp.h> -#include <inet/ipsecah.h> #include <sys/aggr.h> #include <sys/aggr_impl.h> -#define HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) -#define HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) - -static uint16_t aggr_send_ip6_hdr_len(mblk_t *, ip6_t *); - -static uint64_t -aggr_send_hash(aggr_grp_t *grp, mblk_t *mp) -{ - struct ether_header *ehp; - uint16_t sap; - uint_t skip_len; - uint8_t proto; - uint32_t policy = grp->lg_tx_policy; - uint64_t hash = 0; - - ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); - ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); - ASSERT(RW_READ_HELD(&grp->lg_tx_lock)); - - /* compute MAC hash */ - - ehp = (struct ether_header *)mp->b_rptr; - - if (policy & AGGR_POLICY_L2) { - uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; - uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; - hash = HASH_MAC(mac_src) ^ HASH_MAC(mac_dst); - policy &= ~AGGR_POLICY_L2; - } - - if (policy == 0) - goto done; - - /* skip ethernet header */ - - if (ntohs(ehp->ether_type) == ETHERTYPE_VLAN) { - struct ether_vlan_header *evhp; - mblk_t *newmp = NULL; - - skip_len = sizeof (struct ether_vlan_header); - if (MBLKL(mp) < skip_len) { - /* the vlan tag is the payload, pull up first */ - newmp = msgpullup(mp, -1); - if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) { - goto done; - } - evhp = (struct ether_vlan_header *)newmp->b_rptr; - } else { - evhp = (struct ether_vlan_header *)mp->b_rptr; - } - - sap = ntohs(evhp->ether_type); - freemsg(newmp); - } else { - sap = ntohs(ehp->ether_type); - skip_len = sizeof (struct ether_header); - } - - /* if ethernet header is in its own mblk, skip it */ - if 
(MBLKL(mp) <= skip_len) { - skip_len -= MBLKL(mp); - mp = mp->b_cont; - } - - sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; - - /* compute IP src/dst addresses hash and skip IPv{4,6} header */ - - switch (sap) { - case ETHERTYPE_IP: { - ipha_t *iphp; - - if (MBLKL(mp) < (skip_len + sizeof (ipha_t))) - goto done; - - iphp = (ipha_t *)(mp->b_rptr + skip_len); - proto = iphp->ipha_protocol; - skip_len += IPH_HDR_LENGTH(iphp); - - if (policy & AGGR_POLICY_L3) { - uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); - uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); - - hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst)); - policy &= ~AGGR_POLICY_L3; - } - break; - } - case ETHERTYPE_IPV6: { - ip6_t *ip6hp; - - /* - * if ipv6 packet has options, the proto will not be one of the - * ones handled by the ULP processor below, and will return 0 - * as the index - */ - if (MBLKL(mp) < (skip_len + sizeof (ip6_t))) - goto done; - - ip6hp = (ip6_t *)(mp->b_rptr + skip_len); - proto = ip6hp->ip6_nxt; - skip_len += aggr_send_ip6_hdr_len(mp, ip6hp); - - if (policy & AGGR_POLICY_L3) { - uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); - uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); - - hash ^= (HASH_4BYTES(ip_src) ^ HASH_4BYTES(ip_dst)); - policy &= ~AGGR_POLICY_L3; - } - break; - } - default: - goto done; - } - - if (!(policy & AGGR_POLICY_L4)) - goto done; - - /* if ip header is in its own mblk, skip it */ - if (MBLKL(mp) <= skip_len) { - skip_len -= MBLKL(mp); - mp = mp->b_cont; - } - - /* parse ULP header */ -again: - switch (proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_ESP: - case IPPROTO_SCTP: - /* - * These Internet Protocols are intentionally designed - * for hashing from the git-go. Port numbers are in the first - * word for transports, SPI is first for ESP. 
- */ - hash ^= HASH_4BYTES((mp->b_rptr + skip_len)); - break; - - case IPPROTO_AH: { - ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); - - uint_t ah_length = AH_TOTAL_LEN(ah); - proto = ah->ah_nexthdr; - skip_len += ah_length; - - /* if ip header is in its own mblk, skip it */ - if (MBLKL(mp) <= skip_len) { - skip_len -= MBLKL(mp); - mp = mp->b_cont; - } - - goto again; - } - } - -done: - return (hash); -} - /* * Update the TX load balancing policy of the specified group. */ void aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy) { + uint8_t mac_policy = 0; + ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + if ((policy & AGGR_POLICY_L2) != 0) + mac_policy |= MAC_PKT_HASH_L2; + if ((policy & AGGR_POLICY_L3) != 0) + mac_policy |= MAC_PKT_HASH_L3; + if ((policy & AGGR_POLICY_L4) != 0) + mac_policy |= MAC_PKT_HASH_L4; + grp->lg_tx_policy = policy; + grp->lg_mac_tx_policy = mac_policy; } /* @@ -250,7 +96,8 @@ aggr_m_tx(void *arg, mblk_t *mp) nextp = mp->b_next; mp->b_next = NULL; - hash = aggr_send_hash(grp, mp); + hash = mac_pkt_hash(DL_ETHER, mp, grp->lg_mac_tx_policy, + B_TRUE); port = grp->lg_tx_ports[hash % grp->lg_ntx_ports]; /* @@ -266,7 +113,7 @@ aggr_m_tx(void *arg, mblk_t *mp) */ freemsg(mp); } else { - mblk_t *ret_mp; + mblk_t *ret_mp = NULL; /* * It is fine that the port state changes now. 
@@ -385,51 +232,3 @@ aggr_send_port_disable(aggr_port_t *port) port->lp_tx_enabled = B_FALSE; } - -static uint16_t -aggr_send_ip6_hdr_len(mblk_t *mp, ip6_t *ip6h) -{ - uint16_t length; - uint_t ehdrlen; - uint8_t *nexthdrp; - uint8_t *whereptr; - uint8_t *endptr; - ip6_dest_t *desthdr; - ip6_rthdr_t *rthdr; - ip6_frag_t *fraghdr; - - length = IPV6_HDR_LEN; - whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ - endptr = mp->b_wptr; - - nexthdrp = &ip6h->ip6_nxt; - while (whereptr < endptr) { - switch (*nexthdrp) { - case IPPROTO_HOPOPTS: - case IPPROTO_DSTOPTS: - /* Assumes the headers are identical for hbh and dst */ - desthdr = (ip6_dest_t *)whereptr; - ehdrlen = 8 * (desthdr->ip6d_len + 1); - nexthdrp = &desthdr->ip6d_nxt; - break; - case IPPROTO_ROUTING: - rthdr = (ip6_rthdr_t *)whereptr; - ehdrlen = 8 * (rthdr->ip6r_len + 1); - nexthdrp = &rthdr->ip6r_nxt; - break; - case IPPROTO_FRAGMENT: - fraghdr = (ip6_frag_t *)whereptr; - ehdrlen = sizeof (ip6_frag_t); - nexthdrp = &fraghdr->ip6f_nxt; - break; - case IPPROTO_NONE: - /* No next header means we're finished */ - default: - return (length); - } - length += ehdrlen; - whereptr += ehdrlen; - } - - return (length); -} diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index 2c3d0f7ecb..5533b582a0 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -1334,25 +1334,14 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) case DLD_ENABLE: dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf, direct->di_rx_ch); - /* - * TODO: XXXGopi - * - * Direct pointer to functions in the MAC layer - * should be passed here: - * - * 1) pass mac_tx() and mac_client_handle instead - * of str_mdata_fastpath_put() and dld_str_t. 
But - * not done presently because of some VLAN - * processing stuff in str_mdata_fastpath_put(). - * - * 2) pass a MAC layer callback instead of - * dld_flow_ctl_callb(). - */ + direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; direct->di_tx_dh = dsp; - direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify; direct->di_tx_cb_dh = dsp->ds_mch; + direct->di_tx_fctl_df = (uintptr_t)mac_tx_is_flow_blocked; + direct->di_tx_fctl_dh = dsp->ds_mch; + dsp->ds_direct = B_TRUE; return (0); diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index 064217c8f2..53450a45d1 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -607,14 +607,6 @@ dls_mac_active_set(dls_link_t *dlp) * Set the function to start receiving packets. */ mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp); - - /* - * We've got a MAC client for this link now. - * Push down the flows that were defined on this link - * hitherto. The flows are added to the active flow table - * and SRS, softrings etc. are created as needed. - */ - mac_link_init_flows(dlp->dl_mch); } dlp->dl_nactive++; return (0); @@ -625,20 +617,6 @@ dls_mac_active_clear(dls_link_t *dlp) { if (--dlp->dl_nactive == 0) { ASSERT(dlp->dl_mah != NULL); - /* - * We would have initialized subflows etc. only if we - * brought up the primary client and set the unicast - * unicast address etc. Deactivate the flows. The flow - * entry will be removed from the active flow tables, - * and the associated SRS, softrings etc will be - * deleted. 
But the flow entry itself won't be - * destroyed, instead it will continue to be - * archived off the the global flow hash list, for a - * possible future activation when say - * IP is plumbed again - */ - - mac_link_release_flows(dlp->dl_mch); (void) mac_unicast_remove(dlp->dl_mch, dlp->dl_mah); dlp->dl_mah = NULL; mac_rx_clear(dlp->dl_mch); diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index 852b87d24b..85aee7fe86 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,7 +36,7 @@ #include <sys/atomic.h> static kmem_cache_t *i_dls_link_cachep; -static mod_hash_t *i_dls_link_hash; +mod_hash_t *i_dls_link_hash; static uint_t i_dls_link_count; #define LINK_HASHSZ 67 /* prime */ diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index bb922423b3..576e13ac2c 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,10 +60,15 @@ boolean_t devnet_need_rebuild; /* Upcall door handle */ static door_handle_t dls_mgmt_dh = NULL; -#define DD_CONDEMNED 0x1 +#define DD_CONDEMNED 0x1 +#define DD_KSTAT_CHANGING 0x2 /* * This structure is used to keep the <linkid, macname> mapping. + * This structure itself is not protected by the mac perimeter, but is + * protected by the dd_mutex and i_dls_devnet_lock. Thus most of the + * functions manipulating this structure such as dls_devnet_set/unset etc. + * may be called while not holding the mac perimeter. 
*/ typedef struct dls_devnet_s { datalink_id_t dd_linkid; @@ -614,6 +619,11 @@ dls_devnet_rele_link(dls_dl_handle_t dlh, dls_link_t *dlp) /* * Query the "link" kstats. + * + * We may be called from the kstat subsystem in an arbitrary context. + * If the caller is the stack, the context could be an upcall data + * thread. Hence we can't acquire the mac perimeter in this function + * for fear of deadlock. */ static int dls_devnet_stat_update(kstat_t *ksp, int rw) @@ -621,21 +631,34 @@ dls_devnet_stat_update(kstat_t *ksp, int rw) dls_devnet_t *ddp = ksp->ks_private; dls_link_t *dlp; int err; - mac_perim_handle_t mph; - err = mac_perim_enter_by_macname(ddp->dd_mac, &mph); - if (err != 0) - return (err); + /* + * Check whether the link is being renamed or is going away + * before incrementing dd_tref which in turn prevents the link + * from being renamed or deleted until we finish. + */ + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_flags & (DD_CONDEMNED | DD_KSTAT_CHANGING)) { + mutex_exit(&ddp->dd_mutex); + return (ENOENT); + } + ddp->dd_tref++; + mutex_exit(&ddp->dd_mutex); - err = dls_link_hold(ddp->dd_mac, &dlp); - if (err != 0) { - mac_perim_exit(mph); - return (err); + /* + * If a device detach happens at this time, it will block in + * dls_devnet_unset since the dd_tref has been bumped up above. So the + * access to 'dlp' is safe even though we don't hold the mac perimeter. 
+ */ + if (mod_hash_find(i_dls_link_hash, (mod_hash_key_t)ddp->dd_mac, + (mod_hash_val_t *)&dlp) != 0) { + dls_devnet_rele_tmp(ddp); + return (ENOENT); } err = dls_stat_update(ksp, dlp, rw); - dls_link_rele(dlp); - mac_perim_exit(mph); + + dls_devnet_rele_tmp(ddp); return (err); } @@ -707,6 +730,7 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, dls_devnet_t **ddpp) dls_devnet_t *ddp = NULL; datalink_class_t class; int err; + boolean_t stat_create = B_FALSE; rw_enter(&i_dls_devnet_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_devnet_hash, @@ -748,8 +772,7 @@ newphys: (mod_hash_key_t)(uintptr_t)linkid, (mod_hash_val_t)ddp) == 0); devnet_need_rebuild = B_TRUE; - dls_devnet_stat_create(ddp); - + stat_create = B_TRUE; mutex_enter(&ddp->dd_mutex); if (!ddp->dd_prop_loaded && (ddp->dd_prop_taskid == NULL)) { ddp->dd_prop_taskid = taskq_dispatch(system_taskq, @@ -761,6 +784,20 @@ newphys: err = 0; done: rw_exit(&i_dls_devnet_lock); + /* + * It is safe to drop the i_dls_devnet_lock at this point. In the case + * of physical devices, the softmac framework will fail the device + * detach based on the smac_state or smac_hold_cnt. Other cases like + * vnic and aggr use their own scheme to serialize creates and deletes + * and ensure that *ddp is valid. + * + * The kstat subsystem holds its own locks (rather perimeter) before + * calling the ks_update (dls_devnet_stat_update) entry point which + * in turn grabs the i_dls_devnet_lock. So the lock hierarchy is + * kstat locks -> i_dls_devnet_lock. 
+ */ + if (stat_create) + dls_devnet_stat_create(ddp); if (err == 0 && ddpp != NULL) *ddpp = ddp; return (err); @@ -815,7 +852,6 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) VERIFY(mod_hash_remove(i_dls_devnet_id_hash, (mod_hash_key_t)(uintptr_t)ddp->dd_linkid, &val) == 0); - dls_devnet_stat_destroy(ddp); devnet_need_rebuild = B_TRUE; } rw_exit(&i_dls_devnet_lock); @@ -830,6 +866,9 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL); } + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + dls_devnet_stat_destroy(ddp); + ddp->dd_prop_loaded = B_FALSE; ddp->dd_linkid = DATALINK_INVALID_LINKID; ddp->dd_zid = GLOBAL_ZONEID; @@ -1112,6 +1151,7 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) mac_perim_handle_t mph = NULL; mac_handle_t mh; mod_hash_val_t val; + boolean_t clear_dd_flag = B_FALSE; /* * In the second case, id2 must be a REMOVED physical link. @@ -1134,8 +1174,10 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) * mac perimeter, hence enter the perimeter first. This also waits * for the property loading to finish. */ - if ((err = mac_perim_enter_by_linkid(id1, &mph)) != 0) - goto done; + if ((err = mac_perim_enter_by_linkid(id1, &mph)) != 0) { + softmac_rele_device(ddh); + return (err); + } rw_enter(&i_dls_devnet_lock, RW_WRITER); if ((err = mod_hash_find(i_dls_devnet_id_hash, @@ -1146,13 +1188,22 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) } /* - * Return EBUSY if any applications have this link open. + * Return EBUSY if any applications have this link open or if any + * thread is currently accessing the link kstats. Then set the + * DD_KSTAT_CHANGING flag to prevent any access to the kstats + * while we delete and recreate kstats below. 
*/ + mutex_enter(&ddp->dd_mutex); if (ddp->dd_ref > 1) { + mutex_exit(&ddp->dd_mutex); err = EBUSY; goto done; } + ddp->dd_flags |= DD_KSTAT_CHANGING; + clear_dd_flag = B_TRUE; + mutex_exit(&ddp->dd_mutex); + if (id2 == DATALINK_INVALID_LINKID) { (void) strlcpy(linkname, link, sizeof (linkname)); @@ -1225,11 +1276,21 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) done: /* * Change the name of the kstat based on the new link name. + * We can't hold the i_dls_devnet_lock across calls to the kstat + * subsystem. Instead the DD_KSTAT_CHANGING flag set above in this + * function prevents any access to the dd_ksp while we delete and + * recreate it below. */ + rw_exit(&i_dls_devnet_lock); if (err == 0) dls_devnet_stat_rename(ddp, linkname); - rw_exit(&i_dls_devnet_lock); + if (clear_dd_flag) { + mutex_enter(&ddp->dd_mutex); + ddp->dd_flags &= ~DD_KSTAT_CHANGING; + mutex_exit(&ddp->dd_mutex); + } + if (mph != NULL) mac_perim_exit(mph); softmac_rele_device(ddh); @@ -1388,6 +1449,11 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid) int err; mac_perim_handle_t mph; + /* + * Holding the mac perimeter ensures that the downcall from the + * dlmgmt daemon which does the property loading does not proceed + * until we relinquish the perimeter. 
+ */ mac_perim_enter_by_mh(mh, &mph); /* @@ -1400,8 +1466,8 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid) return (err); } if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) { - (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE); mac_perim_exit(mph); + (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE); return (err); } mac_perim_exit(mph); diff --git a/usr/src/uts/common/io/e1000g/e1000g_main.c b/usr/src/uts/common/io/e1000g/e1000g_main.c index 44a73391e1..5272a26fb1 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_main.c +++ b/usr/src/uts/common/io/e1000g/e1000g_main.c @@ -1618,7 +1618,6 @@ static mblk_t *e1000g_poll_ring(void *arg, int bytes_to_pickup) e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)arg; mblk_t *mp = NULL; mblk_t *tail; - uint_t sz = 0; struct e1000g *adapter; adapter = rx_ring->adapter; @@ -1631,68 +1630,7 @@ static mblk_t *e1000g_poll_ring(void *arg, int bytes_to_pickup) } mutex_enter(&rx_ring->rx_lock); - ASSERT(rx_ring->poll_flag); - - /* - * Get any packets that have arrived. Works only if we - * actually disable the physical adapter/rx_ring interrupt. - * (e1000g_poll_mode == 1). In case e1000g_poll_mode == 0, - * packets will have already been added to the poll list - * by the interrupt (see e1000g_intr_work()). 
- */ - if (adapter->poll_mode) { - mp = e1000g_receive(rx_ring, &tail, &sz); - if (mp != NULL) { - if (rx_ring->poll_list_head == NULL) - rx_ring->poll_list_head = mp; - else - rx_ring->poll_list_tail->b_next = mp; - rx_ring->poll_list_tail = tail; - rx_ring->poll_list_sz += sz; - } - } - - mp = rx_ring->poll_list_head; - if (mp == NULL) { - mutex_exit(&rx_ring->rx_lock); - rw_exit(&adapter->chip_lock); - return (NULL); - } - - /* Check if we can sendup the entire chain */ - if (bytes_to_pickup >= rx_ring->poll_list_sz) { - mp = rx_ring->poll_list_head; - rx_ring->poll_list_head = NULL; - rx_ring->poll_list_tail = NULL; - rx_ring->poll_list_sz = 0; - mutex_exit(&rx_ring->rx_lock); - rw_exit(&adapter->chip_lock); - return (mp); - } - - /* - * We need to find out how much chain we can send up. We - * are guaranteed that atleast one packet will go up since - * we already checked that. - */ - tail = mp; - sz = 0; - while (mp != NULL) { - sz += MBLKL(mp); - if (sz > bytes_to_pickup) { - sz -= MBLKL(mp); - break; - } - tail = mp; - mp = mp->b_next; - } - - mp = rx_ring->poll_list_head; - rx_ring->poll_list_head = tail->b_next; - if (rx_ring->poll_list_head == NULL) - rx_ring->poll_list_tail = NULL; - rx_ring->poll_list_sz -= sz; - tail->b_next = NULL; + mp = e1000g_receive(rx_ring, &tail, bytes_to_pickup); mutex_exit(&rx_ring->rx_lock); rw_exit(&adapter->chip_lock); return (mp); @@ -2118,79 +2056,26 @@ e1000g_intr_work(struct e1000g *Adapter, uint32_t icr) } if (icr & E1000_ICR_RXT0) { - mblk_t *mp; - uint_t sz = 0; - mblk_t *tmp, *tail = NULL; + mblk_t *mp = NULL; + mblk_t *tail = NULL; e1000g_rx_ring_t *rx_ring; rx_ring = Adapter->rx_ring; mutex_enter(&rx_ring->rx_lock); - /* - * If the real interrupt for the Rx ring was - * not disabled (e1000g_poll_mode == 0), then - * we still pick up the packets and queue them - * on Rx ring if we were in polling mode. 
this - * enables the polling thread to pick up packets - * really fast in polling mode and helps improve - * latency. + * Sometimes with legacy interrupts, it is possible that + * there is a single interrupt for Rx/Tx. In which + * case, if poll flag is set, we shouldn't really + * be doing Rx processing. */ - mp = e1000g_receive(rx_ring, &tail, &sz); + if (!rx_ring->poll_flag) + mp = e1000g_receive(rx_ring, &tail, + E1000G_CHAIN_NO_LIMIT); + mutex_exit(&rx_ring->rx_lock); rw_exit(&Adapter->chip_lock); - - if (mp != NULL) { - ASSERT(tail != NULL); - if (!rx_ring->poll_flag) { - /* - * If not polling, see if something was - * already queued. Take care not to - * reorder packets. - */ - if (rx_ring->poll_list_head == NULL) { - mutex_exit(&rx_ring->rx_lock); - mac_rx_ring(Adapter->mh, rx_ring->mrh, - mp, rx_ring->ring_gen_num); - } else { - tmp = rx_ring->poll_list_head; - rx_ring->poll_list_head = NULL; - rx_ring->poll_list_tail->b_next = mp; - rx_ring->poll_list_tail = NULL; - rx_ring->poll_list_sz = 0; - mutex_exit(&rx_ring->rx_lock); - mac_rx_ring(Adapter->mh, rx_ring->mrh, - tmp, rx_ring->ring_gen_num); - } - } else { - /* - * We are in a polling mode. Put the - * processed packets on the poll list. - */ - if (rx_ring->poll_list_head == NULL) - rx_ring->poll_list_head = mp; - else - rx_ring->poll_list_tail->b_next = mp; - rx_ring->poll_list_tail = tail; - rx_ring->poll_list_sz += sz; - mutex_exit(&rx_ring->rx_lock); - } - } else if (!rx_ring->poll_flag && - rx_ring->poll_list_head != NULL) { - /* - * Nothing new has arrived (then why - * was the interrupt raised??). Check - * if something queued from the last - * time. 
- */ - tmp = rx_ring->poll_list_head; - rx_ring->poll_list_head = NULL; - rx_ring->poll_list_tail = NULL; - rx_ring->poll_list_sz = 0; - mutex_exit(&rx_ring->rx_lock); + if (mp != NULL) mac_rx_ring(Adapter->mh, rx_ring->mrh, - tmp, rx_ring->ring_gen_num); - } else { - mutex_exit(&rx_ring->rx_lock); - } + mp, rx_ring->ring_gen_num); } else rw_exit(&Adapter->chip_lock); @@ -2698,7 +2583,6 @@ e1000g_rx_ring_intr_enable(mac_intr_handle_t intrh) struct e1000g *adapter = rx_ring->adapter; struct e1000_hw *hw = &adapter->shared; uint32_t intr_mask; - boolean_t poll_mode; rw_enter(&adapter->chip_lock, RW_READER); @@ -2709,20 +2593,17 @@ e1000g_rx_ring_intr_enable(mac_intr_handle_t intrh) mutex_enter(&rx_ring->rx_lock); rx_ring->poll_flag = 0; - poll_mode = adapter->poll_mode; mutex_exit(&rx_ring->rx_lock); - if (poll_mode) { - /* Rx interrupt enabling for MSI and legacy */ - intr_mask = E1000_READ_REG(hw, E1000_IMS); - intr_mask |= E1000_IMS_RXT0; - E1000_WRITE_REG(hw, E1000_IMS, intr_mask); - E1000_WRITE_FLUSH(hw); + /* Rx interrupt enabling for MSI and legacy */ + intr_mask = E1000_READ_REG(hw, E1000_IMS); + intr_mask |= E1000_IMS_RXT0; + E1000_WRITE_REG(hw, E1000_IMS, intr_mask); + E1000_WRITE_FLUSH(hw); - /* Trigger a Rx interrupt to check Rx ring */ - E1000_WRITE_REG(hw, E1000_ICS, E1000_IMS_RXT0); - E1000_WRITE_FLUSH(hw); - } + /* Trigger a Rx interrupt to check Rx ring */ + E1000_WRITE_REG(hw, E1000_ICS, E1000_IMS_RXT0); + E1000_WRITE_FLUSH(hw); rw_exit(&adapter->chip_lock); return (0); @@ -2734,7 +2615,6 @@ e1000g_rx_ring_intr_disable(mac_intr_handle_t intrh) e1000g_rx_ring_t *rx_ring = (e1000g_rx_ring_t *)intrh; struct e1000g *adapter = rx_ring->adapter; struct e1000_hw *hw = &adapter->shared; - boolean_t poll_mode; rw_enter(&adapter->chip_lock, RW_READER); @@ -2742,22 +2622,13 @@ e1000g_rx_ring_intr_disable(mac_intr_handle_t intrh) rw_exit(&adapter->chip_lock); return (0); } - - /* - * Once the adapter can support per Rx ring interrupt, - * we should disable the 
real interrupt instead of just setting - * the flag. - */ mutex_enter(&rx_ring->rx_lock); rx_ring->poll_flag = 1; - poll_mode = adapter->poll_mode; mutex_exit(&rx_ring->rx_lock); - if (poll_mode) { - /* Rx interrupt disabling for MSI and legacy */ - E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0); - E1000_WRITE_FLUSH(hw); - } + /* Rx interrupt disabling for MSI and legacy */ + E1000_WRITE_REG(hw, E1000_IMC, E1000_IMS_RXT0); + E1000_WRITE_FLUSH(hw); rw_exit(&adapter->chip_lock); return (0); diff --git a/usr/src/uts/common/io/e1000g/e1000g_rx.c b/usr/src/uts/common/io/e1000g/e1000g_rx.c index 5876cb51b3..b1ac40145c 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_rx.c +++ b/usr/src/uts/common/io/e1000g/e1000g_rx.c @@ -452,7 +452,7 @@ e1000g_get_buf(e1000g_rx_ring_t *rx_ring) * This routine will process packets received in an interrupt */ mblk_t * -e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz) +e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t sz) { struct e1000_hw *hw; mblk_t *nmp; @@ -471,13 +471,13 @@ e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz) struct e1000g *Adapter; dma_buffer_t *rx_buf; uint16_t cksumflags; + uint_t chain_sz = 0; ret_mp = NULL; ret_nmp = NULL; pkt_count = 0; desc_count = 0; cksumflags = 0; - *sz = 0; Adapter = rx_ring->adapter; hw = &Adapter->shared; @@ -505,7 +505,8 @@ e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz) * descriptor owned by the hardware that begins a packet. 
*/ while ((current_desc->status & E1000_RXD_STAT_DD) && - (pkt_count < Adapter->rx_limit_onintr)) { + (pkt_count < Adapter->rx_limit_onintr) && + ((sz == E1000G_CHAIN_NO_LIMIT) || (chain_sz <= sz))) { desc_count++; /* @@ -832,7 +833,7 @@ rx_end_of_packet: } ret_nmp->b_next = NULL; *tail = ret_nmp; - *sz += length; + chain_sz += length; rx_ring->rx_mblk = NULL; rx_ring->rx_mblk_tail = NULL; diff --git a/usr/src/uts/common/io/e1000g/e1000g_sw.h b/usr/src/uts/common/io/e1000g/e1000g_sw.h index 277ba680a0..d0d465d666 100644 --- a/usr/src/uts/common/io/e1000g/e1000g_sw.h +++ b/usr/src/uts/common/io/e1000g/e1000g_sw.h @@ -198,6 +198,8 @@ extern "C" { #define E1000G_RX_SW_STOP 0x2 #define E1000G_RX_SW_DETACH 0x3 +#define E1000G_CHAIN_NO_LIMIT 0 + /* * definitions for smartspeed workaround */ @@ -786,9 +788,6 @@ typedef struct _e1000g_rx_ring { mac_ring_handle_t mrh; mac_ring_handle_t mrh_init; uint64_t ring_gen_num; - mblk_t *poll_list_head; - mblk_t *poll_list_tail; - uint_t poll_list_sz; boolean_t poll_flag; /* @@ -998,7 +997,7 @@ void e1000g_free_tx_swpkt(p_tx_sw_packet_t packet); void e1000g_tx_freemsg(e1000g_tx_ring_t *tx_ring); uint_t e1000g_tx_softint_worker(caddr_t arg1, caddr_t arg2); mblk_t *e1000g_m_tx(void *arg, mblk_t *mp); -mblk_t *e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t *sz); +mblk_t *e1000g_receive(e1000g_rx_ring_t *rx_ring, mblk_t **tail, uint_t sz); void e1000g_rxfree_func(p_rx_sw_packet_t packet); int e1000g_m_stat(void *arg, uint_t stat, uint64_t *val); diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index a8b411f994..be8518b523 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -504,6 +504,7 @@ i_mac_destructor(void *buf, void *arg) ASSERT(mip->mi_kstat_count == 0); ASSERT(mip->mi_nclients == 0); ASSERT(mip->mi_nactiveclients == 0); + ASSERT(mip->mi_single_active_client == NULL); ASSERT(mip->mi_state_flags == 0); ASSERT(mip->mi_factory_addr == NULL); 
ASSERT(mip->mi_factory_addr_num == 0); @@ -1712,6 +1713,12 @@ mac_tx_client_unblock(mac_client_impl_t *mcip) mac_tx_lock_all(mcip); mcip->mci_tx_flag &= ~MCI_TX_QUIESCE; mac_tx_unlock_all(mcip); + /* + * We may fail to disable flow control for the last MAC_NOTE_TX + * notification because the MAC client is quiesced. Send the + * notification again. + */ + i_mac_notify(mcip->mci_mip, MAC_NOTE_TX); } /* @@ -2350,10 +2357,8 @@ i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring) cclient = cclient->mci_client_next) { if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL) mac_tx_srs_wakeup(mac_srs, ring); - if (!FLOW_TAB_EMPTY(cclient->mci_subflow_tab)) { - (void) mac_flow_walk_nolock(cclient->mci_subflow_tab, - mac_tx_flow_srs_wakeup, ring); - } + (void) mac_flow_walk(cclient->mci_subflow_tab, + mac_tx_flow_srs_wakeup, ring); } rw_exit(&mip->mi_rw_lock); rw_exit(&i_mac_impl_lock); @@ -4107,8 +4112,13 @@ mac_fini_macaddr(mac_impl_t *mip) { mac_address_t *map = mip->mi_addresses; - /* there should be exactly one entry left on the list */ - ASSERT(map != NULL); + if (map == NULL) + return; + + /* + * If mi_addresses is initialized, there should be exactly one + * entry left on the list with no users. + */ ASSERT(map->ma_nusers == 0); ASSERT(map->ma_next == NULL); diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c index 31a9d0ed7d..eaac168aaf 100644 --- a/usr/src/uts/common/io/mac/mac_bcast.c +++ b/usr/src/uts/common/io/mac/mac_bcast.c @@ -124,14 +124,6 @@ mac_bcast_grp_free(void *bcast_grp) ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) { - /* - * The address is a multicast address, have the - * underlying NIC leave the multicast group. 
- */ - (void) mip->mi_multicst(mip->mi_driver, B_FALSE, grp->mbg_addr); - } - ASSERT(grp->mbg_addr != NULL); kmem_free(grp->mbg_addr, mip->mi_type->mt_addr_length); kmem_free(grp->mbg_clients, @@ -271,15 +263,69 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid, size_t addr_len = mip->mi_type->mt_addr_length; int rc = 0; int i, index = -1; - mac_mcast_addrs_t *mci_maddr = NULL; - mac_mcast_addrs_t *mi_maddr = NULL; - mac_mcast_addrs_t **last_maddr; + mac_mcast_addrs_t **prev_mi_addr = NULL; + mac_mcast_addrs_t **prev_mci_addr = NULL; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); ASSERT(addrtype == MAC_ADDRTYPE_MULTICAST || addrtype == MAC_ADDRTYPE_BROADCAST); + /* + * Add the MAC client to the list of MAC clients associated + * with the group. + */ + if (addrtype == MAC_ADDRTYPE_MULTICAST) { + mac_mcast_addrs_t *maddr; + + /* + * In case of a driver (say aggr), we need this information + * on a per MAC instance basis. + */ + prev_mi_addr = &mip->mi_mcast_addrs; + for (maddr = *prev_mi_addr; maddr != NULL; + prev_mi_addr = &maddr->mma_next, maddr = maddr->mma_next) { + if (bcmp(maddr->mma_addr, addr, addr_len) == 0) + break; + } + if (maddr == NULL) { + /* + * For multicast addresses, have the underlying MAC + * join the corresponding multicast group. + */ + rc = mip->mi_multicst(mip->mi_driver, B_TRUE, addr); + if (rc != 0) + return (rc); + maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t), + KM_SLEEP); + bcopy(addr, maddr->mma_addr, addr_len); + *prev_mi_addr = maddr; + } else { + prev_mi_addr = NULL; + } + maddr->mma_ref++; + + /* + * We maintain a separate list for each MAC client. Get + * the entry or add, if it is not present. 
+ */ + prev_mci_addr = &mcip->mci_mcast_addrs; + for (maddr = *prev_mci_addr; maddr != NULL; + prev_mci_addr = &maddr->mma_next, maddr = maddr->mma_next) { + if (bcmp(maddr->mma_addr, addr, addr_len) == 0) + break; + } + if (maddr == NULL) { + maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t), + KM_SLEEP); + bcopy(addr, maddr->mma_addr, addr_len); + *prev_mci_addr = maddr; + } else { + prev_mci_addr = NULL; + } + maddr->mma_ref++; + } + /* The list is protected by the perimeter */ last_grp = &mip->mi_bcast_grp; for (grp = *last_grp; grp != NULL; @@ -331,7 +377,7 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid, if (rc != 0) { kmem_free(grp->mbg_addr, addr_len); kmem_cache_free(mac_bcast_grp_cache, grp); - return (rc); + goto fail; } grp->mbg_flow_ent->fe_mbg = grp; mip->mi_bcast_ngrps++; @@ -366,23 +412,7 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid, rc = mac_flow_add(mip->mi_flow_tab, grp->mbg_flow_ent); if (rc != 0) { FLOW_FINAL_REFRELE(grp->mbg_flow_ent); - return (rc); - } - - /* - * For multicast addresses, have the underlying MAC - * join the corresponsing multicast group. - */ - if (addrtype == MAC_ADDRTYPE_MULTICAST) { - rc = mip->mi_multicst(mip->mi_driver, B_TRUE, addr); - if (rc != 0) { - mac_flow_remove(mip->mi_flow_tab, - grp->mbg_flow_ent, B_FALSE); - mac_flow_wait(grp->mbg_flow_ent, - FLOW_DRIVER_UPCALL); - FLOW_FINAL_REFRELE(grp->mbg_flow_ent); - return (rc); - } + goto fail; } *last_grp = grp; @@ -395,45 +425,6 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid, * with the group. */ rw_enter(&mip->mi_rw_lock, RW_WRITER); - if (addrtype == MAC_ADDRTYPE_MULTICAST) { - /* - * We maintain a separate list for each MAC client. Get - * the entry or add, if it is not present. 
- */ - last_maddr = &mcip->mci_mcast_addrs; - for (mci_maddr = *last_maddr; mci_maddr != NULL; - last_maddr = &mci_maddr->mma_next, - mci_maddr = mci_maddr->mma_next) { - if (bcmp(mci_maddr->mma_addr, addr, addr_len) == 0) - break; - } - if (mci_maddr == NULL) { - mci_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t), - KM_SLEEP); - bcopy(addr, mci_maddr->mma_addr, addr_len); - *last_maddr = mci_maddr; - } - mci_maddr->mma_ref++; - - /* - * In case of a driver (say aggr), we also need this - * information on a per MAC instance basis. - */ - last_maddr = &mip->mi_mcast_addrs; - for (mi_maddr = *last_maddr; mi_maddr != NULL; - last_maddr = &mi_maddr->mma_next, - mi_maddr = mi_maddr->mma_next) { - if (bcmp(mi_maddr->mma_addr, addr, addr_len) == 0) - break; - } - if (mi_maddr == NULL) { - mi_maddr = kmem_zalloc(sizeof (mac_mcast_addrs_t), - KM_SLEEP); - bcopy(addr, mi_maddr->mma_addr, addr_len); - *last_maddr = mi_maddr; - } - mi_maddr->mma_ref++; - } for (i = 0; i < grp->mbg_nclients_alloc; i++) { /* * The MAC client was already added, say when we have @@ -442,7 +433,8 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid, */ if (grp->mbg_clients[i].mgb_client == mcip) { grp->mbg_clients[i].mgb_client_ref++; - goto add_done; + rw_exit(&mip->mi_rw_lock); + return (0); } else if (grp->mbg_clients[i].mgb_client == NULL && index == -1) { index = i; @@ -478,10 +470,20 @@ mac_bcast_add(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid, * to detect that condition after re-acquiring the lock. 
*/ grp->mbg_clients_gen++; -add_done: rw_exit(&mip->mi_rw_lock); - return (0); + +fail: + if (prev_mi_addr != NULL) { + kmem_free(*prev_mi_addr, sizeof (mac_mcast_addrs_t)); + *prev_mi_addr = NULL; + (void) mip->mi_multicst(mip->mi_driver, B_FALSE, addr); + } + if (prev_mci_addr != NULL) { + kmem_free(*prev_mci_addr, sizeof (mac_mcast_addrs_t)); + *prev_mci_addr = NULL; + } + return (rc); } /* @@ -559,6 +561,8 @@ mac_bcast_delete(mac_client_impl_t *mcip, const uint8_t *addr, uint16_t vid) *prev = grp->mbg_next; } update_maddr: + rw_exit(&mip->mi_rw_lock); + if (grp->mbg_addrtype == MAC_ADDRTYPE_MULTICAST) { mprev = &mcip->mci_mcast_addrs; for (maddr = mcip->mci_mcast_addrs; maddr != NULL; @@ -583,12 +587,12 @@ update_maddr: } ASSERT(maddr != NULL); if (--maddr->mma_ref == 0) { + (void) mip->mi_multicst(mip->mi_driver, B_FALSE, addr); *mprev = maddr->mma_next; maddr->mma_next = NULL; kmem_free(maddr, sizeof (mac_mcast_addrs_t)); } } - rw_exit(&mip->mi_rw_lock); /* * If the group itself is being removed, remove the diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index 84e302ad9f..cf4a8f4421 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -1159,18 +1159,6 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, */ mcip = mac_vnic_lower(mip); - /* - * If there are multiple MAC clients of the VNIC, they - * all share the same underlying MAC client handle. - */ - if ((flags & MAC_OPEN_FLAGS_TAG_DISABLE) != 0) - mcip->mci_state_flags |= MCIS_TAG_DISABLE; - - if ((flags & MAC_OPEN_FLAGS_STRIP_DISABLE) != 0) - mcip->mci_state_flags |= MCIS_STRIP_DISABLE; - - if ((flags & MAC_OPEN_FLAGS_DISABLE_TX_VID_CHECK) != 0) - mcip->mci_state_flags |= MCIS_DISABLE_TX_VID_CHECK; /* * Note that multiple mac clients share the same mcip in @@ -1328,13 +1316,6 @@ mac_client_close(mac_client_handle_t mch, uint16_t flags) * when the VNIC is deleted. 
*/ - /* - * Clear the flags set when the upper client initiated - * open. - */ - mcip->mci_state_flags &= ~(MCIS_TAG_DISABLE | - MCIS_STRIP_DISABLE | MCIS_DISABLE_TX_VID_CHECK); - i_mac_perim_exit(mip); return; } @@ -1377,12 +1358,11 @@ mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1) ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); /* - * If the mac_client is a VLAN or native media is non ethernet, we - * should not do DLS bypass and instead let the packets go via the - * default mac_rx_deliver route so vlan header can be stripped etc. + * If the mac_client is a VLAN, we should not do DLS bypass and + * instead let the packets come up via mac_rx_deliver so the vlan + * header can be stripped. */ - if (mcip->mci_nvids > 0 || - mip->mi_info.mi_nativemedia != DL_ETHER) + if (mcip->mci_nvids > 0) return (B_FALSE); /* @@ -1606,6 +1586,37 @@ mac_client_update_mcast(void *arg, boolean_t add, const uint8_t *addrp) } } +static void +mac_update_single_active_client(mac_impl_t *mip) +{ + mac_client_impl_t *client = NULL; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + + rw_enter(&mip->mi_rw_lock, RW_WRITER); + if (mip->mi_nactiveclients == 1) { + /* + * Find the one active MAC client from the list of MAC + * clients. The active MAC client has at least one + * unicast address. + */ + for (client = mip->mi_clients_list; client != NULL; + client = client->mci_client_next) { + if (client->mci_unicast_list != NULL) + break; + } + ASSERT(client != NULL); + } + + /* + * mi_single_active_client is protected by the MAC impl's read/writer + * lock, which allows mac_rx() to check the value of that pointer + * as a reader. + */ + mip->mi_single_active_client = client; + rw_exit(&mip->mi_rw_lock); +} + /* * Add a new unicast address to the MAC client. 
* @@ -1712,11 +1723,13 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, mip->mi_state_flags |= MIS_EXCLUSIVE; bzero(&mrp, sizeof (mac_resource_props_t)); - if (is_primary && !(mcip->mci_state_flags & MCIS_IS_VNIC)) { + if (is_primary && !(mcip->mci_state_flags & (MCIS_IS_VNIC | + MCIS_IS_AGGR_PORT))) { /* * Apply the property cached in the mac_impl_t to the primary - * mac client. If the mac client is a VNIC, its property were - * already set in the mcip when the VNIC was created. + * mac client. If the mac client is a VNIC or an aggregation + * port, its property should be set in the mcip when the + * VNIC/aggr was created. */ mac_get_resources((mac_handle_t)mip, &mrp); (void) mac_client_set_resources(mch, &mrp); @@ -1781,8 +1794,13 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, goto bail; bcast_added = B_TRUE; } - flent = mcip->mci_flent; - ASSERT(flent != NULL); + + /* + * If this is the first unicast address addition for this + * client, reuse the pre-allocated larval flow entry associated with + * the MAC client. + */ + flent = (mcip->mci_nflents == 0) ? mcip->mci_flent : NULL; /* We are configuring the unicast flow now */ if (!MCIP_DATAPATH_SETUP(mcip)) { @@ -1806,6 +1824,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, mip->mi_nactiveclients++; nactiveclients_added = B_TRUE; + /* * This will allocate the RX ring group if possible for the * flow and program the software classifier as needed. @@ -1817,6 +1836,12 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, * The unicast MAC address must have been added successfully. */ ASSERT(mcip->mci_unicast != NULL); + /* + * Push down the sub-flows that were defined on this link + * hitherto. The flows are added to the active flow table + * and SRS, softrings etc. are created as needed. 
+ */ + mac_link_init_flows(mch); } else { mac_address_t *map = mcip->mci_unicast; @@ -1871,6 +1896,9 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, mcip->mci_unicast_list = muip; rw_exit(&mcip->mci_rw_lock); + if (nactiveclients_added) + mac_update_single_active_client(mip); + *mah = (mac_unicast_handle_t)muip; /* add it to the flow list of this mcip */ @@ -1906,8 +1934,11 @@ bail: if (mac_started) mac_stop(mip); - if (nactiveclients_added) + if (nactiveclients_added) { mip->mi_nactiveclients--; + mac_update_single_active_client(mip); + } + if (mcip->mci_state_flags & MCIS_EXCLUSIVE) mip->mi_state_flags &= ~MIS_EXCLUSIVE; kmem_free(muip, sizeof (mac_unicast_impl_t)); @@ -1983,9 +2014,9 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) * Remove the VID from the list of client's VIDs. */ pre = mcip->mci_unicast_list; - if (muip == pre) + if (muip == pre) { mcip->mci_unicast_list = muip->mui_next; - else { + } else { while ((pre->mui_next != NULL) && (pre->mui_next != muip)) pre = pre->mui_next; ASSERT(pre->mui_next == muip); @@ -1997,14 +2028,16 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && muip->mui_vid == 0) mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY; - /* - * This MAC client is shared, so we will just remove the flent - * corresponding to the address being removed. We don't invoke - * mac_rx_classify_flow_rem() since the additional flow is - * not associated with its own separate set of SRS and rings, - * and these constructs are still needed for the remaining flows. - */ if (!mac_client_single_rcvr(mcip)) { + /* + * This MAC client is shared by more than one unicast + * addresses, so we will just remove the flent + * corresponding to the address being removed. 
We don't invoke + * mac_rx_classify_flow_rem() since the additional flow is + * not associated with its own separate set of SRS and rings, + * and these constructs are still needed for the remaining + * flows. + */ flent = mac_client_get_flow(mcip, muip); ASSERT(flent != NULL); @@ -2037,7 +2070,20 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) return (0); } + /* + * We would have initialized subflows etc. only if we brought up + * the primary client and set the unicast unicast address etc. + * Deactivate the flows. The flow entry will be removed from the + * active flow tables, and the associated SRS, softrings etc will + * be deleted. But the flow entry itself won't be destroyed, instead + * it will continue to be archived off the the global flow hash + * list, for a possible future activation when say IP is plumbed + * again. + */ + mac_link_release_flows(mch); + mip->mi_nactiveclients--; + mac_update_single_active_client(mip); /* Tear down the Data path */ mac_datapath_teardown(mcip, mcip->mci_flent, SRST_LINK); @@ -2252,6 +2298,8 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mpip->mpi_mcip = mcip; mpip->mpi_no_tx_loop = ((flags & MAC_PROMISC_FLAGS_NO_TX_LOOP) != 0); mpip->mpi_no_phys = ((flags & MAC_PROMISC_FLAGS_NO_PHYS) != 0); + mpip->mpi_strip_vlan_tag = + ((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0); mcbi = &mip->mi_promisc_cb_info; mutex_enter(mcbi->mcbi_lockp); @@ -2503,44 +2551,65 @@ done: * mac_tx_is_blocked * * Given a cookie, it returns if the ring identified by the cookie is - * flow-controlled or not (this is not implemented yet). If NULL is - * passed in place of a cookie, then it finds out if any of the - * underlying rings belonging to the SRS is flow controlled or not - * and returns that status. + * flow-controlled or not. 
If NULL is passed in place of a cookie, + * then it finds out if any of the underlying rings belonging to the + * SRS is flow controlled or not and returns that status. */ /* ARGSUSED */ boolean_t mac_tx_is_flow_blocked(mac_client_handle_t mch, mac_tx_cookie_t cookie) { mac_client_impl_t *mcip = (mac_client_impl_t *)mch; - mac_soft_ring_set_t *mac_srs = MCIP_TX_SRS(mcip); + mac_soft_ring_set_t *mac_srs; mac_soft_ring_t *sringp; boolean_t blocked = B_FALSE; + mac_tx_percpu_t *mytx; + int err; int i; /* - * On etherstubs, there won't be a Tx SRS or an Rx - * SRS. Infact there won't even be a flow_entry. + * Bump the reference count so that mac_srs won't be deleted. + * If the client is currently quiesced and we failed to bump + * the reference, return B_TRUE so that flow control stays + * as enabled. + * + * Flow control will then be disabled once the client is no + * longer quiesced. */ - if (mac_srs == NULL) + MAC_TX_TRY_HOLD(mcip, mytx, err); + if (err != 0) + return (B_TRUE); + + if ((mac_srs = MCIP_TX_SRS(mcip)) == NULL) { + MAC_TX_RELE(mcip, mytx); return (B_FALSE); + } mutex_enter(&mac_srs->srs_lock); if (mac_srs->srs_tx.st_mode == SRS_TX_FANOUT) { - for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { - sringp = mac_srs->srs_oth_soft_rings[i]; + if (cookie != NULL) { + sringp = (mac_soft_ring_t *)cookie; mutex_enter(&sringp->s_ring_lock); - if (sringp->s_ring_state & S_RING_TX_HIWAT) { + if (sringp->s_ring_state & S_RING_TX_HIWAT) blocked = B_TRUE; + mutex_exit(&sringp->s_ring_lock); + } else { + for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { + sringp = mac_srs->srs_oth_soft_rings[i]; + mutex_enter(&sringp->s_ring_lock); + if (sringp->s_ring_state & S_RING_TX_HIWAT) { + blocked = B_TRUE; + mutex_exit(&sringp->s_ring_lock); + break; + } mutex_exit(&sringp->s_ring_lock); - break; } - mutex_exit(&sringp->s_ring_lock); } } else { blocked = (mac_srs->srs_state & SRS_TX_HIWAT); } mutex_exit(&mac_srs->srs_lock); + MAC_TX_RELE(mcip, mytx); return (blocked); } @@ 
-2846,6 +2915,10 @@ mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, return; mp_copy->b_next = NULL; + if (mpip->mpi_strip_vlan_tag) { + if ((mp_copy = mac_strip_vlan_tag_chain(mp_copy)) == NULL) + return; + } mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback); } @@ -3218,7 +3291,7 @@ i_mac_set_resources(mac_handle_t mh, mac_resource_props_t *mrp) */ bcopy(mrp, &tmrp, sizeof (mac_resource_props_t)); mcip = mac_primary_client_handle(mip); - if (mcip != NULL) { + if (mcip != NULL && (mcip->mci_state_flags & MCIS_IS_AGGR_PORT) == 0) { err = mac_client_set_resources((mac_client_handle_t)mcip, &tmrp); } diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index c93fe0ca8f..9c316911d4 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -73,16 +73,24 @@ kmem_cache_t *mac_soft_ring_cache; * The duration in msec we wait before signalling the soft ring * worker thread in case packets get queued. */ -static uint32_t mac_soft_ring_worker_wait = 0; +uint32_t mac_soft_ring_worker_wait = 0; + +/* + * A global tunable for turning polling on/off. By default, dynamic + * polling is always on and is always very beneficial. It should be + * turned off with absolute care and for the rare workload (very + * low latency sensitive traffic). + */ +int mac_poll_enable = B_TRUE; /* * Need to set mac_soft_ring_max_q_cnt based on bandwidth and perhaps latency. * Large values could end up in consuming lot of system memory and cause * system hang. 
*/ -static int mac_soft_ring_max_q_cnt = 1024; -static int mac_soft_ring_min_q_cnt = 256; -static int mac_soft_ring_poll_thres = 16; +int mac_soft_ring_max_q_cnt = 1024; +int mac_soft_ring_min_q_cnt = 256; +int mac_soft_ring_poll_thres = 16; /* * Default value of number of TX rings to be assigned to a MAC client. @@ -91,8 +99,8 @@ static int mac_soft_ring_poll_thres = 16; * If no TX rings are available, then MAC client(s) will be assigned the * default Tx ring. Default Tx ring can be shared among multiple MAC clients. */ -static uint32_t mac_tx_ring_count = 8; -static boolean_t mac_tx_serialize = B_FALSE; +uint32_t mac_tx_ring_count = 8; +boolean_t mac_tx_serialize = B_FALSE; /* * mac_tx_srs_hiwat is the queue depth threshold at which callers of @@ -105,8 +113,8 @@ static boolean_t mac_tx_serialize = B_FALSE; * Note that mac_tx_srs_hiwat is always be lesser than * mac_tx_srs_max_q_cnt. */ -static uint32_t mac_tx_srs_max_q_cnt = 100000; -static uint32_t mac_tx_srs_hiwat = 1000; +uint32_t mac_tx_srs_max_q_cnt = 100000; +uint32_t mac_tx_srs_hiwat = 1000; /* * mac_rx_soft_ring_count, mac_soft_ring_10gig_count: @@ -131,8 +139,8 @@ static uint32_t mac_tx_srs_hiwat = 1000; * rings is based on specified bandwidth, CPU speed and number of CPUs in * the system. */ -static uint_t mac_rx_soft_ring_count = 8; -static uint_t mac_rx_soft_ring_10gig_count = 8; +uint_t mac_rx_soft_ring_count = 8; +uint_t mac_rx_soft_ring_10gig_count = 8; /* * Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added @@ -146,18 +154,12 @@ static krwlock_t mac_srs_g_lock; /* * Whether the SRS threads should be bound, or not. */ -static boolean_t mac_srs_thread_bind = B_TRUE; +boolean_t mac_srs_thread_bind = B_TRUE; /* * CPU to fallback to, used by mac_next_bind_cpu(). */ -static processorid_t srs_bind_cpu = 0; - -/* - * Possible setting for soft_ring_process_flag is - * 0 or ST_RING_WORKER_ONLY. 
- */ -static int soft_ring_process_flag = ST_RING_WORKER_ONLY; +processorid_t srs_bind_cpu = 0; /* * If cpu bindings are specified by user, then Tx SRS and its soft @@ -503,7 +505,7 @@ mac_srs_poll_state_change(mac_soft_ring_set_t *mac_srs, (ring->mr_classify_type == MAC_HW_CLASSIFIER)) { if (turn_off_poll_capab) mac_srs->srs_state &= ~SRS_POLLING_CAPAB; - else + else if (mac_poll_enable) mac_srs->srs_state |= SRS_POLLING_CAPAB; } srs_rx->sr_lower_proc = rx_func; @@ -1498,7 +1500,7 @@ mac_srs_fanout_modify(mac_client_impl_t *mcip, flow_entry_t *flent, mac_soft_ring_set_t *mac_tx_srs) { mac_soft_ring_t *softring; - uint32_t soft_ring_flag = soft_ring_process_flag; + uint32_t soft_ring_flag = 0; processorid_t cpuid = -1; boolean_t user_specified; int i, srings_present, new_fanout_cnt; @@ -1606,7 +1608,7 @@ mac_srs_fanout_init(mac_client_impl_t *mcip, flow_entry_t *flent, { int i; processorid_t cpuid, worker_cpuid, poll_cpuid; - uint32_t soft_ring_flag = soft_ring_process_flag; + uint32_t soft_ring_flag = 0; int soft_ring_cnt; boolean_t user_specified = B_FALSE; mac_cpus_t *srs_cpu = &mac_rx_srs->srs_cpu; @@ -1917,7 +1919,8 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, (srs_rx->sr_lowat >> 1) ? 
mac_soft_ring_poll_thres : (srs_rx->sr_lowat >> 1); if (mac_latency_optimize) - mac_srs->srs_state |= SRS_LATENCY_OPT; + mac_srs->srs_state |= + (SRS_LATENCY_OPT|SRS_SOFTRING_QUEUE); } mac_srs->srs_worker = thread_create(NULL, 0, @@ -1956,12 +1959,21 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, ring->mr_classify_type = MAC_HW_CLASSIFIER; ring->mr_flag |= MR_INCIPIENT; - if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab)) + if (FLOW_TAB_EMPTY(mcip->mci_subflow_tab) && mac_poll_enable) mac_srs->srs_state |= SRS_POLLING_CAPAB; mac_srs->srs_poll_thr = thread_create(NULL, 0, mac_rx_srs_poll_ring, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri); + /* + * Some drivers require serialization and don't send + * packet chains in interrupt context. For such + * drivers, we should always queue in soft ring + * so that we get a chance to switch into a polling + * mode under backlog. + */ + if (mcip->mci_mip->mi_v12n_level & MAC_VIRT_SERIALIZE) + mac_srs->srs_state |= SRS_SOFTRING_QUEUE; } return (mac_srs); } @@ -2131,10 +2143,6 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type, mac_rx_deliver, mcip, NULL, ring); - if (mip->mi_v12n_level & MAC_VIRT_SERIALIZE) { - mac_srs->srs_rx.sr_enqueue_always = - B_TRUE; - } break; default: cmn_err(CE_PANIC, "srs_setup: mcip = %p " @@ -2706,6 +2714,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, mac_srs_group_setup(grp_only_mcip, grp_only_mcip->mci_flent, default_group, SRST_LINK); + mac_rx_group_unmark(default_group, MR_INCIPIENT); } } } @@ -3173,7 +3182,7 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent, { mac_impl_t *mip = mcip->mci_mip; mac_soft_ring_set_t *tx_srs; - int i, tx_ring_count = 0, tx_rings_reserved; + int i, tx_ring_count = 0, tx_rings_reserved = 0; mac_ring_handle_t *tx_ring = NULL; uint32_t soft_ring_type; mac_group_t *grp = NULL; diff --git a/usr/src/uts/common/io/mac/mac_flow.c 
b/usr/src/uts/common/io/mac/mac_flow.c index 6dc3a8a7b4..cb6560b1f7 100644 --- a/usr/src/uts/common/io/mac/mac_flow.c +++ b/usr/src/uts/common/io/mac/mac_flow.c @@ -479,8 +479,8 @@ mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp) int i, err; s.fs_flags = flags; - s.fs_mp = mp; retry: + s.fs_mp = mp; /* * Walk the list of predeclared accept functions. @@ -489,6 +489,8 @@ retry: */ for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) { if ((err = (ops->fo_accept[i])(ft, &s)) != 0) { + mblk_t *last; + /* * ENOBUFS indicates that the mp could be too short * and may need a pullup. @@ -497,11 +499,13 @@ retry: return (err); /* - * Don't modify the mblk if there are references to it. - * Also, there is no point pulling up if b_cont is NULL. + * The pullup is done on the last processed mblk, not + * the starting one. pullup is not done if the mblk + * has references or if b_cont is NULL. */ - if (DB_REF(mp) > 1 || mp->b_cont == NULL || - pullupmsg(mp, -1) == 0) + last = s.fs_mp; + if (DB_REF(last) > 1 || last->b_cont == NULL || + pullupmsg(last, -1) == 0) return (EINVAL); retried = B_TRUE; @@ -1209,10 +1213,11 @@ mac_link_flow_add(datalink_id_t linkid, char *flow_name, /* * Add the subflow to the subflow table. Also instantiate the flow - * in the mac if there is an active DLS user. The dl_mah is set when - * dls_active_set() is called, typically during interface plumb. + * in the mac if there is an active user (we check if the MAC client's + * datapath has been setup). 
*/ - err = mac_flow_add_subflow(dlp->dl_mch, flent, dlp->dl_mah != NULL); + err = mac_flow_add_subflow(dlp->dl_mch, flent, + MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch)); if (err != 0) goto bail; @@ -1514,6 +1519,17 @@ mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo) #define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end)) +#define CHECK_AND_ADJUST_START_PTR(s, start) { \ + if ((s)->fs_mp->b_wptr == (start)) { \ + mblk_t *next = (s)->fs_mp->b_cont; \ + if (next == NULL) \ + return (EINVAL); \ + \ + (s)->fs_mp = next; \ + (start) = next->b_rptr; \ + } \ +} + /* ARGSUSED */ static boolean_t flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s) @@ -1830,7 +1846,14 @@ flow_ip_accept(flow_tab_t *ft, flow_state_t *s) uint16_t sap = l2info->l2_sap; uchar_t *l3_start; - l3info->l3_start = l3_start = l2info->l2_start + l2info->l2_hdrsize; + l3_start = l2info->l2_start + l2info->l2_hdrsize; + + /* + * Adjust start pointer if we're at the end of an mblk. + */ + CHECK_AND_ADJUST_START_PTR(s, l3_start); + + l3info->l3_start = l3_start; if (!OK_32PTR(l3_start)) return (EINVAL); @@ -2193,7 +2216,14 @@ flow_transport_accept(flow_tab_t *ft, flow_state_t *s) uint8_t proto = l3info->l3_protocol; uchar_t *l4_start; - l4info->l4_start = l4_start = l3info->l3_start + l3info->l3_hdrsize; + l4_start = l3info->l3_start + l3info->l3_hdrsize; + + /* + * Adjust start pointer if we're at the end of an mblk. + */ + CHECK_AND_ADJUST_START_PTR(s, l4_start); + + l4info->l4_start = l4_start; if (!OK_32PTR(l4_start)) return (EINVAL); diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index 714fb79afb..4d9d590457 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -668,6 +668,24 @@ mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) return; } /* We'll fall through to software classification */ + } else { + flow_entry_t *flent; + int err; + + rw_enter(&mip->mi_rw_lock, RW_READER); + if (mip->mi_single_active_client != NULL) { + flent = mip->mi_single_active_client->mci_flent_list; + FLOW_TRY_REFHOLD(flent, err); + rw_exit(&mip->mi_rw_lock); + if (err == 0) { + (flent->fe_cb_fn)(flent->fe_cb_arg1, + flent->fe_cb_arg2, mp_chain, B_FALSE); + FLOW_REFRELE(flent); + return; + } + } else { + rw_exit(&mip->mi_rw_lock); + } } if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) { diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index 290366f5d2..927e3842d3 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -515,25 +515,27 @@ static void mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) { struct ether_header *ehp; - uint16_t etype; + struct ether_vlan_header *evhp; + uint32_t sap; ipha_t *ipha; - mac_soft_ring_t *softring; - size_t ether_hlen; + uint8_t *dstaddr; + size_t hdrsize; mblk_t *mp; mblk_t *headmp[MAX_SR_TYPES]; mblk_t *tailmp[MAX_SR_TYPES]; int cnt[MAX_SR_TYPES]; size_t sz[MAX_SR_TYPES]; size_t sz1; - boolean_t bw_ctl = B_FALSE; + boolean_t bw_ctl; boolean_t hw_classified; - boolean_t dls_bypass = B_TRUE; - enum pkt_type type; + boolean_t dls_bypass; + boolean_t is_ether; + boolean_t is_unicast; + enum pkt_type type; mac_client_impl_t *mcip = mac_srs->srs_mcip; - struct ether_vlan_header *evhp; - if (mac_srs->srs_type & SRST_BW_CONTROL) - bw_ctl = B_TRUE; + is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); + bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); /* * If we don't have a Rx ring, S/W classification would have done @@ -550,8 +552,7 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) * processing in the Rx path. SRST_DLS_BYPASS will be clear for * such SRSs. */ - if (!(mac_srs->srs_type & SRST_DLS_BYPASS)) - dls_bypass = B_FALSE; + dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0); bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); @@ -570,68 +571,62 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mp->b_next = NULL; type = OTH; - sz1 = msgdsize(mp); - - if (!dls_bypass) { - mac_impl_t *mip = mcip->mci_mip; + sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); + if (is_ether) { + /* + * At this point we can be sure the packet at least + * has an ether header. + */ + if (sz1 < sizeof (struct ether_header)) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } ehp = (struct ether_header *)mp->b_rptr; /* - * For VLAN packets, if the VLAN id doesn't belong - * to this client, we drop the packet. 
+ * Determine if this is a VLAN or non-VLAN packet. */ - if (mip->mi_info.mi_nativemedia == DL_ETHER && - ntohs(ehp->ether_type) == VLAN_TPID) { + if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { + evhp = (struct ether_vlan_header *)mp->b_rptr; + sap = ntohs(evhp->ether_type); + hdrsize = sizeof (struct ether_vlan_header); /* - * LINTED: cast may result in improper - * alignment + * Check if the VID of the packet, if any, + * belongs to this client. */ - evhp = (struct ether_vlan_header *)ehp; if (!mac_client_check_flow_vid(mcip, VLAN_ID(ntohs(evhp->ether_tci)))) { mac_rx_drop_pkt(mac_srs, mp); continue; } + } else { + hdrsize = sizeof (struct ether_header); } - FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], - cnt[type], bw_ctl, sz[type], sz1, mp); - continue; - } - - /* - * At this point we can be sure the packet at least - * has an ether header. - */ - if (sz1 < sizeof (struct ether_header)) { - mac_rx_drop_pkt(mac_srs, mp); - continue; - } - /* LINTED: cast may result in improper alignment */ - ehp = (struct ether_header *)mp->b_rptr; + is_unicast = + ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); + dstaddr = (uint8_t *)&ehp->ether_dhost; + } else { + mac_header_info_t mhi; - /* - * Determine if this is a VLAN or non-VLAN packet. - */ - if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) { - /* LINTED: cast may result in improper alignment */ - evhp = (struct ether_vlan_header *)mp->b_rptr; - etype = ntohs(evhp->ether_type); - ether_hlen = sizeof (struct ether_vlan_header); - /* - * Check if the VID of the packet, if any, belongs - * to this client. 
- */ - if (!mac_client_check_flow_vid(mcip, - VLAN_ID(ntohs(evhp->ether_tci)))) { + if (mac_header_info((mac_handle_t)mcip->mci_mip, + mp, &mhi) != 0) { mac_rx_drop_pkt(mac_srs, mp); continue; } - } else { - ether_hlen = sizeof (struct ether_header); + hdrsize = mhi.mhi_hdrsize; + sap = mhi.mhi_bindsap; + is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); + dstaddr = (uint8_t *)mhi.mhi_daddr; } - if (etype == ETHERTYPE_IP) { + if (!dls_bypass) { + FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], + cnt[type], bw_ctl, sz[type], sz1, mp); + continue; + } + + if (sap == ETHERTYPE_IP) { /* * If we are H/W classified, but we have promisc * on, then we need to check for the unicast address. @@ -641,12 +636,11 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) rw_enter(&mcip->mci_rw_lock, RW_READER); map = mcip->mci_unicast; - if (bcmp(&ehp->ether_dhost, map->ma_addr, + if (bcmp(dstaddr, map->ma_addr, map->ma_len) == 0) type = UNDEF; rw_exit(&mcip->mci_rw_lock); - } else if (((((uint8_t *)&ehp->ether_dhost)[0] & - 0x01) == 0)) { + } else if (is_unicast) { type = UNDEF; } } @@ -665,8 +659,7 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) * the 'OTH' type path without DLS bypass. 
*/ - /* LINTED: cast may result in improper alignment */ - ipha = (ipha_t *)(mp->b_rptr + ether_hlen); + ipha = (ipha_t *)(mp->b_rptr + hdrsize); if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) type = OTH; @@ -686,25 +679,25 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) switch (ipha->ipha_protocol) { case IPPROTO_TCP: type = V4_TCP; - mp->b_rptr += ether_hlen; + mp->b_rptr += hdrsize; break; case IPPROTO_UDP: type = V4_UDP; - mp->b_rptr += ether_hlen; + mp->b_rptr += hdrsize; break; default: type = OTH; break; } - ASSERT(type != UNDEF); - FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], bw_ctl, sz[type], sz1, mp); } for (type = V4_TCP; type < UNDEF; type++) { if (headmp[type] != NULL) { + mac_soft_ring_t *softring; + ASSERT(tailmp[type]->b_next == NULL); switch (type) { case V4_TCP: @@ -716,7 +709,7 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) case OTH: softring = mac_srs->srs_oth_soft_rings[0]; } - mac_rx_soft_ring_process(mac_srs->srs_mcip, softring, + mac_rx_soft_ring_process(mcip, softring, headmp[type], tailmp[type], cnt[type], sz[type]); } } @@ -731,7 +724,7 @@ int fanout_unalligned = 0; */ static int mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, - uint16_t etype, enum pkt_type *type, uint_t *indx) + uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) { ip6_t *ip6h; uint8_t *whereptr; @@ -740,18 +733,18 @@ mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, uint8_t nexthdr; uint16_t hdr_len; - if (etype == ETHERTYPE_IPV6) { + if (sap == ETHERTYPE_IPV6) { boolean_t modifiable = B_TRUE; - ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); + ASSERT(MBLKL(mp) >= hdrsize); - ip6h = (ip6_t *)(mp->b_rptr + sizeof (struct ether_header)); + ip6h = (ip6_t *)(mp->b_rptr + hdrsize); if ((unsigned char *)ip6h == mp->b_wptr) { /* - * The first mblk_t only includes the ethernet header. + * The first mblk_t only includes the mac header. 
* Note that it is safe to change the mp pointer here, * as the subsequent operation does not assume mp - * points to the start of the ethernet header. + * points to the start of the mac header. */ mp = mp->b_cont; @@ -900,32 +893,32 @@ static void mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) { struct ether_header *ehp; - uint16_t etype; + struct ether_vlan_header *evhp; + uint32_t sap; ipha_t *ipha; + uint8_t *dstaddr; uint_t indx; - int ports_offset = -1; - int ipha_len; + size_t ports_offset; + size_t ipha_len; + size_t hdrsize; uint_t hash; - mac_soft_ring_t *softring; - size_t ether_hlen; - uint16_t frag_offset_flags; mblk_t *mp; mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT]; mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT]; int cnt[MAX_SR_TYPES][MAX_SR_FANOUT]; size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT]; size_t sz1; - boolean_t bw_ctl = B_FALSE; + boolean_t bw_ctl; boolean_t hw_classified; - boolean_t dls_bypass = B_TRUE; - int i; + boolean_t dls_bypass; + boolean_t is_ether; + boolean_t is_unicast; int fanout_cnt; - enum pkt_type type; + enum pkt_type type; mac_client_impl_t *mcip = mac_srs->srs_mcip; - struct ether_vlan_header *evhp; - if (mac_srs->srs_type & SRST_BW_CONTROL) - bw_ctl = B_TRUE; + is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); + bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); /* * If we don't have a Rx ring, S/W classification would have done @@ -942,8 +935,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) * processing in the Rx path. SRST_DLS_BYPASS will be clear for * such SRSs. */ - if (!(mac_srs->srs_type & SRST_DLS_BYPASS)) - dls_bypass = B_FALSE; + dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0); /* * Since the softrings are never destroyed and we always @@ -972,77 +964,66 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mp->b_next = NULL; type = OTH; - sz1 = msgdsize(mp); + sz1 = (mp->b_cont == NULL) ? 
MBLKL(mp) : msgdsize(mp); - if (!dls_bypass) { - mac_impl_t *mip = mcip->mci_mip; + if (is_ether) { + /* + * At this point we can be sure the packet at least + * has an ether header. + */ + if (sz1 < sizeof (struct ether_header)) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + ehp = (struct ether_header *)mp->b_rptr; - indx = 0; - if (mip->mi_info.mi_nativemedia == DL_ETHER) { - ehp = (struct ether_header *)mp->b_rptr; - etype = ntohs(ehp->ether_type); + /* + * Determine if this is a VLAN or non-VLAN packet. + */ + if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { + evhp = (struct ether_vlan_header *)mp->b_rptr; + sap = ntohs(evhp->ether_type); + hdrsize = sizeof (struct ether_vlan_header); /* - * For VLAN packets, if the VLAN id doesn't - * belong to this client, we drop the packet. + * Check if the VID of the packet, if any, + * belongs to this client. */ - if (etype == VLAN_TPID) { - /* - * LINTED: cast may result in improper - * alignment - */ - evhp = (struct ether_vlan_header *) - mp->b_rptr; - if (!mac_client_check_flow_vid(mcip, - VLAN_ID(ntohs(evhp->ether_tci)))) { - mac_rx_drop_pkt(mac_srs, mp); - continue; - } - } - if (mac_rx_srs_long_fanout(mac_srs, mp, etype, - &type, &indx) == -1) { + if (!mac_client_check_flow_vid(mcip, + VLAN_ID(ntohs(evhp->ether_tci)))) { mac_rx_drop_pkt(mac_srs, mp); continue; } + } else { + hdrsize = sizeof (struct ether_header); } + is_unicast = + ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); + dstaddr = (uint8_t *)&ehp->ether_dhost; + } else { + mac_header_info_t mhi; - FANOUT_ENQUEUE_MP(headmp[type][indx], - tailmp[type][indx], cnt[type][indx], bw_ctl, - sz[type][indx], sz1, mp); - continue; - } - - /* - * At this point we can be sure the packet at least - * has an ether header. On the outbound side, GLD/stack - * ensure this. On the inbound side, the driver needs - * to ensure this. 
- */ - if (sz1 < sizeof (struct ether_header)) { - mac_rx_drop_pkt(mac_srs, mp); - continue; + if (mac_header_info((mac_handle_t)mcip->mci_mip, + mp, &mhi) != 0) { + mac_rx_drop_pkt(mac_srs, mp); + continue; + } + hdrsize = mhi.mhi_hdrsize; + sap = mhi.mhi_bindsap; + is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); + dstaddr = (uint8_t *)mhi.mhi_daddr; } - /* LINTED: cast may result in improper alignment */ - ehp = (struct ether_header *)mp->b_rptr; - /* - * Determine if this is a VLAN or non-VLAN packet. - */ - if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) { - /* LINTED: cast may result in improper alignment */ - evhp = (struct ether_vlan_header *)mp->b_rptr; - etype = ntohs(evhp->ether_type); - ether_hlen = sizeof (struct ether_vlan_header); - /* - * Check if the VID of the packet, if any, belongs - * to this client. - */ - if (!mac_client_check_flow_vid(mcip, - VLAN_ID(ntohs(evhp->ether_tci)))) { + if (!dls_bypass) { + if (mac_rx_srs_long_fanout(mac_srs, mp, sap, + hdrsize, &type, &indx) == -1) { mac_rx_drop_pkt(mac_srs, mp); continue; } - } else { - ether_hlen = sizeof (struct ether_header); + + FANOUT_ENQUEUE_MP(headmp[type][indx], + tailmp[type][indx], cnt[type][indx], bw_ctl, + sz[type][indx], sz1, mp); + continue; } @@ -1051,7 +1032,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) * classification has not happened, we need to verify if * this unicast packet really belongs to us. */ - if (etype == ETHERTYPE_IP) { + if (sap == ETHERTYPE_IP) { /* * If we are H/W classified, but we have promisc * on, then we need to check for the unicast address. 
@@ -1061,12 +1042,11 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) rw_enter(&mcip->mci_rw_lock, RW_READER); map = mcip->mci_unicast; - if (bcmp(&ehp->ether_dhost, map->ma_addr, + if (bcmp(dstaddr, map->ma_addr, map->ma_len) == 0) type = UNDEF; rw_exit(&mcip->mci_rw_lock); - } else if (((((uint8_t *)&ehp->ether_dhost)[0] & - 0x01) == 0)) { + } else if (is_unicast) { type = UNDEF; } } @@ -1076,14 +1056,15 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) * the fast path. */ - /* LINTED: cast may result in improper alignment */ - ipha = (ipha_t *)(mp->b_rptr + ether_hlen); + ipha = (ipha_t *)(mp->b_rptr + hdrsize); if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) { type = OTH; fanout_oth1++; } if (type != OTH) { + uint16_t frag_offset_flags; + switch (ipha->ipha_protocol) { case IPPROTO_TCP: case IPPROTO_UDP: @@ -1103,7 +1084,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) fanout_oth3++; break; } - ports_offset = ether_hlen + ipha_len; + ports_offset = hdrsize + ipha_len; break; default: type = OTH; @@ -1113,8 +1094,8 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) } if (type == OTH) { - if (mac_rx_srs_long_fanout(mac_srs, mp, etype, - &type, &indx) == -1) { + if (mac_rx_srs_long_fanout(mac_srs, mp, sap, + hdrsize, &type, &indx) == -1) { mac_rx_drop_pkt(mac_srs, mp); continue; } @@ -1146,7 +1127,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) *(uint32_t *)(mp->b_rptr + ports_offset)); indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); type = V4_TCP; - mp->b_rptr += ether_hlen; + mp->b_rptr += hdrsize; break; case IPPROTO_UDP: case IPPROTO_SCTP: @@ -1162,19 +1143,24 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mac_srs->srs_ind++; } type = V4_UDP; - mp->b_rptr += ether_hlen; + mp->b_rptr += hdrsize; break; + default: + indx = 0; + type = OTH; } - ASSERT(type != UNDEF); - FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], cnt[type][indx], 
bw_ctl, sz[type][indx], sz1, mp); } for (type = V4_TCP; type < UNDEF; type++) { + int i; + for (i = 0; i < fanout_cnt; i++) { if (headmp[type][i] != NULL) { + mac_soft_ring_t *softring; + ASSERT(tailmp[type][i]->b_next == NULL); switch (type) { case V4_TCP: @@ -1190,7 +1176,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mac_srs->srs_oth_soft_rings[i]; break; } - mac_rx_soft_ring_process(mac_srs->srs_mcip, + mac_rx_soft_ring_process(mcip, softring, headmp[type][i], tailmp[type][i], cnt[type][i], sz[type][i]); } @@ -1373,46 +1359,39 @@ check_again: (mac_srs->srs_first != NULL)) { /* * We have packets to process and worker thread - * is not running. Check to see if poll thread is - * allowed to process. Let it do processing only if it - * picked up some packets from the NIC otherwise - * wakeup the worker thread. + * is not running. Check to see if poll thread is + * allowed to process. */ - if ((mac_srs->srs_state & SRS_LATENCY_OPT) && - (head != NULL)) { + if (mac_srs->srs_state & SRS_LATENCY_OPT) { mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) { srs_rx->sr_poll_again++; goto check_again; - } else { - /* - * We are already above low water mark - * so stay in the polling mode but no - * need to poll. Once we dip below - * the polling threshold, the processing - * thread (soft ring) will signal us - * to poll again (MAC_UPDATE_SRS_COUNT) - */ - srs_rx->sr_poll_drain_no_poll++; - mac_srs->srs_state &= - ~(SRS_PROC|SRS_GET_PKTS); - /* - * In B/W control case, its possible - * that the backlog built up due to - * B/W limit being reached and packets - * are queued only in SRS. In this case, - * we should schedule worker thread - * since no one else will wake us up. 
- */ - if ((mac_srs->srs_type & - SRST_BW_CONTROL) && - (mac_srs->srs_tid == NULL)) { - mac_srs->srs_tid = - timeout(mac_srs_fire, - mac_srs, 1); - srs_rx->sr_poll_worker_wakeup++; - } + } + /* + * We are already above low water mark + * so stay in the polling mode but no + * need to poll. Once we dip below + * the polling threshold, the processing + * thread (soft ring) will signal us + * to poll again (MAC_UPDATE_SRS_COUNT) + */ + srs_rx->sr_poll_drain_no_poll++; + mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); + /* + * In B/W control case, its possible + * that the backlog built up due to + * B/W limit being reached and packets + * are queued only in SRS. In this case, + * we should schedule worker thread + * since no one else will wake us up. + */ + if ((mac_srs->srs_type & SRST_BW_CONTROL) && + (mac_srs->srs_tid == NULL)) { + mac_srs->srs_tid = + timeout(mac_srs_fire, mac_srs, 1); + srs_rx->sr_poll_worker_wakeup++; } } else { /* @@ -1598,7 +1577,7 @@ mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL)); -again: + /* If we are blanked i.e. can't do upcalls, then we are done */ if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || @@ -1609,6 +1588,26 @@ again: if (mac_srs->srs_first == NULL) goto out; + if (!(mac_srs->srs_state & SRS_LATENCY_OPT) && + (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) { + /* + * In the normal case, the SRS worker thread does no + * work and we wait for a backlog to build up before + * we switch into polling mode. In case we are + * optimizing for throughput, we use the worker thread + * as well. The goal is to let worker thread process + * the queue and poll thread to feed packets into + * the queue. As such, we should signal the poll + * thread to try and get more packets. 
+ * + * We could have pulled this check in the POLL_RING + * macro itself but keeping it explicit here makes + * the architecture more human understandable. + */ + MAC_SRS_POLL_RING(mac_srs); + } + +again: head = mac_srs->srs_first; mac_srs->srs_first = NULL; tail = mac_srs->srs_last; @@ -1624,10 +1623,7 @@ again: mac_srs->srs_state |= (SRS_PROC|proc_type); - /* Switch to polling mode */ - MAC_SRS_WORKER_POLLING_ON(mac_srs); - if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) - MAC_SRS_POLL_RING(mac_srs); + /* * mcip is NULL for broadcast and multicast flows. The promisc * callbacks for broadcast and multicast packets are delivered from @@ -1696,37 +1692,27 @@ again: mutex_enter(&mac_srs->srs_lock); } - /* - * Send the poll thread to pick up any packets arrived - * so far. This also serves as the last check in case - * nothing else is queued in the SRS. The poll thread - * is signalled only in the case the drain was done - * by the worker thread and SRS_WORKER is set. The - * worker thread can run in parallel as long as the - * SRS_WORKER flag is set. We we have nothing else to - * process, we can exit while leaving SRS_PROC set - * which gives the poll thread control to process and - * cleanup once it returns from the NIC. - * - * If we have nothing else to process, we need to - * ensure that we keep holding the srs_lock till - * all the checks below are done and control is - * handed to the poll thread if it was running. - */ - if (mac_srs->srs_first != NULL) { - if (proc_type == SRS_WORKER) { - if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) - MAC_SRS_POLL_RING(mac_srs); + if (!(mac_srs->srs_state & (SRS_LATENCY_OPT|SRS_BLANK|SRS_PAUSE))) { + /* + * In case we are optimizing for throughput, we + * should try and keep the worker thread running + * as much as possible. Send the poll thread down + * to check one more time if something else + * arrived. In the meanwhile, if poll thread had + * collected something due to earlier signal, + * process it now. 
+ */ + if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) { + srs_rx->sr_drain_poll_sig++; + MAC_SRS_POLL_RING(mac_srs); + } + if (mac_srs->srs_first != NULL) { srs_rx->sr_drain_again++; goto again; - } else { - srs_rx->sr_drain_worker_sig++; - cv_signal(&mac_srs->srs_async); } } out: - if (mac_srs->srs_state & SRS_GET_PKTS) { /* * Poll thread is already running. Leave the @@ -1885,12 +1871,6 @@ again: mutex_exit(&mac_srs->srs_bw->mac_bw_lock); } - /* - * We can continue processing the queue. - * We need to figure out if there is a fanout needed or - * we can just process this here. - */ - if ((tid = mac_srs->srs_tid) != 0) mac_srs->srs_tid = 0; @@ -2405,8 +2385,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, * optimizing for latency, we should signal the * worker thread. */ - if (loopback || ((count > 1) && - !(mac_srs->srs_state & SRS_LATENCY_OPT))) { + if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) { /* * For loopback, We need to let the worker take * over as we don't want to continue in the same @@ -2502,6 +2481,12 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, mblk_t *tail; boolean_t wakeup_worker = B_TRUE; + /* + * Ignore fanout hint if we don't have multiple tx rings. + */ + if (!TX_MULTI_RING_MODE(mac_srs)) + fanout_hint = 0; + if (mac_srs->srs_first != NULL) wakeup_worker = B_FALSE; MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); @@ -2753,18 +2738,89 @@ mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, * the soft ring associated with that Tx ring. The srs itself will not * queue any packets. 
*/ + +#define MAC_TX_SOFT_RING_PROCESS(chain) { \ + index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \ + softring = mac_srs->srs_oth_soft_rings[index]; \ + cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ + DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ +} + static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) { mac_soft_ring_t *softring; - uint_t indx, hash; + uint64_t hash; + uint_t index; + mac_tx_cookie_t cookie = NULL; ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); - hash = HASH_HINT(fanout_hint); - indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); - softring = mac_srs->srs_oth_soft_rings[indx]; - return (mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp)); + if (fanout_hint != 0) { + /* + * The hint is specified by the caller, simply pass the + * whole chain to the soft ring. + */ + hash = HASH_HINT(fanout_hint); + MAC_TX_SOFT_RING_PROCESS(mp_chain); + } else { + mblk_t *last_mp, *cur_mp, *sub_chain; + uint64_t last_hash = 0; + uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; + + /* + * Compute the hash from the contents (headers) of the + * packets of the mblk chain. Split the chains into + * subchains of the same conversation. + * + * Since there may be more than one ring used for + * sub-chains of the same call, and since the caller + * does not maintain per conversation state since it + * passed a zero hint, unsent subchains will be + * dropped. + */ + + flag |= MAC_DROP_ON_NO_DESC; + ret_mp = NULL; + + ASSERT(ret_mp == NULL); + + sub_chain = NULL; + last_mp = NULL; + + for (cur_mp = mp_chain; cur_mp != NULL; + cur_mp = cur_mp->b_next) { + hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, + B_TRUE); + if (last_hash != 0 && hash != last_hash) { + /* + * Starting a different subchain, send current + * chain out. 
+ */ + ASSERT(last_mp != NULL); + last_mp->b_next = NULL; + MAC_TX_SOFT_RING_PROCESS(sub_chain); + sub_chain = NULL; + } + + /* add packet to subchain */ + if (sub_chain == NULL) + sub_chain = cur_mp; + last_mp = cur_mp; + last_hash = hash; + } + + if (sub_chain != NULL) { + /* send last subchain */ + ASSERT(last_mp != NULL); + last_mp->b_next = NULL; + MAC_TX_SOFT_RING_PROCESS(sub_chain); + } + + cookie = NULL; + } + + return (cookie); } /* @@ -2788,8 +2844,17 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); mutex_enter(&mac_srs->srs_lock); if (mac_srs->srs_bw->mac_bw_limit == 0) { - /* zero bandwidth: drop all */ - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + /* + * zero bandwidth, no traffic is sent: drop the packets, + * or return the whole chain if the caller requests all + * unsent packets back. + */ + if (flag & MAC_TX_NO_ENQUEUE) { + cookie = (mac_tx_cookie_t)mac_srs; + *ret_mp = mp_chain; + } else { + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + } mutex_exit(&mac_srs->srs_lock); return (cookie); } else if ((mac_srs->srs_first != NULL) || @@ -3223,9 +3288,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, DTRACE_PROBE3(slowpath, mac_client_impl_t *, src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); - if (mip->mi_promisc_list != NULL) - mac_promisc_dispatch(mip, mp_chain, src_mcip); - mp = mp_chain; while (mp != NULL) { flow_entry_t *dst_flow_ent; @@ -3241,6 +3303,12 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, CHECK_VID_AND_ADD_TAG(mp); /* + * Check if there are promiscuous mode callbacks defined. + */ + if (mip->mi_promisc_list != NULL) + mac_promisc_dispatch(mip, mp, src_mcip); + + /* * Find the destination. 
*/ dst_flow_ent = mac_tx_classify(mip, mp); @@ -3516,9 +3584,8 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, mutex_enter(&ringp->s_ring_lock); ringp->s_ring_total_inpkt += cnt; - if ((ringp->s_ring_type & ST_RING_ANY) || - ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && - !mac_srs->srs_rx.sr_enqueue_always)) { + if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && + !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) { /* If on processor or blanking on, then enqueue and return */ if (ringp->s_ring_state & S_RING_BLANK || ringp->s_ring_state & S_RING_PROC) { @@ -3526,7 +3593,6 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, mutex_exit(&ringp->s_ring_lock); return; } - proc = ringp->s_ring_rx_func; arg1 = ringp->s_ring_rx_arg1; arg2 = ringp->s_ring_rx_arg2; diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c index b216e23ff9..a9816e045e 100644 --- a/usr/src/uts/common/io/mac/mac_soft_ring.c +++ b/usr/src/uts/common/io/mac/mac_soft_ring.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -207,6 +207,8 @@ mac_soft_ring_create(int id, clock_t wait, void *flent, uint16_t type, ringp->s_ring_rx_func = rx_func; ringp->s_ring_rx_arg1 = x_arg1; ringp->s_ring_rx_arg2 = x_arg2; + if (mac_srs->srs_state & SRS_SOFTRING_QUEUE) + ringp->s_ring_type |= ST_RING_WORKER_ONLY; } if (cpuid != -1) (void) mac_soft_ring_bind(ringp, cpuid); diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index 1615060736..8b87c25b19 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -44,6 +44,10 @@ #include <sys/vtrace.h> #include <sys/dlpi.h> #include <sys/sunndi.h> +#include <inet/ipsec_impl.h> +#include <inet/sadb.h> +#include <inet/ipsecesp.h> +#include <inet/ipsecah.h> /* * Copy an mblk, preserving its hardware checksum flags. @@ -821,3 +825,192 @@ mac_get_devinfo(mac_handle_t mh) return ((void *)mip->mi_dip); } + +#define PKT_HASH_4BYTES(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3]) +#define PKT_HASH_MAC(x) ((x)[0] ^ (x)[1] ^ (x)[2] ^ (x)[3] ^ (x)[4] ^ (x)[5]) + +uint64_t +mac_pkt_hash(uint_t media, mblk_t *mp, uint8_t policy, boolean_t is_outbound) +{ + struct ether_header *ehp; + uint64_t hash = 0; + uint16_t sap; + uint_t skip_len; + uint8_t proto; + + /* + * We may want to have one of these per MAC type plugin in the + * future. For now supports only ethernet. + */ + if (media != DL_ETHER) + return (0L); + + /* for now we support only outbound packets */ + ASSERT(is_outbound); + ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))); + ASSERT(MBLKL(mp) >= sizeof (struct ether_header)); + + /* compute L2 hash */ + + ehp = (struct ether_header *)mp->b_rptr; + + if ((policy & MAC_PKT_HASH_L2) != 0) { + uchar_t *mac_src = ehp->ether_shost.ether_addr_octet; + uchar_t *mac_dst = ehp->ether_dhost.ether_addr_octet; + hash = PKT_HASH_MAC(mac_src) ^ PKT_HASH_MAC(mac_dst); + policy &= ~MAC_PKT_HASH_L2; + } + + if (policy == 0) + goto done; + + /* skip ethernet header */ + + sap = ntohs(ehp->ether_type); + if (sap == ETHERTYPE_VLAN) { + struct ether_vlan_header *evhp; + mblk_t *newmp = NULL; + + skip_len = sizeof (struct ether_vlan_header); + if (MBLKL(mp) < skip_len) { + /* the vlan tag is the payload, pull up first */ + newmp = msgpullup(mp, -1); + if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) { + goto done; + } + evhp = (struct ether_vlan_header *)newmp->b_rptr; + } else { + evhp = (struct ether_vlan_header *)mp->b_rptr; + } + + sap = ntohs(evhp->ether_type); + freemsg(newmp); + } else { + skip_len = sizeof (struct ether_header); + } + + /* 
if ethernet header is in its own mblk, skip it */ + if (MBLKL(mp) <= skip_len) { + skip_len -= MBLKL(mp); + mp = mp->b_cont; + if (mp == NULL) + goto done; + } + + sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; + + /* compute IP src/dst addresses hash and skip IPv{4,6} header */ + + switch (sap) { + case ETHERTYPE_IP: { + ipha_t *iphp; + + /* + * If the header is not aligned or the header doesn't fit + * in the mblk, bail now. Note that this may cause packets + * reordering. + */ + iphp = (ipha_t *)(mp->b_rptr + skip_len); + if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) || + !OK_32PTR((char *)iphp)) + goto done; + + proto = iphp->ipha_protocol; + skip_len += IPH_HDR_LENGTH(iphp); + + if ((policy & MAC_PKT_HASH_L3) != 0) { + uint8_t *ip_src = (uint8_t *)&(iphp->ipha_src); + uint8_t *ip_dst = (uint8_t *)&(iphp->ipha_dst); + + hash ^= (PKT_HASH_4BYTES(ip_src) ^ + PKT_HASH_4BYTES(ip_dst)); + policy &= ~MAC_PKT_HASH_L3; + } + break; + } + case ETHERTYPE_IPV6: { + ip6_t *ip6hp; + uint16_t hdr_length; + + /* + * If the header is not aligned or the header doesn't fit + * in the mblk, bail now. Note that this may cause packets + * reordering. 
+ */ + + ip6hp = (ip6_t *)(mp->b_rptr + skip_len); + if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) || + !OK_32PTR((char *)ip6hp)) + goto done; + + if (!mac_ip_hdr_length_v6(mp, ip6hp, &hdr_length, &proto)) + goto done; + skip_len += hdr_length; + + if ((policy & MAC_PKT_HASH_L3) != 0) { + uint8_t *ip_src = &(ip6hp->ip6_src.s6_addr8[12]); + uint8_t *ip_dst = &(ip6hp->ip6_dst.s6_addr8[12]); + + hash ^= (PKT_HASH_4BYTES(ip_src) ^ + PKT_HASH_4BYTES(ip_dst)); + policy &= ~MAC_PKT_HASH_L3; + } + break; + } + default: + goto done; + } + + if (policy == 0) + goto done; + + /* if ip header is in its own mblk, skip it */ + if (MBLKL(mp) <= skip_len) { + skip_len -= MBLKL(mp); + mp = mp->b_cont; + if (mp == NULL) + goto done; + } + + /* parse ULP header */ +again: + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_ESP: + case IPPROTO_SCTP: + /* + * These Internet Protocols are intentionally designed + * for hashing from the git-go. Port numbers are in the first + * word for transports, SPI is first for ESP. + */ + if (mp->b_rptr + skip_len + 4 > mp->b_wptr) + goto done; + hash ^= PKT_HASH_4BYTES((mp->b_rptr + skip_len)); + break; + + case IPPROTO_AH: { + ah_t *ah = (ah_t *)(mp->b_rptr + skip_len); + uint_t ah_length = AH_TOTAL_LEN(ah); + + if ((unsigned char *)ah + sizeof (ah_t) > mp->b_wptr) + goto done; + + proto = ah->ah_nexthdr; + skip_len += ah_length; + + /* if AH header is in its own mblk, skip it */ + if (MBLKL(mp) <= skip_len) { + skip_len -= MBLKL(mp); + mp = mp->b_cont; + if (mp == NULL) + goto done; + } + + goto again; + } + } + +done: + return (hash); +} diff --git a/usr/src/uts/common/io/nxge/nxge_send.c b/usr/src/uts/common/io/nxge/nxge_send.c index 0bb35ef423..97603172be 100644 --- a/usr/src/uts/common/io/nxge/nxge_send.c +++ b/usr/src/uts/common/io/nxge/nxge_send.c @@ -19,10 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ +#include <sys/mac_provider.h> #include <sys/nxge/nxge_impl.h> #include <sys/nxge/nxge_hio.h> #include <npi_tx_wr64.h> @@ -32,6 +33,9 @@ #include <inet/ip_impl.h> #include <inet/tcp.h> +extern uint64_t mac_pkt_hash(uint_t, mblk_t *mp, uint8_t policy, + boolean_t is_outbound); + static mblk_t *nxge_lso_eliminate(mblk_t *); static mblk_t *nxge_do_softlso(mblk_t *mp, uint32_t mss); static void nxge_lso_info_get(mblk_t *, uint32_t *, uint32_t *); @@ -121,8 +125,17 @@ nxge_tx_ring_send(void *arg, mblk_t *mp) #if defined(sun4v) /* + * Hashing policy for load balancing over the set of TX rings + * available to the driver. + */ +static uint8_t nxge_tx_hash_policy = MAC_PKT_HASH_L4; + +/* * nxge_m_tx() is needed for Hybrid I/O operation of the vnet in * the guest domain. See CR 6778758 for long term solution. + * + * The guest domain driver will for now hash the packet + * to pick a DMA channel from the only group it has group 0. */ mblk_t * @@ -130,15 +143,23 @@ nxge_m_tx(void *arg, mblk_t *mp) { p_nxge_t nxgep = (p_nxge_t)arg; mblk_t *next; + uint64_t rindex; p_tx_ring_t tx_ring_p; int status; NXGE_DEBUG_MSG((nxgep, TX_CTL, "==> nxge_m_tx")); /* - * Get the default ring handle. + * Hash to pick a ring from Group 0, the only TX group + * for a guest domain driver. + */ + rindex = mac_pkt_hash(DL_ETHER, mp, nxge_tx_hash_policy, B_TRUE); + rindex = rindex % nxgep->pt_config.tdc_grps[0].max_tdcs; + + /* + * Get the ring handle. 
*/ - tx_ring_p = nxgep->tx_rings->rings[0]; + tx_ring_p = nxgep->tx_rings->rings[rindex]; while (mp != NULL) { next = mp->b_next; diff --git a/usr/src/uts/common/io/softmac/softmac_main.c b/usr/src/uts/common/io/softmac/softmac_main.c index c940794d72..a44856c849 100644 --- a/usr/src/uts/common/io/softmac/softmac_main.c +++ b/usr/src/uts/common/io/softmac/softmac_main.c @@ -1042,17 +1042,26 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg) return (MH_WALK_CONTINUE); } + /* + * Bumping up the smac_hold_cnt allows us to drop the lock. It also + * makes softmac_destroy() return failure on an attempted device detach. + * We don't want to hold the lock across calls to other subsystems + * like kstats, which will happen in the call to dls_devnet_recreate + */ + softmac->smac_hold_cnt++; + mutex_exit(&softmac->smac_mutex); + if (dls_mgmt_create(softmac->smac_devname, makedevice(softmac->smac_umajor, softmac->smac_uppa + 1), DATALINK_CLASS_PHYS, softmac->smac_media, B_TRUE, &linkid) != 0) { - mutex_exit(&softmac->smac_mutex); + softmac_rele_device((dls_dev_handle_t)softmac); return (MH_WALK_CONTINUE); } if ((err = softmac_update_info(softmac, &linkid)) != 0) { cmn_err(CE_WARN, "softmac: softmac_update_info() for %s " "failed (%d)", softmac->smac_devname, err); - mutex_exit(&softmac->smac_mutex); + softmac_rele_device((dls_dev_handle_t)softmac); return (MH_WALK_CONTINUE); } @@ -1069,7 +1078,10 @@ softmac_mac_recreate(mod_hash_key_t key, mod_hash_val_t *val, void *arg) } } + mutex_enter(&softmac->smac_mutex); softmac->smac_flags &= ~SOFTMAC_NEED_RECREATE; + ASSERT(softmac->smac_hold_cnt != 0); + softmac->smac_hold_cnt--; mutex_exit(&softmac->smac_mutex); return (MH_WALK_CONTINUE); diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index a1f7e82849..59babc2876 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun 
Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -157,6 +157,7 @@ typedef struct aggr_grp_s { aggr_port_t **lg_tx_ports; /* array of tx ports */ uint_t lg_tx_ports_size; /* size of lg_tx_ports */ uint32_t lg_tx_policy; /* outbound policy */ + uint8_t lg_mac_tx_policy; uint64_t lg_ifspeed; link_state_t lg_link_state; link_duplex_t lg_link_duplex; diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index 03efb11d58..3094fa1a09 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -372,6 +372,10 @@ typedef struct dld_capab_direct_s { /* flow control notification callback */ uintptr_t di_tx_cb_df; /* callback registration/de-registration */ void *di_tx_cb_dh; + + /* flow control "can I put on a ring" callback */ + uintptr_t di_tx_fctl_df; /* canput-like callback */ + void *di_tx_fctl_dh; } dld_capab_direct_t; /* diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index 906fd6fe15..6aa661b04f 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -323,7 +323,7 @@ typedef struct dld_ap { mutex_exit(&(dsp)->ds_lock); \ } -#define DLD_TX(dsp, mp, f_hint, flag) \ +#define DLD_TX(dsp, mp, f_hint, flag) \ mac_tx(dsp->ds_mch, mp, f_hint, flag, NULL) #ifdef DEBUG diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 71f79a611a..98bec27a8e 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -70,6 +70,8 @@ typedef struct dls_head_s { uint_t dh_removing; /* dh_lock */ } dls_head_t; +extern mod_hash_t *i_dls_link_hash; + extern void dls_link_init(void); extern int dls_link_fini(void); extern int dls_link_hold(const char *, dls_link_t **); diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index d4608f3729..d179e5a2f8 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -565,21 +565,28 @@ extern void mac_margin_get(mac_handle_t, uint32_t *); extern int mac_margin_remove(mac_handle_t, uint32_t); extern int mac_margin_add(mac_handle_t, uint32_t *, boolean_t); -extern void mac_init_ops(struct dev_ops *, const char *); -extern void mac_fini_ops(struct dev_ops *); -extern uint32_t mac_no_notification(mac_handle_t); extern mactype_register_t *mactype_alloc(uint_t); extern void mactype_free(mactype_register_t *); extern int mactype_register(mactype_register_t *); extern int mactype_unregister(const char *); -extern void mac_set_ring(void *, void *); extern void mac_start_logusage(mac_logtype_t, uint_t); extern void mac_stop_logusage(mac_logtype_t); extern mac_handle_t mac_get_lower_mac_handle(mac_handle_t); +/* + * Packet hashing for distribution to multiple ports and rings. + */ + +#define MAC_PKT_HASH_L2 0x01 +#define MAC_PKT_HASH_L3 0x02 +#define MAC_PKT_HASH_L4 0x04 + +extern uint64_t mac_pkt_hash(uint_t, mblk_t *, uint8_t, + boolean_t); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h index f1743577ef..04f1f83006 100644 --- a/usr/src/uts/common/sys/mac_client.h +++ b/usr/src/uts/common/sys/mac_client.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ @@ -98,8 +98,9 @@ typedef enum { #define MAC_CLOSE_FLAGS_IS_AGGR_PORT 0x0004 /* flags passed to mac_promisc_add() */ -#define MAC_PROMISC_FLAGS_NO_TX_LOOP 0x0001 -#define MAC_PROMISC_FLAGS_NO_PHYS 0x0002 +#define MAC_PROMISC_FLAGS_NO_TX_LOOP 0x0001 +#define MAC_PROMISC_FLAGS_NO_PHYS 0x0002 +#define MAC_PROMISC_FLAGS_VLAN_TAG_STRIP 0x0004 /* flags passed to mac_tx() */ #define MAC_DROP_ON_NO_DESC 0x01 /* freemsg() if no tx descs */ @@ -175,6 +176,12 @@ extern uint_t mac_hwgrp_num(mac_handle_t); extern void mac_get_hwgrp_info(mac_handle_t, int, uint_t *, uint_t *, uint_t *, uint_t *, char *); +extern uint32_t mac_no_notification(mac_handle_t); +extern int mac_set_prop(mac_handle_t, mac_prop_t *, void *, uint_t); +extern int mac_get_prop(mac_handle_t, mac_prop_t *, void *, uint_t, uint_t *); + +extern boolean_t mac_is_vnic(mac_handle_t); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index 93ee6e760d..d4dd8853a6 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -74,6 +74,7 @@ typedef struct mac_promisc_impl_s { /* Protected by */ struct mac_client_impl_s *mpi_mcip; /* WO */ boolean_t mpi_no_tx_loop; /* WO */ boolean_t mpi_no_phys; /* WO */ + boolean_t mpi_strip_vlan_tag; /* WO */ } mac_promisc_impl_t; typedef union mac_tx_percpu_s { diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index b5ddf9c2cc..5c0d74cfea 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -424,6 +424,7 @@ struct mac_impl_s { /* list of MAC clients which opened this MAC */ struct mac_client_impl_s *mi_clients_list; /* mi_rw_lock */ uint_t mi_nclients; /* mi_rw_lock */ + struct mac_client_impl_s *mi_single_active_client; /* mi_rw_lock */ uint32_t mi_margin; /* mi_rw_lock */ uint_t mi_sdu_min; /* mi_rw_lock */ diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 1f57148de8..5522a6c884 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -414,10 +414,6 @@ typedef struct mac_register_s { */ extern void mac_sdu_get(mac_handle_t, uint_t *, uint_t *); extern int mac_maxsdu_update(mac_handle_t, uint_t); -extern int mac_set_prop(mac_handle_t, mac_prop_t *, - void *, uint_t); -extern int mac_get_prop(mac_handle_t, mac_prop_t *, - void *, uint_t, uint_t *); extern mac_register_t *mac_alloc(uint_t); extern void mac_free(mac_register_t *); @@ -452,19 +448,15 @@ extern int mac_margin_add(mac_handle_t, uint32_t *, boolean_t); extern void mac_init_ops(struct dev_ops *, const char *); extern void mac_fini_ops(struct dev_ops *); -extern uint32_t mac_no_notification(mac_handle_t); extern mactype_register_t *mactype_alloc(uint_t); extern void mactype_free(mactype_register_t *); extern int mactype_register(mactype_register_t *); extern int mactype_unregister(const char *); -extern void mac_set_ring(void *, void *); extern boolean_t mac_unicst_verify(mac_handle_t, const uint8_t *, uint_t); -extern boolean_t mac_is_vnic(mac_handle_t); - extern int mac_group_add_ring(mac_group_handle_t, int); extern void mac_group_rem_ring(mac_group_handle_t, mac_ring_handle_t); diff --git 
a/usr/src/uts/common/sys/mac_soft_ring.h b/usr/src/uts/common/sys/mac_soft_ring.h index 45fcdf65bf..69f9174d17 100644 --- a/usr/src/uts/common/sys/mac_soft_ring.h +++ b/usr/src/uts/common/sys/mac_soft_ring.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -181,7 +181,6 @@ typedef struct mac_srs_rx_s { void *sr_arg1; /* srs_lock */ mac_resource_handle_t sr_arg2; /* srs_lock */ mac_rx_func_t sr_lower_proc; /* Atomically changed */ - boolean_t sr_enqueue_always; /* enqueue at soft ring */ uint32_t sr_poll_pkt_cnt; uint32_t sr_poll_thres; @@ -233,7 +232,7 @@ typedef struct mac_srs_rx_s { /* Worker thread goes back to draining the queue */ uint32_t sr_drain_again; /* More Packets in queue so signal the worker thread to drain */ - uint32_t sr_drain_worker_sig; + uint32_t sr_drain_poll_sig; /* Poll thread is already running so worker has nothing to do */ uint32_t sr_drain_poll_running; /* We have packets already queued so keep polling */ @@ -485,6 +484,7 @@ struct mac_soft_ring_set_s { #define SRS_QUIESCE_PERM 0x10000000 #define SRS_LATENCY_OPT 0x20000000 +#define SRS_SOFTRING_QUEUE 0x40000000 #define SRS_QUIESCED(srs) (srs->srs_state & SRS_QUIESCE_DONE) diff --git a/usr/src/uts/common/xen/io/xnbo.c b/usr/src/uts/common/xen/io/xnbo.c index 672d88e6e0..ee87c067d2 100644 --- a/usr/src/uts/common/xen/io/xnbo.c +++ b/usr/src/uts/common/xen/io/xnbo.c @@ -312,7 +312,8 @@ xnbo_open_mac(xnb_t *xnbp, char *mac) mac_rx_set(xnbop->o_mch, rx_fn, xnbp); } else { err = mac_promisc_add(xnbop->o_mch, MAC_CLIENT_PROMISC_ALL, - rx_fn, xnbp, &xnbop->o_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP); + rx_fn, xnbp, &xnbop->o_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP | + MAC_PROMISC_FLAGS_VLAN_TAG_STRIP); if (err != 0) { cmn_err(CE_WARN, "xnbo_open_mac: " "cannot enable promiscuous mode of %s: %d", |
