Diffstat (limited to 'usr/src/uts/common/io')
144 files changed, 39398 insertions, 3188 deletions
diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c index 3554e111c1..47840951d3 100644 --- a/usr/src/uts/common/io/aggr/aggr_grp.c +++ b/usr/src/uts/common/io/aggr/aggr_grp.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -32,39 +32,69 @@ * module. The hash key is the linkid associated with the link * aggregation group. * - * A set of MAC ports are associated with each association group. + * Each aggregation contains a set of ports. The port is represented + * by the aggr_port_t structure. A port consists of a single MAC + * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying + * MAC. This client is used by the aggr to send and receive LACP + * traffic. Each port client takes on the same MAC unicast address -- + * the address of the aggregation itself (taken from the first port by + * default). * - * Aggr pseudo TX rings - * -------------------- - * The underlying ports (NICs) in an aggregation can have TX rings. To - * enhance aggr's performance, these TX rings are made available to the - * aggr layer as pseudo TX rings. The concept of pseudo rings are not new. - * They are already present and implemented on the RX side. It is called - * as pseudo RX rings. The same concept is extended to the TX side where - * each TX ring of an underlying port is reflected in aggr as a pseudo - * TX ring. Thus each pseudo TX ring will map to a specific hardware TX - * ring. Even in the case of a NIC that does not have a TX ring, a pseudo - * TX ring is given to the aggregation layer. + * The MAC client that hangs off each aggr port is not your typical + * MAC client. Not only does it have exclusive control of the MAC, but + * it also has no Tx or Rx SRSes. An SRS is designed to queue and + * fanout traffic among L4 protocols; but the aggr is an intermediary, + * not a consumer. Instead of using SRSes, the aggr puts the + * underlying hardware rings into passthru mode and ships packets up + * via a direct call to aggr_recv_cb(). This allows aggr to enforce + * LACP while passing all other traffic up to clients of the aggr. + * + * Pseudo Rx Groups and Rings + * -------------------------- + * + * It is imperative for client performance that the aggr provide as + * many MAC groups as possible. In order to use the underlying HW + * resources, aggr creates pseudo groups to aggregate the underlying + * HW groups. Every HW group gets mapped to a pseudo group; and every + * HW ring in that group gets mapped to a pseudo ring. The pseudo + * group at index 0 combines all the HW groups at index 0 from each + * port, etc. The aggr's MAC then creates normal MAC groups and rings + * out of these pseudo groups and rings to present to the aggr's + * clients. To the clients, the aggr's groups and rings are absolutely + * no different than a NIC's groups or rings. + * + * Pseudo Tx Rings + * --------------- + * + * The underlying ports (NICs) in an aggregation can have Tx rings. To + * enhance aggr's performance, these Tx rings are made available to + * the aggr layer as pseudo Tx rings. The concept of pseudo rings are + * not new. They are already present and implemented on the Rx side. + * The same concept is extended to the Tx side where each Tx ring of + * an underlying port is reflected in aggr as a pseudo Tx ring. Thus + * each pseudo Tx ring will map to a specific hardware Tx ring. 
Even + * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring + * is given to the aggregation layer. * * With this change, the outgoing stack depth looks much better: * * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() * - * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings: + * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: * SRS_TX_AGGR and SRS_TX_BW_AGGR. * * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine - * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX + * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx * ring belonging to a port on which the packet has to be sent. * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 - * policy and then uses the fanout_hint passed to it to pick a TX ring from + * policy and then uses the fanout_hint passed to it to pick a Tx ring from * the selected port. * * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where * bandwidth limit is applied first on the outgoing packet and the packets * allowed to go out would call mac_tx_aggr_mode() to send the packet on a - * particular TX ring. + * particular Tx ring. */ #include <sys/types.h> @@ -121,10 +151,12 @@ static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); static int aggr_pseudo_disable_intr(mac_intr_handle_t); static int aggr_pseudo_enable_intr(mac_intr_handle_t); -static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t); -static void aggr_pseudo_stop_ring(mac_ring_driver_t); +static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); +static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); static int aggr_addmac(void *, const uint8_t *); static int aggr_remmac(void *, const uint8_t *); +static int aggr_addvlan(mac_group_driver_t, uint16_t); +static int aggr_remvlan(mac_group_driver_t, uint16_t); static mblk_t *aggr_rx_poll(void *, int); static void aggr_fill_ring(void *, mac_ring_type_t, const int, const int, mac_ring_info_t *, mac_ring_handle_t); @@ -325,6 +357,7 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) return (B_FALSE); } + mutex_enter(&grp->lg_stat_lock); if (grp->lg_ifspeed == 0) { /* * The group inherits the speed of the first link being @@ -338,8 +371,10 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) * the group link speed, as per 802.3ad. Since it is * not, the attach is cancelled. */ + mutex_exit(&grp->lg_stat_lock); return (B_FALSE); } + mutex_exit(&grp->lg_stat_lock); grp->lg_nattached_ports++; @@ -348,7 +383,9 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) */ if (grp->lg_link_state != LINK_STATE_UP) { grp->lg_link_state = LINK_STATE_UP; + mutex_enter(&grp->lg_stat_lock); grp->lg_link_duplex = LINK_DUPLEX_FULL; + mutex_exit(&grp->lg_stat_lock); link_state_changed = B_TRUE; } @@ -360,9 +397,13 @@ aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) aggr_grp_multicst_port(port, B_TRUE); /* - * Set port's receive callback + * The port client doesn't have an Rx SRS; instead of calling + * mac_rx_set() we set the client's flow callback directly. + * This datapath is used only when the port's driver doesn't + * support MAC_CAPAB_RINGS. Drivers with ring support will + * deliver traffic to the aggr via ring passthru. 
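A condensed sketch of the two receive paths described above may help. This is illustrative only and not part of the changeset; the helper name aggr_setup_port_rx() is hypothetical, but the two calls it makes are exactly the ones used by aggr_grp_attach_port() and aggr_add_pseudo_rx_ring() in this diff.

/*
 * Sketch only: choose the Rx delivery path for one port. Ports whose
 * drivers lack MAC_CAPAB_RINGS deliver through the client flow
 * callback; ports with HW rings are placed in passthru mode so each
 * ring hands packets directly to aggr_recv_cb().
 */
static void
aggr_setup_port_rx(aggr_port_t *port, aggr_pseudo_rx_ring_t *ring,
    mac_ring_handle_t hw_rh)
{
        if (hw_rh == NULL) {
                /* No HW ring: use the port client's flow callback. */
                mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
        } else {
                /* HW ring: bypass SRS processing via passthru. */
                mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
                    (void *)port, (mac_resource_handle_t)ring);
        }
}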
*/ - mac_rx_set(port->lp_mch, aggr_recv_cb, port); + mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); /* * If LACP is OFF, the port can be used to send data as soon @@ -392,7 +433,7 @@ aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) if (port->lp_state != AGGR_PORT_STATE_ATTACHED) return (B_FALSE); - mac_rx_clear(port->lp_mch); + mac_client_clear_flow_cb(port->lp_mch); aggr_grp_multicst_port(port, B_FALSE); @@ -406,9 +447,11 @@ aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) grp->lg_nattached_ports--; if (grp->lg_nattached_ports == 0) { /* the last attached MAC port of the group is being detached */ - grp->lg_ifspeed = 0; grp->lg_link_state = LINK_STATE_DOWN; + mutex_enter(&grp->lg_stat_lock); + grp->lg_ifspeed = 0; grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; + mutex_exit(&grp->lg_stat_lock); link_state_changed = B_TRUE; } @@ -529,26 +572,27 @@ aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, zoneid_t port_zoneid = ALL_ZONES; int err; - /* The port must be int the same zone as the aggregation. */ + /* The port must be in the same zone as the aggregation. */ if (zone_check_datalink(&port_zoneid, port_linkid) != 0) port_zoneid = GLOBAL_ZONEID; if (grp->lg_zoneid != port_zoneid) return (EBUSY); /* - * lg_mh could be NULL when the function is called during the creation - * of the aggregation. + * If we are creating the aggr, then there is no MAC handle + * and thus no perimeter to hold. If we are adding a port to + * an existing aggr, then the perimiter of the aggr's MAC must + * be held. */ ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); - /* create new port */ err = aggr_port_create(grp, port_linkid, force, &port); if (err != 0) return (err); mac_perim_enter_by_mh(port->lp_mh, &mph); - /* add port to list of group constituent ports */ + /* Add the new port to the end of the list. */ cport = &grp->lg_ports; while (*cport != NULL) cport = &((*cport)->lp_next); @@ -630,6 +674,7 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, ring->arr_flags |= MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = hw_rh; ring->arr_port = port; + ring->arr_grp = rx_grp; rx_grp->arg_ring_cnt++; /* @@ -640,10 +685,15 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = NULL; ring->arr_port = NULL; + ring->arr_grp = NULL; rx_grp->arg_ring_cnt--; } else { - mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, - mac_find_ring(rx_grp->arg_gh, j)); + /* + * This must run after the MAC is registered. 
+ */ + ASSERT3P(ring->arr_rh, !=, NULL); + mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, + (void *)port, (mac_resource_handle_t)ring); } return (err); } @@ -654,11 +704,9 @@ aggr_add_pseudo_rx_ring(aggr_port_t *port, static void aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) { - aggr_pseudo_rx_ring_t *ring; - int j; + for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { + aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; - for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { - ring = rx_grp->arg_rings + j; if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || ring->arr_hw_rh != hw_rh) { continue; @@ -669,134 +717,140 @@ aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; ring->arr_hw_rh = NULL; ring->arr_port = NULL; + ring->arr_grp = NULL; rx_grp->arg_ring_cnt--; - mac_hwring_teardown(hw_rh); + mac_hwring_clear_passthru(hw_rh); break; } } /* - * This function is called to create pseudo rings over the hardware rings of - * the underlying device. Note that there is a 1:1 mapping between the pseudo - * RX rings of the aggr and the hardware rings of the underlying port. + * Create pseudo rings over the HW rings of the port. + * + * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. + * + * o Program existing unicast filters on the pseudo group into the HW group. + * + * o Program existing VLAN filters on the pseudo group into the HW group. */ static int aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) { - aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; aggr_unicst_addr_t *addr, *a; mac_perim_handle_t pmph; - int hw_rh_cnt, i = 0, j; + aggr_vlan_t *avp; + uint_t hw_rh_cnt, i; int err = 0; + uint_t g_idx = rx_grp->arg_index; - ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* - * This function must be called after the aggr registers its mac - * and its RX group has been initialized. + * This function must be called after the aggr registers its + * MAC and its Rx groups have been initialized. */ ASSERT(rx_grp->arg_gh != NULL); /* - * Get the list the the underlying HW rings. + * Get the list of the underlying HW rings. */ - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX); - - if (port->lp_hwgh != NULL) { - /* - * Quiesce the HW ring and the mac srs on the ring. Note - * that the HW ring will be restarted when the pseudo ring - * is started. At that time all the packets will be - * directly passed up to the pseudo RX ring and handled - * by mac srs created over the pseudo RX ring. - */ - mac_rx_client_quiesce(port->lp_mch); - mac_srs_perm_quiesce(port->lp_mch, B_TRUE); - } + hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, + &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); /* - * Add all the unicast addresses to the newly added port. + * Add existing VLAN and unicast address filters to the port. 
*/ + for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; + avp = list_next(&rx_grp->arg_vlans, avp)) { + if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) + goto err; + } + for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { - if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0) - break; + if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) + goto err; } - for (i = 0; err == 0 && i < hw_rh_cnt; i++) + for (i = 0; i < hw_rh_cnt; i++) { err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); + if (err != 0) + goto err; + } - if (err != 0) { - for (j = 0; j < i; j++) - aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); + mac_perim_exit(pmph); + return (0); + +err: + ASSERT(err != 0); + + for (uint_t j = 0; j < i; j++) + aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); + + for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) + aggr_port_remmac(port, g_idx, a->aua_addr); - for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) - aggr_port_remmac(port, a->aua_addr); + if (avp != NULL) + avp = list_prev(&rx_grp->arg_vlans, avp); - if (port->lp_hwgh != NULL) { - mac_srs_perm_quiesce(port->lp_mch, B_FALSE); - mac_rx_client_restart(port->lp_mch); - port->lp_hwgh = NULL; + for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { + int err2; + + if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" + ": errno %d.", avp->av_vid, + mac_client_name(port->lp_mch), err2); } - } else { - port->lp_rx_grp_added = B_TRUE; } -done: + + port->lp_hwghs[g_idx] = NULL; mac_perim_exit(pmph); return (err); } /* - * This function is called by aggr to remove pseudo RX rings over the - * HW rings of the underlying port. + * Destroy the pseudo rings mapping to this port and remove all VLAN + * and unicast filters from this port. Even if there are no underlying + * HW rings we must still remove the unicast filters to take the port + * out of promisc mode. */ static void aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) { - aggr_grp_t *grp = port->lp_grp; mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; aggr_unicst_addr_t *addr; - mac_group_handle_t hwgh; mac_perim_handle_t pmph; - int hw_rh_cnt, i; + uint_t hw_rh_cnt; + uint_t g_idx = rx_grp->arg_index; - ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); + ASSERT3P(rx_grp->arg_gh, !=, NULL); mac_perim_enter_by_mh(port->lp_mh, &pmph); - if (!port->lp_rx_grp_added) - goto done; - - ASSERT(rx_grp->arg_gh != NULL); - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - &hwgh, hw_rh, MAC_RING_TYPE_RX); + hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, + MAC_RING_TYPE_RX); - /* - * If hw_rh_cnt is 0, it means that the underlying port does not - * support RX rings. Directly return in this case. - */ - for (i = 0; i < hw_rh_cnt; i++) + for (uint_t i = 0; i < hw_rh_cnt; i++) aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) - aggr_port_remmac(port, addr->aua_addr); + aggr_port_remmac(port, g_idx, addr->aua_addr); - if (port->lp_hwgh != NULL) { - port->lp_hwgh = NULL; + for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; + avp = list_next(&rx_grp->arg_vlans, avp)) { + int err; - /* - * First clear the permanent-quiesced flag of the RX srs then - * restart the HW ring and the mac srs on the ring. 
Note that - * the HW ring and associated SRS will soon been removed when - * the port is removed from the aggr. - */ - mac_srs_perm_quiesce(port->lp_mch, B_FALSE); - mac_rx_client_restart(port->lp_mch); + if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" + ": errno %d.", avp->av_vid, + mac_client_name(port->lp_mch), err); + } } - port->lp_rx_grp_added = B_FALSE; -done: + port->lp_hwghs[g_idx] = NULL; mac_perim_exit(pmph); } @@ -900,8 +954,8 @@ aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) /* * Get the list the the underlying HW rings. */ - hw_rh_cnt = mac_hwrings_get(port->lp_mch, - NULL, hw_rh, MAC_RING_TYPE_TX); + hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, + MAC_RING_TYPE_TX); /* * Even if the underlying NIC does not have TX rings, we @@ -1006,23 +1060,46 @@ aggr_pseudo_enable_intr(mac_intr_handle_t ih) return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); } +/* + * Start the pseudo ring. Since the pseudo ring is just an abstraction + * over an actual HW ring, the real task is to start the underlying HW + * ring. + */ static int -aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen) +aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) { - aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; int err; + aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; err = mac_hwring_start(rr_ring->arr_hw_rh); - if (err == 0) - rr_ring->arr_gen = mr_gen; + + if (err != 0) + return (err); + + rr_ring->arr_gen = mr_gen; return (err); } +/* + * Stop the pseudo ring. Since the pseudo ring is just an abstraction + * over an actual HW ring, the real task is to stop the underlying HW + * ring. + */ static void -aggr_pseudo_stop_ring(mac_ring_driver_t arg) +aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) { aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; - mac_hwring_stop(rr_ring->arr_hw_rh); + + /* + * The rings underlying the default group must stay up to + * continue receiving LACP traffic. We would normally never + * stop the default Rx rings because of the primary MAC + * client; but aggr's primary MAC client doesn't call + * mac_unicast_add() and thus mi_active is 0 when the last + * non-primary client is deleted. + */ + if (rr_ring->arr_grp->arg_index != 0) + mac_hwring_stop(rr_ring->arr_hw_rh); } /* @@ -1032,13 +1109,15 @@ int aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, laioc_port_t *ports) { - int rc, i, nadded = 0; + int rc; + uint_t port_added = 0; + uint_t grp_added; aggr_grp_t *grp = NULL; aggr_port_t *port; boolean_t link_state_changed = B_FALSE; mac_perim_handle_t mph, pmph; - /* get group corresponding to linkid */ + /* Get the aggr corresponding to linkid. */ rw_enter(&aggr_grp_lock, RW_READER); if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), (mod_hash_val_t *)&grp) != 0) { @@ -1048,20 +1127,22 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, AGGR_GRP_REFHOLD(grp); /* - * Hold the perimeter so that the aggregation won't be destroyed. + * Hold the perimeter so that the aggregation can't be destroyed. */ mac_perim_enter_by_mh(grp->lg_mh, &mph); rw_exit(&aggr_grp_lock); - /* add the specified ports to group */ - for (i = 0; i < nports; i++) { - /* add port to group */ + /* Add the specified ports to the aggr. 
*/ + for (uint_t i = 0; i < nports; i++) { + grp_added = 0; + if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port)) != 0) { goto bail; } + ASSERT(port != NULL); - nadded++; + port_added++; /* check capabilities */ if (!aggr_grp_capab_check(grp, port) || @@ -1078,9 +1159,16 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); if (rc != 0) goto bail; - rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group); - if (rc != 0) - goto bail; + + for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { + rc = aggr_add_pseudo_rx_group(port, + &grp->lg_rx_groups[j]); + + if (rc != 0) + goto bail; + + grp_added++; + } mac_perim_enter_by_mh(port->lp_mh, &pmph); @@ -1098,7 +1186,7 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, /* * Turn on the promiscuous mode over the port when it * is requested to be turned on to receive the - * non-primary address over a port, or the promiscous + * non-primary address over a port, or the promiscuous * mode is enabled over the aggr. */ if (grp->lg_promisc || port->lp_prom_addr != NULL) { @@ -1133,17 +1221,33 @@ aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, bail: if (rc != 0) { /* stop and remove ports that have been added */ - for (i = 0; i < nadded; i++) { + for (uint_t i = 0; i < port_added; i++) { + uint_t grp_remove; + port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); ASSERT(port != NULL); + if (grp->lg_started) { mac_perim_enter_by_mh(port->lp_mh, &pmph); (void) aggr_port_promisc(port, B_FALSE); aggr_port_stop(port); mac_perim_exit(pmph); } + aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + + /* + * Only the last port could have a partial set + * of groups added. + */ + grp_remove = (i + 1 == port_added) ? grp_added : + grp->lg_rx_group_count; + + for (uint_t j = 0; j < grp_remove; j++) { + aggr_rem_pseudo_rx_group(port, + &grp->lg_rx_groups[j]); + } + (void) aggr_grp_rem_port(grp, port, NULL, NULL); } } @@ -1305,7 +1409,8 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP), KM_SLEEP); grp->lg_tx_blocked_cnt = 0; - bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t)); + bzero(&grp->lg_rx_groups, + sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT); bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); aggr_lacp_init_grp(grp); @@ -1325,11 +1430,48 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_key = key; for (i = 0; i < nports; i++) { - err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL); + err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port); if (err != 0) goto bail; } + grp->lg_rx_group_count = 1; + + for (i = 0, port = grp->lg_ports; port != NULL; + i++, port = port->lp_next) { + uint_t num_rgroups; + + mac_perim_enter_by_mh(port->lp_mh, &mph); + num_rgroups = mac_get_num_rx_groups(port->lp_mh); + mac_perim_exit(mph); + + /* + * Utilize all the groups in a port. If some ports + * have less groups than others, then traffic destined + * for the same unicast address may be HW classified + * on some ports but SW classified by aggr when + * arriving on other ports. + */ + grp->lg_rx_group_count = MAX(grp->lg_rx_group_count, + num_rgroups); + } + + /* + * There could be cases where the hardware provides more + * groups than aggr can support. 
Make sure we never go above + * the max aggr can support. + */ + grp->lg_rx_group_count = MIN(grp->lg_rx_group_count, + MAX_GROUPS_PER_PORT); + + ASSERT3U(grp->lg_rx_group_count, >, 0); + for (i = 0; i < MAX_GROUPS_PER_PORT; i++) { + grp->lg_rx_groups[i].arg_index = i; + grp->lg_rx_groups[i].arg_untagged = 0; + list_create(&(grp->lg_rx_groups[i].arg_vlans), + sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link)); + } + /* * If no explicit MAC address was specified by the administrator, * set it to the MAC address of the first port. @@ -1347,7 +1489,7 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, grp->lg_mac_addr_port = grp->lg_ports; } - /* set the initial group capabilities */ + /* Set the initial group capabilities. */ aggr_grp_capab_set(grp); if ((mac = mac_alloc(MAC_VERSION)) == NULL) { @@ -1382,14 +1524,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, * Update the MAC address of the constituent ports. * None of the port is attached at this time, the link state of the * aggregation will not change. + * + * All ports take on the primary MAC address of the aggr + * (lg_aggr). At this point, none of the ports are attached; + * thus the link state of the aggregation will not change. */ link_state_changed = aggr_grp_update_ports_mac(grp); ASSERT(!link_state_changed); - /* update outbound load balancing policy */ + /* Update outbound load balancing policy. */ aggr_send_update_policy(grp, policy); - /* set LACP mode */ + /* Set LACP mode. */ aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); /* @@ -1397,12 +1543,18 @@ aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, */ for (port = grp->lg_ports; port != NULL; port = port->lp_next) { /* - * Create the pseudo ring for each HW ring of the underlying - * port. Note that this is done after the aggr registers the - * mac. + * Create the pseudo ring for each HW ring of the + * underlying port. Note that this is done after the + * aggr registers its MAC. */ - VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0); - VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0); + VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group), + ==, 0); + + for (i = 0; i < grp->lg_rx_group_count; i++) { + VERIFY3S(aggr_add_pseudo_rx_group(port, + &grp->lg_rx_groups[i]), ==, 0); + } + if (aggr_port_notify_link(grp, port)) link_state_changed = B_TRUE; @@ -1547,7 +1699,9 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, continue; val = aggr_port_stat(port, stat); val -= port->lp_stat[i]; + mutex_enter(&grp->lg_stat_lock); grp->lg_stat[i] += val; + mutex_exit(&grp->lg_stat_lock); } for (i = 0; i < ETHER_NSTAT; i++) { stat = i + MACTYPE_STAT_MIN; @@ -1555,7 +1709,9 @@ aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, continue; val = aggr_port_stat(port, stat); val -= port->lp_ether_stat[i]; + mutex_enter(&grp->lg_stat_lock); grp->lg_ether_stat[i] += val; + mutex_exit(&grp->lg_stat_lock); } grp->lg_nports--; @@ -1680,7 +1836,8 @@ aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) * aggr_find_tx_ring() will not return any rings * belonging to it. 
*/ - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + for (i = 0; i < grp->lg_rx_group_count; i++) + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); /* remove port from group */ rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, @@ -1785,7 +1942,8 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred) (void) aggr_grp_detach_port(grp, port); mac_perim_exit(pmph); aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); - aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); + for (uint_t i = 0; i < grp->lg_rx_group_count; i++) + aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); aggr_port_delete(port); port = cport; } @@ -1804,6 +1962,10 @@ aggr_grp_delete(datalink_id_t linkid, cred_t *cred) VERIFY(mac_unregister(grp->lg_mh) == 0); grp->lg_mh = NULL; + for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { + list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); + } + AGGR_GRP_REFRELE(grp); return (0); } @@ -1886,6 +2048,8 @@ aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) aggr_port_t *port; uint_t stat_index; + ASSERT(MUTEX_HELD(&grp->lg_stat_lock)); + /* We only aggregate counter statistics. */ if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { @@ -1954,10 +2118,9 @@ static int aggr_m_stat(void *arg, uint_t stat, uint64_t *val) { aggr_grp_t *grp = arg; - mac_perim_handle_t mph; int rval = 0; - mac_perim_enter_by_mh(grp->lg_mh, &mph); + mutex_enter(&grp->lg_stat_lock); switch (stat) { case MAC_STAT_IFSPEED: @@ -1977,7 +2140,7 @@ aggr_m_stat(void *arg, uint_t stat, uint64_t *val) rval = aggr_grp_stat(grp, stat, val); } - mac_perim_exit(mph); + mutex_exit(&grp->lg_stat_lock); return (rval); } @@ -2167,17 +2330,15 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) return (!grp->lg_zcopy); case MAC_CAPAB_RINGS: { mac_capab_rings_t *cap_rings = cap_data; + uint_t ring_cnt = 0; + + for (uint_t i = 0; i < grp->lg_rx_group_count; i++) + ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; if (cap_rings->mr_type == MAC_RING_TYPE_RX) { cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; - cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt; - - /* - * An aggregation advertises only one (pseudo) RX - * group, which virtualizes the main/primary group of - * the underlying devices. - */ - cap_rings->mr_gnum = 1; + cap_rings->mr_rnum = ring_cnt; + cap_rings->mr_gnum = grp->lg_rx_group_count; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; } else { @@ -2209,19 +2370,17 @@ aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) } /* - * Callback funtion for MAC layer to register groups. + * Callback function for MAC layer to register groups. */ static void aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { aggr_grp_t *grp = arg; - aggr_pseudo_rx_group_t *rx_group; - aggr_pseudo_tx_group_t *tx_group; - ASSERT(index == 0); if (rtype == MAC_RING_TYPE_RX) { - rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; + rx_group->arg_gh = gh; rx_group->arg_grp = grp; @@ -2231,8 +2390,18 @@ aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, infop->mgi_addmac = aggr_addmac; infop->mgi_remmac = aggr_remmac; infop->mgi_count = rx_group->arg_ring_cnt; + + /* + * Always set the HW VLAN callbacks. They are smart + * enough to know when a port has HW VLAN filters to + * program and when it doesn't. 
+ */ + infop->mgi_addvlan = aggr_addvlan; + infop->mgi_remvlan = aggr_remvlan; } else { - tx_group = &grp->lg_tx_group; + aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; + + ASSERT3S(index, ==, 0); tx_group->atg_gh = gh; } } @@ -2248,13 +2417,13 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, switch (rtype) { case MAC_RING_TYPE_RX: { - aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group; + aggr_pseudo_rx_group_t *rx_group; aggr_pseudo_rx_ring_t *rx_ring; mac_intr_t aggr_mac_intr; - ASSERT(rg_index == 0); - - ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt)); + rx_group = &grp->lg_rx_groups[rg_index]; + ASSERT3S(index, >=, 0); + ASSERT3S(index, <, rx_group->arg_ring_cnt); rx_ring = rx_group->arg_rings + index; rx_ring->arr_rh = rh; @@ -2268,8 +2437,8 @@ aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, aggr_mac_intr.mi_ddi_handle = NULL; infop->mri_driver = (mac_ring_driver_t)rx_ring; - infop->mri_start = aggr_pseudo_start_ring; - infop->mri_stop = aggr_pseudo_stop_ring; + infop->mri_start = aggr_pseudo_start_rx_ring; + infop->mri_stop = aggr_pseudo_stop_rx_ring; infop->mri_intr = aggr_mac_intr; infop->mri_poll = aggr_rx_poll; @@ -2356,6 +2525,7 @@ aggr_addmac(void *arg, const uint8_t *mac_addr) aggr_port_t *port, *p; mac_perim_handle_t mph; int err = 0; + uint_t idx = rx_group->arg_index; mac_perim_enter_by_mh(grp->lg_mh, &mph); @@ -2382,12 +2552,12 @@ aggr_addmac(void *arg, const uint8_t *mac_addr) *pprev = addr; for (port = grp->lg_ports; port != NULL; port = port->lp_next) - if ((err = aggr_port_addmac(port, mac_addr)) != 0) + if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) break; if (err != 0) { for (p = grp->lg_ports; p != port; p = p->lp_next) - aggr_port_remmac(p, mac_addr); + aggr_port_remmac(p, idx, mac_addr); *pprev = NULL; kmem_free(addr, sizeof (aggr_unicst_addr_t)); @@ -2432,7 +2602,7 @@ aggr_remmac(void *arg, const uint8_t *mac_addr) } for (port = grp->lg_ports; port != NULL; port = port->lp_next) - aggr_port_remmac(port, mac_addr); + aggr_port_remmac(port, rx_group->arg_index, mac_addr); *pprev = addr->aua_next; kmem_free(addr, sizeof (aggr_unicst_addr_t)); @@ -2442,6 +2612,188 @@ aggr_remmac(void *arg, const uint8_t *mac_addr) } /* + * Search for VID in the Rx group's list and return a pointer if + * found. Otherwise return NULL. + */ +static aggr_vlan_t * +aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) +{ + ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); + for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; + avp = list_next(&rx_group->arg_vlans, avp)) { + if (avp->av_vid == vid) + return (avp); + } + + return (NULL); +} + +/* + * Accept traffic on the specified VID. + * + * Persist VLAN state in the aggr so that ports added later will + * receive the correct filters. In the future it would be nice to + * allow aggr to iterate its clients instead of duplicating state. + */ +static int +aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; + aggr_grp_t *aggr = rx_group->arg_grp; + aggr_port_t *port, *p; + mac_perim_handle_t mph; + int err = 0; + aggr_vlan_t *avp = NULL; + uint_t idx = rx_group->arg_index; + + mac_perim_enter_by_mh(aggr->lg_mh, &mph); + + if (vid == MAC_VLAN_UNTAGGED) { + /* + * Aggr is both a MAC provider and MAC client. As a + * MAC provider it is passed MAC_VLAN_UNTAGGED by its + * client. As a client itself, it should pass + * VLAN_ID_NONE to its ports. 
+ */ + vid = VLAN_ID_NONE; + rx_group->arg_untagged++; + goto update_ports; + } + + avp = aggr_find_vlan(rx_group, vid); + + if (avp != NULL) { + avp->av_refs++; + mac_perim_exit(mph); + return (0); + } + + avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); + avp->av_vid = vid; + avp->av_refs = 1; + +update_ports: + for (port = aggr->lg_ports; port != NULL; port = port->lp_next) + if ((err = aggr_port_addvlan(port, idx, vid)) != 0) + break; + + if (err != 0) { + /* + * If any of these calls fail then we are in a + * situation where the ports have different HW state. + * There's no reasonable action the MAC client can + * take in this scenario to rectify the situation. + */ + for (p = aggr->lg_ports; p != port; p = p->lp_next) { + int err2; + + if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u" + " from port %s: errno %d.", vid, + mac_client_name(p->lp_mch), err2); + } + + } + + if (vid == VLAN_ID_NONE) + rx_group->arg_untagged--; + + if (avp != NULL) { + kmem_free(avp, sizeof (aggr_vlan_t)); + avp = NULL; + } + } + + if (avp != NULL) + list_insert_tail(&rx_group->arg_vlans, avp); + +done: + mac_perim_exit(mph); + return (err); +} + +/* + * Stop accepting traffic on this VLAN if it's the last use of this VLAN. + */ +static int +aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; + aggr_grp_t *aggr = rx_group->arg_grp; + aggr_port_t *port, *p; + mac_perim_handle_t mph; + int err = 0; + aggr_vlan_t *avp = NULL; + uint_t idx = rx_group->arg_index; + + mac_perim_enter_by_mh(aggr->lg_mh, &mph); + + /* + * See the comment in aggr_addvlan(). + */ + if (vid == MAC_VLAN_UNTAGGED) { + vid = VLAN_ID_NONE; + rx_group->arg_untagged--; + + if (rx_group->arg_untagged > 0) + goto done; + + goto update_ports; + } + + avp = aggr_find_vlan(rx_group, vid); + + if (avp == NULL) { + err = ENOENT; + goto done; + } + + avp->av_refs--; + + if (avp->av_refs > 0) + goto done; + +update_ports: + for (port = aggr->lg_ports; port != NULL; port = port->lp_next) + if ((err = aggr_port_remvlan(port, idx, vid)) != 0) + break; + + /* + * See the comment in aggr_addvlan() for justification of the + * use of VERIFY here. + */ + if (err != 0) { + for (p = aggr->lg_ports; p != port; p = p->lp_next) { + int err2; + + if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { + cmn_err(CE_WARN, "Failed to add VLAN %u" + " to port %s: errno %d.", vid, + mac_client_name(p->lp_mch), err2); + } + } + + if (avp != NULL) + avp->av_refs++; + + if (vid == VLAN_ID_NONE) + rx_group->arg_untagged++; + + goto done; + } + + if (err == 0 && avp != NULL) { + VERIFY3U(avp->av_refs, ==, 0); + list_remove(&rx_group->arg_vlans, avp); + kmem_free(avp, sizeof (aggr_vlan_t)); + } + +done: + mac_perim_exit(mph); + return (err); +} + +/* * Add or remove the multicast addresses that are defined for the group * to or from the specified port. * diff --git a/usr/src/uts/common/io/aggr/aggr_lacp.c b/usr/src/uts/common/io/aggr/aggr_lacp.c index a0d566d12b..7dcd3bfb95 100644 --- a/usr/src/uts/common/io/aggr/aggr_lacp.c +++ b/usr/src/uts/common/io/aggr/aggr_lacp.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. */ /* @@ -606,7 +607,7 @@ lacp_xmit_sm(aggr_port_t *portp) ASSERT(MAC_PERIM_HELD(portp->lp_grp->lg_mh)); /* LACP_OFF state not in specification so check here. 
*/ - if (!pl->sm.lacp_on || !pl->NTT || !portp->lp_started) + if (!pl->sm.lacp_on || !pl->NTT) return; /* diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index 00545d2c03..c8dbe00336 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -69,10 +71,10 @@ aggr_port_destructor(void *buf, void *arg) { aggr_port_t *port = buf; - ASSERT(port->lp_mnh == NULL); - ASSERT(port->lp_mphp == NULL); - ASSERT(!port->lp_rx_grp_added && !port->lp_tx_grp_added); - ASSERT(port->lp_hwgh == NULL); + ASSERT3P(port->lp_mnh, ==, NULL); + ASSERT(!port->lp_tx_grp_added); + for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) + ASSERT3P(port->lp_hwghs[i], ==, NULL); } void @@ -126,7 +128,6 @@ aggr_port_init_callbacks(aggr_port_t *port) aggr_grp_port_hold(port); } -/* ARGSUSED */ int aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, aggr_port_t **pp) @@ -195,9 +196,9 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, } /* - * As the underlying mac's current margin size is used to determine + * As the underlying MAC's current margin size is used to determine * the margin size of the aggregation itself, request the underlying - * mac not to change to a smaller size. + * MAC not to change to a smaller size. */ if ((err = mac_margin_add(mh, &margin, B_TRUE)) != 0) { id_free(aggr_portids, portid); @@ -206,7 +207,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, if ((err = mac_unicast_add(mch, NULL, MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, &mah, 0, &diag)) != 0) { - VERIFY(mac_margin_remove(mh, margin) == 0); + VERIFY3S(mac_margin_remove(mh, margin), ==, 0); id_free(aggr_portids, portid); goto fail; } @@ -261,6 +262,7 @@ aggr_port_create(aggr_grp_t *grp, const datalink_id_t linkid, boolean_t force, fail: if (mch != NULL) mac_client_close(mch, MAC_CLOSE_FLAGS_EXCLUSIVE); + mac_close(mh); return (err); } @@ -270,13 +272,11 @@ aggr_port_delete(aggr_port_t *port) { aggr_lacp_port_t *pl = &port->lp_lacp; - ASSERT(port->lp_mphp == NULL); ASSERT(!port->lp_promisc_on); - port->lp_closing = B_TRUE; + VERIFY0(mac_margin_remove(port->lp_mh, port->lp_margin)); + mac_client_clear_flow_cb(port->lp_mch); - VERIFY(mac_margin_remove(port->lp_mh, port->lp_margin) == 0); - mac_rx_clear(port->lp_mch); /* * If the notification callback is already in process and waiting for * the aggr grp's mac perimeter, don't wait (otherwise there would be @@ -307,8 +307,10 @@ aggr_port_delete(aggr_port_t *port) * port's MAC_NOTE_UNICST notify callback function being called. */ (void) mac_unicast_primary_set(port->lp_mh, port->lp_addr); + if (port->lp_mah != NULL) (void) mac_unicast_remove(port->lp_mch, port->lp_mah); + mac_client_close(port->lp_mch, MAC_CLOSE_FLAGS_EXCLUSIVE); mac_close(port->lp_mh); AGGR_PORT_REFRELE(port); @@ -373,10 +375,14 @@ aggr_port_notify_link(aggr_grp_t *grp, aggr_port_t *port) /* link speed changes? 
*/ ifspeed = aggr_port_stat(port, MAC_STAT_IFSPEED); if (port->lp_ifspeed != ifspeed) { + mutex_enter(&grp->lg_stat_lock); + if (port->lp_state == AGGR_PORT_STATE_ATTACHED) do_detach |= (ifspeed != grp->lg_ifspeed); else do_attach |= (ifspeed == grp->lg_ifspeed); + + mutex_exit(&grp->lg_stat_lock); } port->lp_ifspeed = ifspeed; @@ -515,6 +521,10 @@ aggr_port_stop(aggr_port_t *port) port->lp_started = B_FALSE; } +/* + * Set the promisc mode of the port. If the port is already in the + * requested mode then do nothing. + */ int aggr_port_promisc(aggr_port_t *port, boolean_t on) { @@ -523,27 +533,14 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) ASSERT(MAC_PERIM_HELD(port->lp_mh)); if (on == port->lp_promisc_on) - /* already in desired promiscous mode */ return (0); - if (on) { - mac_rx_clear(port->lp_mch); - rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, - aggr_recv_cb, port, &port->lp_mphp, - MAC_PROMISC_FLAGS_NO_TX_LOOP); - if (rc != 0) { - mac_rx_set(port->lp_mch, aggr_recv_cb, port); - return (rc); - } - } else { - mac_promisc_remove(port->lp_mphp); - port->lp_mphp = NULL; - mac_rx_set(port->lp_mch, aggr_recv_cb, port); - } + rc = mac_set_promisc(port->lp_mh, on); - port->lp_promisc_on = on; + if (rc == 0) + port->lp_promisc_on = on; - return (0); + return (rc); } /* @@ -583,35 +580,45 @@ aggr_port_stat(aggr_port_t *port, uint_t stat) } /* - * Add a non-primary unicast address to the underlying port. If the port - * supports HW Rx group, try to add the address into the HW Rx group of - * the port first. If that fails, or if the port does not support HW Rx - * group, enable the port's promiscous mode. + * Add a non-primary unicast address to the underlying port. If the + * port supports HW Rx groups, then try to add the address filter to + * the HW group first. If that fails, or if the port does not support + * RINGS capab, then enable the port's promiscous mode. */ int -aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) +aggr_port_addmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr) { aggr_unicst_addr_t *addr, **pprev; mac_perim_handle_t pmph; int err; ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* - * If the underlying port support HW Rx group, add the mac to its - * RX group directly. + * If the port doesn't have a HW group to back the aggr's + * pseudo group, then try using the port's default group and + * let the aggr SW classify its traffic. This scenario happens + * when mixing ports with a different number of HW groups. */ - if ((port->lp_hwgh != NULL) && - ((mac_hwgroup_addmac(port->lp_hwgh, mac_addr)) == 0)) { + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + /* + * If there is an underlying HW Rx group, then try adding this + * unicast address to it. + */ + if ((port->lp_hwghs[idx] != NULL) && + ((mac_hwgroup_addmac(port->lp_hwghs[idx], mac_addr)) == 0)) { mac_perim_exit(pmph); return (0); } /* - * If that fails, or if the port does not support HW Rx group, enable - * the port's promiscous mode. (Note that we turn on the promiscous - * mode only if the port is already started. + * If the port doesn't have HW groups, or we failed to add the + * HW filter, then enable the port's promiscuous mode. We + * enable promiscuous mode only if the port is already started. */ if (port->lp_started && ((err = aggr_port_promisc(port, B_TRUE)) != 0)) { @@ -643,13 +650,14 @@ aggr_port_addmac(aggr_port_t *port, const uint8_t *mac_addr) * promiscous mode. 
*/ void -aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) +aggr_port_remmac(aggr_port_t *port, uint_t idx, const uint8_t *mac_addr) { aggr_grp_t *grp = port->lp_grp; aggr_unicst_addr_t *addr, **pprev; mac_perim_handle_t pmph; ASSERT(MAC_PERIM_HELD(grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); mac_perim_enter_by_mh(port->lp_mh, &pmph); /* @@ -662,6 +670,7 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) break; pprev = &addr->aua_next; } + if (addr != NULL) { /* * This unicast address put the port into the promiscous mode, @@ -674,8 +683,65 @@ aggr_port_remmac(aggr_port_t *port, const uint8_t *mac_addr) if (port->lp_prom_addr == NULL && !grp->lg_promisc) (void) aggr_port_promisc(port, B_FALSE); } else { - ASSERT(port->lp_hwgh != NULL); - (void) mac_hwgroup_remmac(port->lp_hwgh, mac_addr); + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + ASSERT3P(port->lp_hwghs[idx], !=, NULL); + (void) mac_hwgroup_remmac(port->lp_hwghs[idx], mac_addr); } + mac_perim_exit(pmph); } + +int +aggr_port_addvlan(aggr_port_t *port, uint_t idx, uint16_t vid) +{ + mac_perim_handle_t pmph; + int err; + + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + /* + * Add the VLAN filter to the HW group if the port has a HW + * group. If the port doesn't have a HW group, then it will + * implicitly allow tagged traffic to pass and there is + * nothing to do. + */ + if (port->lp_hwghs[idx] == NULL) + err = 0; + else + err = mac_hwgroup_addvlan(port->lp_hwghs[idx], vid); + + mac_perim_exit(pmph); + return (err); +} + +int +aggr_port_remvlan(aggr_port_t *port, uint_t idx, uint16_t vid) +{ + mac_perim_handle_t pmph; + int err; + + ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); + ASSERT3U(idx, <, MAX_GROUPS_PER_PORT); + mac_perim_enter_by_mh(port->lp_mh, &pmph); + + /* See comment in aggr_port_addmac(). */ + if (port->lp_hwghs[idx] == NULL) + idx = 0; + + if (port->lp_hwghs[idx] == NULL) + err = 0; + else + err = mac_hwgroup_remvlan(port->lp_hwghs[idx], vid); + + mac_perim_exit(pmph); + return (err); +} diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index 2bdb7872e3..b6b3e6de1f 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -21,6 +21,8 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -55,7 +57,7 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) { aggr_grp_t *grp = port->lp_grp; - /* in promiscuous mode, send copy of packet up */ + /* In promiscuous mode, pass copy of packet up. */ if (grp->lg_promisc) { mblk_t *nmp = copymsg(mp); @@ -68,11 +70,11 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) /* * Callback function invoked by MAC service module when packets are - * made available by a MAC port. + * made available by a MAC port, both in promisc_on mode and not. 
*/ /* ARGSUSED */ -void -aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, +static void +aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t loopback) { aggr_port_t *port = (aggr_port_t *)arg; @@ -161,3 +163,10 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, } } } + +void +aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback); +} diff --git a/usr/src/uts/common/io/bpf/bpf_filter.c b/usr/src/uts/common/io/bpf/bpf_filter.c deleted file mode 100644 index db5b224a5e..0000000000 --- a/usr/src/uts/common/io/bpf/bpf_filter.c +++ /dev/null @@ -1,576 +0,0 @@ -/* $NetBSD: bpf_filter.c,v 1.35 2008/08/20 13:01:54 joerg Exp $ */ - -/* - * Copyright (c) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from the Stanford/CMU enet packet filter, - * (net/enet.c) distributed as part of 4.3BSD, and code contributed - * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence - * Berkeley Laboratory. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)bpf_filter.c 8.1 (Berkeley) 6/10/93 - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <sys/param.h> -#include <sys/time.h> -#include <sys/stream.h> -#include <sys/byteorder.h> -#include <sys/sdt.h> - -#define EXTRACT_SHORT(p) BE_IN16(p) -#define EXTRACT_LONG(p) BE_IN32(p) - -#ifdef _KERNEL -#define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr) -#define mtod(_a, _t) ((_t)((_a)->b_rptr)) -#define MINDEX(len, m, k) \ -{ \ - len = M_LEN(m); \ - while (k >= len) { \ - k -= len; \ - m = m->b_cont; \ - if (m == 0) \ - return (0); \ - len = M_LEN(m); \ - } \ -} - -static int m_xword(mblk_t *, uint32_t, int *); -static int m_xhalf(mblk_t *, uint32_t, int *); - -static int -m_xword(mblk_t *m, uint32_t k, int *err) -{ - int len; - uchar_t *cp, *np; - mblk_t *m0; - - *err = 1; - MINDEX(len, m, k); - cp = mtod(m, uchar_t *) + k; - if (len >= k + 4) { - *err = 0; - return (EXTRACT_LONG(cp)); - } - m0 = m->b_cont; - if (m0 == 0 || M_LEN(m0) + len - k < 4) { - DTRACE_PROBE3(mblk_xword_fail, mblk_t *, m0, int, len, int, k); - return (0); - } - *err = 0; - np = mtod(m0, uchar_t *); - switch (len - k) { - - case 1: - return ((cp[0] << 24) | (np[0] << 16) | (np[1] << 8) | np[2]); - - case 2: - return ((cp[0] << 24) | (cp[1] << 16) | (np[0] << 8) | np[1]); - - default: - return ((cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | np[0]); - } -} - -static int -m_xhalf(mblk_t *m, uint32_t k, int *err) -{ - int len; - uchar_t *cp; - mblk_t *m0; - - *err = 1; - MINDEX(len, m, k); - cp = mtod(m, uchar_t *) + k; - if (len >= k + 2) { - *err = 0; - return (EXTRACT_SHORT(cp)); - } - m0 = m->b_cont; - if (m0 == 0) { - DTRACE_PROBE3(mblk_xhalf_fail, mblk_t *, m0, int, len, int, k); - return (0); - } - *err = 0; - return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]); -} -#else /* _KERNEL */ -#include <stdlib.h> -#endif /* !_KERNEL */ - -#include <net/bpf.h> - -/* - * Execute the filter program starting at pc on the packet p - * wirelen is the length of the original packet - * buflen is the amount of data present - * When buflen is non-0, p is a pointer to a the start of the packet and the - * packet is only in one mblk_t. - * When buflen is 0, p is an mblk_t pointer. - */ -uint_t -bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) -{ - uint32_t A, X, k; - uint32_t mem[BPF_MEMWORDS]; - - if (pc == 0) - /* - * No filter means accept all. 
- */ - return ((uint_t)-1); - A = 0; - X = 0; - --pc; - /* CONSTCOND */ - while (1) { - ++pc; - switch (pc->code) { - - default: -#ifdef _KERNEL - DTRACE_PROBE1(bpf_insn_unknown, - struct bpf_insn *, pc); - return (0); -#else - abort(); -#endif - case BPF_RET|BPF_K: - return ((uint_t)pc->k); - - case BPF_RET|BPF_A: - return ((uint_t)A); - - case BPF_LD|BPF_W|BPF_ABS: - k = pc->k; - if (k + sizeof (int32_t) > buflen) { -#ifdef _KERNEL - int merr = 0; - - if (buflen != 0) - return (0); - A = m_xword((mblk_t *)p, k, &merr); - if (merr != 0) - return (0); - continue; -#else - return (0); -#endif - } - A = EXTRACT_LONG(&p[k]); - continue; - - case BPF_LD|BPF_H|BPF_ABS: - k = pc->k; - if (k + sizeof (int16_t) > buflen) { -#ifdef _KERNEL - int merr; - - if (buflen != 0) - return (0); - A = m_xhalf((mblk_t *)p, k, &merr); - if (merr != 0) - return (0); - continue; -#else - return (0); -#endif - } - A = EXTRACT_SHORT(&p[k]); - continue; - - case BPF_LD|BPF_B|BPF_ABS: - k = pc->k; - if (k >= buflen) { -#ifdef _KERNEL - mblk_t *m; - int len; - - if (buflen != 0) - return (0); - m = (mblk_t *)p; - MINDEX(len, m, k); - A = mtod(m, uchar_t *)[k]; - continue; -#else - return (0); -#endif - } - A = p[k]; - continue; - - case BPF_LD|BPF_W|BPF_LEN: - A = wirelen; - continue; - - case BPF_LDX|BPF_W|BPF_LEN: - X = wirelen; - continue; - - case BPF_LD|BPF_W|BPF_IND: - k = X + pc->k; - if (k + sizeof (int32_t) > buflen) { -#ifdef _KERNEL - int merr = 0; - - if (buflen != 0) - return (0); - A = m_xword((mblk_t *)p, k, &merr); - if (merr != 0) - return (0); - continue; -#else - return (0); -#endif - } - A = EXTRACT_LONG(&p[k]); - continue; - - case BPF_LD|BPF_H|BPF_IND: - k = X + pc->k; - if (k + sizeof (int16_t) > buflen) { -#ifdef _KERNEL - int merr = 0; - - if (buflen != 0) - return (0); - A = m_xhalf((mblk_t *)p, k, &merr); - if (merr != 0) - return (0); - continue; -#else - return (0); -#endif - } - A = EXTRACT_SHORT(&p[k]); - continue; - - case BPF_LD|BPF_B|BPF_IND: - k = X + pc->k; - if (k >= buflen) { -#ifdef _KERNEL - mblk_t *m; - int len; - - if (buflen != 0) - return (0); - m = (mblk_t *)p; - MINDEX(len, m, k); - A = mtod(m, uchar_t *)[k]; - continue; -#else - return (0); -#endif - } - A = p[k]; - continue; - - case BPF_LDX|BPF_MSH|BPF_B: - k = pc->k; - if (k >= buflen) { -#ifdef _KERNEL - mblk_t *m; - int len; - - if (buflen != 0) - return (0); - m = (mblk_t *)p; - MINDEX(len, m, k); - X = (mtod(m, char *)[k] & 0xf) << 2; - continue; -#else - return (0); -#endif - } - X = (p[pc->k] & 0xf) << 2; - continue; - - case BPF_LD|BPF_IMM: - A = pc->k; - continue; - - case BPF_LDX|BPF_IMM: - X = pc->k; - continue; - - case BPF_LD|BPF_MEM: - A = mem[pc->k]; - continue; - - case BPF_LDX|BPF_MEM: - X = mem[pc->k]; - continue; - - case BPF_ST: - mem[pc->k] = A; - continue; - - case BPF_STX: - mem[pc->k] = X; - continue; - - case BPF_JMP|BPF_JA: - pc += pc->k; - continue; - - case BPF_JMP|BPF_JGT|BPF_K: - pc += (A > pc->k) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JGE|BPF_K: - pc += (A >= pc->k) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JEQ|BPF_K: - pc += (A == pc->k) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JSET|BPF_K: - pc += (A & pc->k) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JGT|BPF_X: - pc += (A > X) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JGE|BPF_X: - pc += (A >= X) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JEQ|BPF_X: - pc += (A == X) ? pc->jt : pc->jf; - continue; - - case BPF_JMP|BPF_JSET|BPF_X: - pc += (A & X) ? 
pc->jt : pc->jf; - continue; - - case BPF_ALU|BPF_ADD|BPF_X: - A += X; - continue; - - case BPF_ALU|BPF_SUB|BPF_X: - A -= X; - continue; - - case BPF_ALU|BPF_MUL|BPF_X: - A *= X; - continue; - - case BPF_ALU|BPF_DIV|BPF_X: - if (X == 0) - return (0); - A /= X; - continue; - - case BPF_ALU|BPF_AND|BPF_X: - A &= X; - continue; - - case BPF_ALU|BPF_OR|BPF_X: - A |= X; - continue; - - case BPF_ALU|BPF_LSH|BPF_X: - A <<= X; - continue; - - case BPF_ALU|BPF_RSH|BPF_X: - A >>= X; - continue; - - case BPF_ALU|BPF_ADD|BPF_K: - A += pc->k; - continue; - - case BPF_ALU|BPF_SUB|BPF_K: - A -= pc->k; - continue; - - case BPF_ALU|BPF_MUL|BPF_K: - A *= pc->k; - continue; - - case BPF_ALU|BPF_DIV|BPF_K: - A /= pc->k; - continue; - - case BPF_ALU|BPF_AND|BPF_K: - A &= pc->k; - continue; - - case BPF_ALU|BPF_OR|BPF_K: - A |= pc->k; - continue; - - case BPF_ALU|BPF_LSH|BPF_K: - A <<= pc->k; - continue; - - case BPF_ALU|BPF_RSH|BPF_K: - A >>= pc->k; - continue; - - case BPF_ALU|BPF_NEG: - A = -A; - continue; - - case BPF_MISC|BPF_TAX: - X = A; - continue; - - case BPF_MISC|BPF_TXA: - A = X; - continue; - } - } - /* NOTREACHED */ -} - -#ifdef _KERNEL -/* - * Return true if the 'fcode' is a valid filter program. - * The constraints are that each jump be forward and to a valid - * code, that memory accesses are within valid ranges (to the - * extent that this can be checked statically; loads of packet - * data have to be, and are, also checked at run time), and that - * the code terminates with either an accept or reject. - * - * The kernel needs to be able to verify an application's filter code. - * Otherwise, a bogus program could easily crash the system. - */ -int -bpf_validate(struct bpf_insn *f, int len) -{ - uint_t i, from; - struct bpf_insn *p; - - if (len < 1 || len > BPF_MAXINSNS) - return (0); - - for (i = 0; i < len; ++i) { - p = &f[i]; - DTRACE_PROBE1(bpf_valid_insn, struct bpf_insn *, p); - switch (BPF_CLASS(p->code)) { - /* - * Check that memory operations use valid addresses. - */ - case BPF_LD: - case BPF_LDX: - switch (BPF_MODE(p->code)) { - case BPF_MEM: - if (p->k >= BPF_MEMWORDS) - return (0); - break; - case BPF_ABS: - case BPF_IND: - case BPF_MSH: - case BPF_IMM: - case BPF_LEN: - break; - default: - return (0); - } - break; - case BPF_ST: - case BPF_STX: - if (p->k >= BPF_MEMWORDS) - return (0); - break; - case BPF_ALU: - switch (BPF_OP(p->code)) { - case BPF_ADD: - case BPF_SUB: - case BPF_MUL: - case BPF_OR: - case BPF_AND: - case BPF_LSH: - case BPF_RSH: - case BPF_NEG: - break; - case BPF_DIV: - /* - * Check for constant division by 0. - */ - if (BPF_RVAL(p->code) == BPF_K && p->k == 0) - return (0); - break; - default: - return (0); - } - break; - case BPF_JMP: - /* - * Check that jumps are within the code block, - * and that unconditional branches don't go - * backwards as a result of an overflow. - * Unconditional branches have a 32-bit offset, - * so they could overflow; we check to make - * sure they don't. Conditional branches have - * an 8-bit offset, and the from address is <= - * BPF_MAXINSNS, and we assume that BPF_MAXINSNS - * is sufficiently small that adding 255 to it - * won't overflow. - * - * We know that len is <= BPF_MAXINSNS, and we - * assume that BPF_MAXINSNS is < the maximum size - * of a uint_t, so that i + 1 doesn't overflow. 
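To make the bpf_validate()/bpf_filter() interface concrete -- the same interface the new bpf_wrap.c shim below continues to expose on top of the ip module -- here is a small usage sketch. It is illustrative only: the program and the wrapper function are examples, not code from this changeset.

#include <sys/types.h>
#include <net/bpf.h>

/*
 * Sketch only: a minimal classic-BPF program that accepts IPv4
 * Ethernet frames (EtherType 0x0800 at byte offset 12) and rejects
 * everything else. Instruction fields are { code, jt, jf, k }.
 */
static struct bpf_insn ipv4_only[] = {
        { BPF_LD|BPF_H|BPF_ABS,  0, 0, 12 },            /* A = EtherType */
        { BPF_JMP|BPF_JEQ|BPF_K, 0, 1, 0x0800 },        /* IPv4? else skip 1 */
        { BPF_RET|BPF_K,         0, 0, 0xffffffffU },   /* accept whole packet */
        { BPF_RET|BPF_K,         0, 0, 0 }              /* reject */
};

/*
 * pkt/wirelen/buflen describe a contiguous packet buffer, as in the
 * interpreter above. Zero means "drop"; any other value is the number
 * of bytes to accept.
 */
static uint_t
ipv4_only_match(uchar_t *pkt, uint_t wirelen, uint_t buflen)
{
        if (bpf_validate(ipv4_only, 4) == 0)
                return (0);
        return (bpf_filter(ipv4_only, pkt, wirelen, buflen));
}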
- */ - from = i + 1; - switch (BPF_OP(p->code)) { - case BPF_JA: - if (from + p->k < from || from + p->k >= len) - return (0); - break; - case BPF_JEQ: - case BPF_JGT: - case BPF_JGE: - case BPF_JSET: - if (from + p->jt >= len || from + p->jf >= len) - return (0); - break; - default: - return (0); - } - break; - case BPF_RET: - break; - case BPF_MISC: - break; - default: - return (0); - } - } - - return (BPF_CLASS(f[len - 1].code) == BPF_RET); -} -#endif diff --git a/usr/src/uts/common/io/bpf/bpf_wrap.c b/usr/src/uts/common/io/bpf/bpf_wrap.c new file mode 100644 index 0000000000..6cbde58a20 --- /dev/null +++ b/usr/src/uts/common/io/bpf/bpf_wrap.c @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <net/bpf.h> +#include <inet/bpf.h> + +/* + * With BPF filter validation and evaluation moved into the 'ip' module, these + * wrapper functions are provided to expose the original interface. + */ + +uint_t +bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) +{ + return ((uint_t)ip_bpf_filter((ip_bpf_insn_t *)pc, p, wirelen, buflen)); +} + +int +bpf_validate(struct bpf_insn *f, int len) +{ + return ((int)ip_bpf_validate((ip_bpf_insn_t *)f, (uint_t)len)); +} diff --git a/usr/src/uts/common/io/bridge.c b/usr/src/uts/common/io/bridge.c index 28f16175ee..dc79139ce6 100644 --- a/usr/src/uts/common/io/bridge.c +++ b/usr/src/uts/common/io/bridge.c @@ -23,6 +23,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -41,6 +42,7 @@ #include <sys/modctl.h> #include <sys/note.h> #include <sys/param.h> +#include <sys/pattr.h> #include <sys/policy.h> #include <sys/sdt.h> #include <sys/stat.h> @@ -1693,7 +1695,8 @@ bridge_learn(bridge_link_t *blp, const uint8_t *saddr, uint16_t ingress_nick, * The passed-in tci is the "impossible" value 0xFFFF when no tag is present. */ static mblk_t * -reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid) +reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid, + boolean_t keep_flags) { boolean_t source_has_tag = (tci != 0xFFFF); mblk_t *mpcopy; @@ -1705,8 +1708,13 @@ reform_vlan_header(mblk_t *mp, uint16_t vlanid, uint16_t tci, uint16_t pvid) if (mp == NULL) return (mp); - /* No forwarded packet can have hardware checksum enabled */ - DB_CKSUMFLAGS(mp) = 0; + /* + * A forwarded packet cannot have HW offloads enabled unless + * the destination is known to be local to the host and HW + * offloads haven't been emulated. 
+ */ + if (!keep_flags) + DB_CKSUMFLAGS(mp) = 0; /* Get the no-modification cases out of the way first */ if (!source_has_tag && vlanid == pvid) /* 1a */ @@ -1907,17 +1915,46 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, blp->bl_trillthreads++; mutex_exit(&blp->bl_trilllock); update_header(mp, hdr_info, B_FALSE); - if (is_xmit) - mp = mac_fix_cksum(mp); - /* all trill data frames have Inner.VLAN */ - mp = reform_vlan_header(mp, vlanid, tci, 0); - if (mp == NULL) { - KIINCR(bki_drops); - fwd_unref(bfp); - return (NULL); + + if (is_xmit) { + mac_hw_emul(&mp, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); + + if (mp == NULL) { + KIINCR(bki_drops); + goto done; + } } - trill_encap_fn(tdp, blp, hdr_info, mp, - bfp->bf_trill_nick); + + while (mp != NULL) { + mblk_t *next = mp->b_next; + + mp->b_next = NULL; + + /* + * All trill data frames have + * Inner.VLAN. + */ + mp = reform_vlan_header(mp, vlanid, tci, + 0, B_FALSE); + + if (mp == NULL) { + /* + * Make sure to free + * any remaining + * segments. + */ + freemsgchain(next); + KIINCR(bki_drops); + goto done; + } + + trill_encap_fn(tdp, blp, hdr_info, mp, + bfp->bf_trill_nick); + mp = next; + } + +done: mutex_enter(&blp->bl_trilllock); if (--blp->bl_trillthreads == 0 && blp->bl_trilldata == NULL) @@ -1959,31 +1996,68 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, mpsend = copymsg(mp); } - if (!from_trill && is_xmit) - mpsend = mac_fix_cksum(mpsend); + /* + * If the destination is not local to the host + * then we need to emulate HW offloads because + * we can't guarantee the forwarding + * destination provides them. + */ + if (!from_trill && is_xmit && + !(bfp->bf_flags & BFF_LOCALADDR)) { + mac_hw_emul(&mpsend, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - mpsend = reform_vlan_header(mpsend, vlanid, tci, - blpsend->bl_pvid); - if (mpsend == NULL) { - KIINCR(bki_drops); - continue; + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; + } + } + + /* + * The HW emulation above may have segmented + * an LSO mblk. + */ + while ((mpsend != NULL) && + !(bfp->bf_flags & BFF_LOCALADDR)) { + mblk_t *next = mpsend->b_next; + + mpsend->b_next = NULL; + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid, B_FALSE); + + if (mpsend == NULL) { + KIINCR(bki_drops); + mpsend = next; + continue; + } + + KIINCR(bki_forwards); + KLPINCR(blpsend, bkl_xmit); + MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, + mpsend); + freemsg(mpsend); + mpsend = next; } - KIINCR(bki_forwards); /* * No need to bump up the link reference count, as * the forwarding entry itself holds a reference to * the link. */ if (bfp->bf_flags & BFF_LOCALADDR) { + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid, B_TRUE); + + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; + } + + KIINCR(bki_forwards); mac_rx_common(blpsend->bl_mh, NULL, mpsend); - } else { - KLPINCR(blpsend, bkl_xmit); - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, - mpsend); - freemsg(mpsend); } } + /* * Handle a special case: if we're transmitting to the original * link, then check whether the localaddr flag is set. 
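The forwarding paths above repeat one pattern: after mac_hw_emul() may have split an LSO message into a chain linked through b_next, each segment is detached, re-tagged, and transmitted on its own, and a failure mid-chain must dispose of the remaining segments. A hedged, stripped-down sketch of that walk follows; per_segment() is an invented stand-in for reform_vlan_header() plus the transmit call and is assumed to consume its message in all cases.

#include <sys/stream.h>
#include <sys/strsun.h>

/* Stand-in for reform_vlan_header() plus the transmit; consumes mp. */
static int per_segment(mblk_t *mp);

/*
 * Sketch only: walk a b_next chain produced by mac_hw_emul(), handling
 * one detached segment at a time. On failure the rest of the chain is
 * freed, as the TRILL path above does.
 */
static void
forward_chain(mblk_t *mp)
{
        while (mp != NULL) {
                mblk_t *next = mp->b_next;

                mp->b_next = NULL;              /* detach this segment */
                if (per_segment(mp) != 0) {
                        freemsgchain(next);     /* drop what remains */
                        return;
                }
                mp = next;
        }
}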
If it @@ -2019,7 +2093,7 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, * Inner.VLAN */ mpsend = reform_vlan_header(mpsend, - vlanid, tci, 0); + vlanid, tci, 0, B_FALSE); if (mpsend == NULL) { KIINCR(bki_drops); } else { @@ -2070,25 +2144,57 @@ bridge_forward(bridge_link_t *blp, mac_header_info_t *hdr_info, mblk_t *mp, mpsend = copymsg(mp); } - if (!from_trill && is_xmit) - mpsend = mac_fix_cksum(mpsend); + /* + * In this case, send to all links connected + * to the bridge. Some of these destinations + * may not provide HW offload -- so just + * emulate it here. + */ + if (!from_trill && is_xmit) { + mac_hw_emul(&mpsend, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); - mpsend = reform_vlan_header(mpsend, vlanid, tci, - blpsend->bl_pvid); - if (mpsend == NULL) { - KIINCR(bki_drops); - continue; + if (mpsend == NULL) { + KIINCR(bki_drops); + continue; + } + } + + /* + * The HW emulation above may have segmented + * an LSO mblk. + */ + while (mpsend != NULL) { + mblk_t *next = mpsend->b_next; + + mpsend->b_next = NULL; + mpsend = reform_vlan_header(mpsend, vlanid, tci, + blpsend->bl_pvid, B_FALSE); + + if (mpsend == NULL) { + KIINCR(bki_drops); + mpsend = next; + continue; + } + + if (hdr_info->mhi_dsttype == + MAC_ADDRTYPE_UNICAST) + KIINCR(bki_unknown); + else + KIINCR(bki_mbcast); + + KLPINCR(blpsend, bkl_xmit); + if ((mpcopy = copymsg(mpsend)) != NULL) { + mac_rx_common(blpsend->bl_mh, NULL, + mpcopy); + } + + MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, + mpsend); + freemsg(mpsend); + mpsend = next; } - if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST) - KIINCR(bki_unknown); - else - KIINCR(bki_mbcast); - KLPINCR(blpsend, bkl_xmit); - if ((mpcopy = copymsg(mpsend)) != NULL) - mac_rx_common(blpsend->bl_mh, NULL, mpcopy); - MAC_RING_TX(blpsend->bl_mh, NULL, mpsend, mpsend); - freemsg(mpsend); link_unref(blpsend); } } diff --git a/usr/src/uts/common/io/bufmod.c b/usr/src/uts/common/io/bufmod.c index f553c58aff..c1dd697a78 100644 --- a/usr/src/uts/common/io/bufmod.c +++ b/usr/src/uts/common/io/bufmod.c @@ -22,10 +22,9 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * STREAMS Buffering module * @@ -1016,15 +1015,15 @@ sbaddmsg(queue_t *rq, mblk_t *mp) pad = Align(hp.sbh_totlen); hp.sbh_totlen += sizeof (hp); - hp.sbh_totlen += pad; + + /* We can't fit this message on the current chunk. */ + if ((sbp->sb_mlen + hp.sbh_totlen) > sbp->sb_chunk) + sbclosechunk(sbp); /* - * Would the inclusion of this message overflow the current - * chunk? If so close the chunk off and start a new one. + * If we closed it (just now or during a previous + * call) then allocate the head of a new chunk. */ - if ((hp.sbh_totlen + sbp->sb_mlen) > sbp->sb_chunk) - sbclosechunk(sbp); - if (sbp->sb_head == NULL) { /* Allocate leading header of new chunk */ sbp->sb_head = allocb(sizeof (hp), BPRI_MED); @@ -1044,36 +1043,41 @@ sbaddmsg(queue_t *rq, mblk_t *mp) } /* - * Copy header into message + * Set the header values and join the message to the + * chunk. The header values are copied into the chunk + * after we adjust for padding below. 
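The reordered sbaddmsg() accounting (started above and continued just below) reduces to two comparisons against the chunk budget: one before the message is appended and one afterwards that decides whether alignment padding is still worthwhile. A hedged restatement with invented names; 'pad' is whatever alignment padding the module computes for the header and 'smallest' is the SMALLEST_MESSAGE lower bound.

#include <sys/types.h>

/* Close the open chunk first if this message cannot fit into it. */
static boolean_t
must_close_before(size_t mlen, size_t totlen, size_t chunk)
{
        return (mlen + totlen > chunk);
}

/*
 * After appending, close immediately (and skip the padding) when not
 * even the smallest possible follow-on message would still fit.
 */
static boolean_t
must_close_after(size_t mlen, size_t totlen, size_t pad, size_t chunk,
    size_t smallest)
{
        return (mlen + totlen + pad + smallest > chunk);
}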
*/ hp.sbh_drops = sbp->sb_drops; hp.sbh_origlen = origlen; - (void) memcpy(sbp->sb_head->b_wptr, (char *)&hp, sizeof (hp)); - sbp->sb_head->b_wptr += sizeof (hp); - - ASSERT(sbp->sb_head->b_wptr <= sbp->sb_head->b_datap->db_lim); - - /* - * Join message to the chunk - */ linkb(sbp->sb_head, mp); - sbp->sb_mcount++; sbp->sb_mlen += hp.sbh_totlen; /* - * If the first message alone is too big for the chunk close - * the chunk now. - * If the next message would immediately cause the chunk to - * overflow we may as well close the chunk now. The next - * message is certain to be at least SMALLEST_MESSAGE size. + * There's no chance to fit another message on the + * chunk -- forgo the padding and close the chunk. */ - if (hp.sbh_totlen + SMALLEST_MESSAGE > sbp->sb_chunk) { + if ((sbp->sb_mlen + pad + SMALLEST_MESSAGE) > sbp->sb_chunk) { + (void) memcpy(sbp->sb_head->b_wptr, (char *)&hp, + sizeof (hp)); + sbp->sb_head->b_wptr += sizeof (hp); + ASSERT(sbp->sb_head->b_wptr <= + sbp->sb_head->b_datap->db_lim); sbclosechunk(sbp); return; } /* + * We may add another message to this chunk -- adjust + * the headers for padding to be added below. + */ + hp.sbh_totlen += pad; + (void) memcpy(sbp->sb_head->b_wptr, (char *)&hp, sizeof (hp)); + sbp->sb_head->b_wptr += sizeof (hp); + ASSERT(sbp->sb_head->b_wptr <= sbp->sb_head->b_datap->db_lim); + sbp->sb_mlen += pad; + + /* * Find space for the wrapper. The wrapper consists of: * * 1) Padding for this message (this is to ensure each header @@ -1086,7 +1090,6 @@ sbaddmsg(queue_t *rq, mblk_t *mp) * of the message, but only if we 'own' the data. If the dblk * has been shared through dupmsg() we mustn't alter it. */ - wrapperlen = (sizeof (hp) + pad); /* Is there space for the wrapper beyond the message's data ? */ diff --git a/usr/src/uts/common/io/chxge/ch.c b/usr/src/uts/common/io/chxge/ch.c index 9f1f7f87de..80400061af 100644 --- a/usr/src/uts/common/io/chxge/ch.c +++ b/usr/src/uts/common/io/chxge/ch.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* @@ -59,6 +60,7 @@ #include <sys/sunddi.h> #include <sys/dlpi.h> #include <sys/ethernet.h> +#include <sys/mac_provider.h> #include <sys/strsun.h> #include <sys/strsubr.h> #include <inet/common.h> @@ -1377,8 +1379,7 @@ ch_send_up(ch_t *chp, mblk_t *mp, uint32_t cksum, int flg) * set in /etc/system (see sge.c). */ if (flg) - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, cksum, - HCK_FULLCKSUM, 0); + mac_hcksum_set(mp, 0, 0, 0, cksum, HCK_FULLCKSUM); gld_recv(chp->ch_macp, mp); } else { freemsg(mp); @@ -1693,8 +1694,7 @@ ch_send(gld_mac_info_t *macinfo, mblk_t *mp) msg_flg = 0; if (chp->ch_config.cksum_enabled) { if (is_T2(chp)) { - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, - NULL, &msg_flg); + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &msg_flg); flg = (msg_flg & HCK_FULLCKSUM)? CH_NO_CPL: CH_NO_HWCKSUM|CH_NO_CPL; } else diff --git a/usr/src/uts/common/io/cons.c b/usr/src/uts/common/io/cons.c index 6ef1b0b9f7..495ae93cf9 100644 --- a/usr/src/uts/common/io/cons.c +++ b/usr/src/uts/common/io/cons.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. 
*/ /* @@ -53,6 +54,7 @@ #include <sys/vnode.h> #include <sys/uio.h> #include <sys/stat.h> +#include <sys/limits.h> #include <sys/console.h> #include <sys/consdev.h> @@ -414,14 +416,24 @@ cnwrite(dev_t dev, struct uio *uio, struct cred *cred) */ if (vsconsvp != NULL && vsconsvp->v_stream != NULL) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; + + if (uio->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uio->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } /* * strwrite modifies uio so need to make copy. */ - (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, uio->uio_iovcnt); (void) strwrite(vsconsvp, &uiod.d_uio, cred); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); } if (rconsvp->v_stream != NULL) diff --git a/usr/src/uts/common/io/cpqary3/cpqary3.c b/usr/src/uts/common/io/cpqary3/cpqary3.c index 622f0dcf68..f67d77b3d2 100644 --- a/usr/src/uts/common/io/cpqary3/cpqary3.c +++ b/usr/src/uts/common/io/cpqary3/cpqary3.c @@ -41,7 +41,7 @@ extern cpqary3_driver_info_t gdriver_info; * Global Variables Definitions */ -static char cpqary3_brief[] = "HP Smart Array Driver"; +static char cpqary3_brief[] = "HP Smart Array (Legacy)"; void *cpqary3_state; /* HPQaculi Changes */ diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c index 4ef14a99de..e11ab34142 100644 --- a/usr/src/uts/common/io/devpoll.c +++ b/usr/src/uts/common/io/devpoll.c @@ -25,7 +25,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -245,30 +245,20 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) * stale entries! */ static int -dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, - pollcache_t *pcp, nfds_t nfds, int *fdcntp) +dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds, + int *fdcntp) { - int start, ostart, end; - int fdcnt, fd; - boolean_t done; - file_t *fp; - short revent; - boolean_t no_wrap; - pollhead_t *php; - polldat_t *pdp; + int start, ostart, end, fdcnt, error = 0; + boolean_t done, no_wrap; pollfd_t *pfdp; epoll_event_t *epoll; - int error = 0; - short mask = POLLRDHUP | POLLWRBAND; - boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; + const short mask = POLLRDHUP | POLLWRBAND; + const boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; ASSERT(MUTEX_HELD(&pcp->pc_lock)); if (pcp->pc_bitmap == NULL) { - /* - * No Need to search because no poll fd - * has been cached. - */ - return (error); + /* No Need to search because no poll fd has been cached. */ + return (0); } if (is_epoll) { @@ -281,7 +271,6 @@ dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, retry: start = ostart = pcp->pc_mapstart; end = pcp->pc_mapend; - php = NULL; if (start == 0) { /* @@ -294,8 +283,11 @@ retry: done = B_FALSE; fdcnt = 0; while ((fdcnt < nfds) && !done) { - php = NULL; - revent = 0; + pollhead_t *php = NULL; + short revent = 0; + uf_entry_gen_t gen; + int fd; + /* * Examine the bit map in a circular fashion * to avoid starvation. 
Always resume from @@ -305,6 +297,9 @@ retry: fd = bt_getlowbit(pcp->pc_bitmap, start, end); ASSERT(fd <= end); if (fd >= 0) { + file_t *fp; + polldat_t *pdp; + if (fd == end) { if (no_wrap) { done = B_TRUE; @@ -328,28 +323,14 @@ repoll: */ continue; } - if ((fp = getf(fd)) == NULL) { - /* - * The fd has been closed, but user has not - * done a POLLREMOVE on this fd yet. Instead - * of cleaning it here implicitly, we return - * POLLNVAL. This is consistent with poll(2) - * polling a closed fd. Hope this will remind - * user to do a POLLREMOVE. - */ - if (!is_epoll && pfdp != NULL) { - pfdp[fdcnt].fd = fd; - pfdp[fdcnt].revents = POLLNVAL; - fdcnt++; - continue; - } - - /* - * In the epoll compatibility case, we actually - * perform the implicit removal to remain - * closer to the epoll semantics. - */ + if ((fp = getf_gen(fd, &gen)) == NULL) { if (is_epoll) { + /* + * In the epoll compatibility case, we + * actually perform the implicit + * removal to remain closer to the + * epoll semantics. + */ pdp->pd_fp = NULL; pdp->pd_events = 0; @@ -360,30 +341,36 @@ repoll: } BT_CLEAR(pcp->pc_bitmap, fd); - continue; + } else if (pfdp != NULL) { + /* + * The fd has been closed, but user has + * not done a POLLREMOVE on this fd + * yet. Instead of cleaning it here + * implicitly, we return POLLNVAL. This + * is consistent with poll(2) polling a + * closed fd. Hope this will remind + * user to do a POLLREMOVE. + */ + pfdp[fdcnt].fd = fd; + pfdp[fdcnt].revents = POLLNVAL; + fdcnt++; } + continue; } - if (fp != pdp->pd_fp) { + /* + * Detect a change to the resource underlying a cached + * file descriptor. While the fd generation comparison + * will catch nearly all cases, the file_t comparison + * is maintained as a failsafe as well. + */ + if (gen != pdp->pd_gen || fp != pdp->pd_fp) { /* * The user is polling on a cached fd which was * closed and then reused. Unfortunately there * is no good way to communicate this fact to * the consumer. * - * If the file struct is also reused, we may - * not be able to detect the fd reuse at all. - * As long as this does not cause system - * failure and/or memory leaks, we will play - * along. The man page states that if the user - * does not clean up closed fds, polling - * results will be indeterministic. - * - * XXX: perhaps log the detection of fd reuse? - */ - pdp->pd_fp = fp; - - /* * When this situation has been detected, it's * likely that any existing pollhead is * ill-suited to perform proper wake-ups. @@ -396,7 +383,42 @@ repoll: pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } + + /* + * Since epoll is expected to act on the + * underlying 'struct file' (in Linux terms, + * our vnode_t would be a closer analog) rather + * than the fd itself, an implicit remove + * is necessary under these circumstances to + * suppress any results (or errors) from the + * new resource occupying the fd. + */ + if (is_epoll) { + pdp->pd_fp = NULL; + pdp->pd_events = 0; + BT_CLEAR(pcp->pc_bitmap, fd); + releasef(fd); + continue; + } else { + /* + * Regular /dev/poll is unbothered + * about the fd reassignment. + */ + pdp->pd_fp = fp; + pdp->pd_gen = gen; + } } + + /* + * Skip entries marked with the sentinal value for + * having already fired under oneshot conditions. + */ + if (pdp->pd_events == POLLONESHOT) { + releasef(fd); + BT_CLEAR(pcp->pc_bitmap, fd); + continue; + } + /* * XXX - pollrelock() logic needs to know which * which pollcache lock to grab. It'd be a @@ -537,18 +559,19 @@ repoll: /* Handle special polling modes. 
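dp_pcache_poll() above is the engine behind the DP_POLL ioctl. For context, this is roughly how a plain (non-epoll) consumer drives it through the documented /dev/poll interface: descriptors are registered by write()ing pollfd entries and harvested with DP_POLL. A sketch with error handling trimmed.

#include <sys/types.h>
#include <sys/devpoll.h>
#include <fcntl.h>
#include <poll.h>
#include <stropts.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct pollfd pfd;
        struct pollfd ready[4];
        struct dvpoll dvp;
        int dpfd = open("/dev/poll", O_RDWR);

        /* Register stdin for readability. */
        pfd.fd = 0;
        pfd.events = POLLIN;
        pfd.revents = 0;
        (void) write(dpfd, &pfd, sizeof (pfd));

        /* Harvest up to 4 ready descriptors, waiting at most 5 seconds. */
        dvp.dp_fds = ready;
        dvp.dp_nfds = 4;
        dvp.dp_timeout = 5000;
        (void) printf("ready: %d\n", ioctl(dpfd, DP_POLL, &dvp));

        (void) close(dpfd);
        return (0);
}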
*/ if (pdp->pd_events & POLLONESHOT) { /* - * If POLLONESHOT is set, perform the - * implicit POLLREMOVE. + * Entries operating under POLLONESHOT + * will be marked with a sentinel value + * to indicate that they have "fired" + * when emitting an event. This will + * disable them from polling until a + * later add/modify event rearms them. */ - pdp->pd_fp = NULL; - pdp->pd_events = 0; - + pdp->pd_events = POLLONESHOT; if (pdp->pd_php != NULL) { pollhead_delete(pdp->pd_php, pdp); pdp->pd_php = NULL; } - BT_CLEAR(pcp->pc_bitmap, fd); } else if (pdp->pd_events & POLLET) { /* @@ -700,14 +723,10 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pollfd_t *pollfdp, *pfdp; dvpoll_epollfd_t *epfdp; uintptr_t limit; - int error, size; - ssize_t uiosize; - size_t copysize; + int error; + uint_t size; + size_t copysize, uiosize; nfds_t pollfdnum; - struct pollhead *php = NULL; - polldat_t *pdp; - int fd; - file_t *fp; boolean_t is_epoll, fds_added = B_FALSE; minor = getminor(dev); @@ -732,10 +751,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pcp->pc_pid = curproc->p_pid; } - uiosize = uiop->uio_resid; + if (uiop->uio_resid < 0) { + /* No one else is this careful, but maybe they should be. */ + return (EINVAL); + } + + uiosize = (size_t)uiop->uio_resid; pollfdnum = uiosize / size; /* + * For epoll-enabled handles, restrict the allowed write size to 2. + * This corresponds to an epoll_ctl(3C) performing an EPOLL_CTL_MOD + * operation which is expanded into two operations (DEL and ADD). + * + * All other operations performed through epoll_ctl(3C) will consist of + * a single entry. + */ + if (is_epoll && pollfdnum > 2) { + return (EINVAL); + } + + /* * We want to make sure that pollfdnum isn't large enough to DoS us, * but we also don't want to grab p_lock unnecessarily -- so we * perform the full check against our resource limits if and only if @@ -794,6 +830,21 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) { ASSERT(dpep->dpe_refcnt != 0); + /* + * The epoll API does not allow EINTR as a result when making + * modifications to the set of polled fds. Given that write + * activity is relatively quick and the size of accepted writes + * is limited above to two entries, a signal-ignorant wait is + * used here to avoid the EINTR. + */ + if (is_epoll) { + cv_wait(&dpep->dpe_cv, &dpep->dpe_lock); + continue; + } + + /* + * Non-epoll writers to /dev/poll handles can tolerate EINTR. + */ if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { dpep->dpe_writerwait--; mutex_exit(&dpep->dpe_lock); @@ -828,7 +879,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) } for (pfdp = pollfdp; (uintptr_t)pfdp < limit; pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) { - fd = pfdp->fd; + int fd = pfdp->fd; + polldat_t *pdp; + if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) { /* * epoll semantics demand that we return EBADF if our @@ -844,76 +897,60 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) pdp = pcache_lookup_fd(pcp, fd); if (pfdp->events != POLLREMOVE) { + uf_entry_gen_t gen; + file_t *fp = NULL; + struct pollhead *php = NULL; - fp = NULL; - - if (pdp == NULL) { - /* - * If we're in epoll compatibility mode, check - * that the fd is valid before allocating - * anything for it; epoll semantics demand that - * we return EBADF if our specified fd is - * invalid. 
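The POLLONESHOT sentinel above is what gives epoll's EPOLLONESHOT its "fire once, then stay disarmed until rearmed" behaviour, and the two-entry write limit corresponds to EPOLL_CTL_MOD being expanded into a DEL plus an ADD. From a consumer's point of view the contract looks like the following hedged userland sketch using the epoll API.

#include <sys/epoll.h>

/*
 * Sketch: register fd as oneshot, consume one event, then rearm it.
 * In the kernel, the first wakeup marks the cached entry with the
 * POLLONESHOT sentinel; the EPOLL_CTL_MOD below clears it again.
 */
static int
oneshot_once(int epfd, int fd)
{
        struct epoll_event ev;
        struct epoll_event out;

        ev.events = EPOLLIN | EPOLLONESHOT;
        ev.data.fd = fd;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) != 0)
                return (-1);

        if (epoll_wait(epfd, &out, 1, -1) != 1)
                return (-1);

        /* fd is now disarmed; a MOD (DEL + ADD internally) rearms it. */
        return (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev));
}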
- */ - if (is_epoll) { - if ((fp = getf(fd)) == NULL) { - error = EBADF; - break; - } + /* + * If we're in epoll compatibility mode, check that the + * fd is valid before allocating anything for it; epoll + * semantics demand that we return EBADF if our + * specified fd is invalid. + */ + if (is_epoll) { + if ((fp = getf_gen(fd, &gen)) == NULL) { + error = EBADF; + break; } - + } + if (pdp == NULL) { pdp = pcache_alloc_fd(0); pdp->pd_fd = fd; pdp->pd_pcache = pcp; pcache_insert_fd(pcp, pdp, pollfdnum); - } else { + } + + if (is_epoll) { /* - * epoll semantics demand that we error out if - * a file descriptor is added twice, which we - * check (imperfectly) by checking if we both - * have the file descriptor cached and the - * file pointer that correponds to the file - * descriptor matches our cached value. If - * there is a pointer mismatch, the file - * descriptor was closed without being removed. - * The converse is clearly not true, however, - * so to narrow the window by which a spurious - * EEXIST may be returned, we also check if - * this fp has been added to an epoll control - * descriptor in the past; if it hasn't, we - * know that this is due to fp reuse -- it's - * not a true EEXIST case. (By performing this - * additional check, we limit the window of - * spurious EEXIST to situations where a single - * file descriptor is being used across two or - * more epoll control descriptors -- and even - * then, the file descriptor must be closed and - * reused in a relatively tight time span.) + * If the fd is already a member of the epoll + * set, error emission is needed only when the + * fd assignment generation matches the one + * recorded in the polldat_t. Absence of such + * a generation match indicates that a new + * resource has been assigned at that fd. + * + * Caveat: It is possible to force a generation + * update while keeping the same backing + * resource. This is possible via dup2, but + * does not represent real-world use cases, + * making the lack of error acceptable. */ - if (is_epoll) { - if (pdp->pd_fp != NULL && - (fp = getf(fd)) != NULL && - fp == pdp->pd_fp && - (fp->f_flag2 & FEPOLLED)) { - error = EEXIST; - releasef(fd); - break; - } - - /* - * We have decided that the cached - * information was stale: it either - * didn't match, or the fp had never - * actually been epoll()'d on before. - * We need to now clear our pd_events - * to assure that we don't mistakenly - * operate on cached event disposition. - */ - pdp->pd_events = 0; + if (pdp->pd_fp != NULL && pdp->pd_gen == gen) { + error = EEXIST; + releasef(fd); + break; } - } - if (is_epoll) { + /* + * We have decided that the cached information + * was stale. Reset pd_events to assure that + * we don't mistakenly operate on cached event + * disposition. This configures the implicit + * subscription to HUP and ERR events which + * epoll features. + */ + pdp->pd_events = POLLERR|POLLHUP; + epfdp = (dvpoll_epollfd_t *)pfdp; pdp->pd_epolldata = epfdp->dpep_data; } @@ -928,39 +965,36 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) if (fd > pcp->pc_mapend) { pcp->pc_mapend = fd; } - if (fp == NULL && (fp = getf(fd)) == NULL) { - /* - * The fd is not valid. Since we can't pass - * this error back in the write() call, set - * the bit in bitmap to force DP_POLL ioctl - * to examine it. - */ - BT_SET(pcp->pc_bitmap, fd); - pdp->pd_events |= pfdp->events; - continue; - } - /* - * To (greatly) reduce EEXIST false positives, we - * denote that this fp has been epoll()'d. 
We do this - * regardless of epoll compatibility mode, as the flag - * is harmless if not in epoll compatibility mode. - */ - fp->f_flag2 |= FEPOLLED; + if (!is_epoll) { + ASSERT(fp == NULL); - /* - * Don't do VOP_POLL for an already cached fd with - * same poll events. - */ - if ((pdp->pd_events == pfdp->events) && - (pdp->pd_fp == fp)) { + if ((fp = getf_gen(fd, &gen)) == NULL) { + /* + * The fd is not valid. Since we can't + * pass this error back in the write() + * call, set the bit in bitmap to force + * DP_POLL ioctl to examine it. + */ + BT_SET(pcp->pc_bitmap, fd); + pdp->pd_events |= pfdp->events; + continue; + } /* - * the events are already cached + * Don't do VOP_POLL for an already cached fd + * with same poll events. */ - releasef(fd); - continue; + if ((pdp->pd_events == pfdp->events) && + (pdp->pd_fp == fp)) { + /* + * the events are already cached + */ + releasef(fd); + continue; + } } + /* * do VOP_POLL and cache this poll fd. */ @@ -992,11 +1026,11 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) * wake-ups. * * Drivers which never emit a pollhead will simply - * disobey the exectation of edge-triggered behavior. + * disobey the expectation of edge-triggered behavior. * This includes recursive epoll which, even on Linux, * yields its events in a level-triggered fashion only. */ - if ((pdp->pd_events & POLLET) && error == 0 && + if ((pfdp->events & POLLET) != 0 && error == 0 && php == NULL) { short levent = 0; @@ -1018,6 +1052,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) break; } pdp->pd_fp = fp; + pdp->pd_gen = gen; pdp->pd_events |= pfdp->events; if (php != NULL) { if (pdp->pd_php == NULL) { @@ -1143,8 +1178,13 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) * to turn it off for a particular open. */ dpep->dpe_flag |= DP_ISEPOLLCOMPAT; - mutex_exit(&dpep->dpe_lock); + /* Record the epoll-enabled nature in the pollcache too */ + mutex_enter(&pcp->pc_lock); + pcp->pc_flag |= PC_EPOLL; + mutex_exit(&pcp->pc_lock); + + mutex_exit(&dpep->dpe_lock); return (0); } diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index cfe0f78415..00b5f0e3de 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -347,8 +347,8 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) if ((err = dls_devnet_hold_tmp(diap->dia_linkid, &dlh)) != 0) return (err); - if ((err = mac_perim_enter_by_macname( - dls_devnet_mac(dlh), &mph)) != 0) { + if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh), + &mph)) != 0) { dls_devnet_rele_tmp(dlh); return (err); } @@ -360,7 +360,6 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) } mac_sdu_get(dlp->dl_mh, NULL, &diap->dia_max_sdu); - dls_link_rele(dlp); mac_perim_exit(mph); dls_devnet_rele_tmp(dlh); @@ -702,7 +701,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, err = EACCES; goto done; } - err = dls_devnet_setzid(dlh, dzp->diz_zid); + err = dls_devnet_setzid(dlh, dzp->diz_zid, + dzp->diz_transient); } else { kprop->pr_perm_flags = MAC_PROP_PERM_RW; (*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh); @@ -717,8 +717,18 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, else err = drv_ioc_clrap(linkid); } else { - if (kprop->pr_valsize == 0) - return (ENOBUFS); + /* + * You might think that the earlier call to + * mac_prop_check_size() should catch this but + * it can't. 
The autopush prop uses 0 as a + * sentinel value to clear the prop. This + * check ensures we don't allow a get with a + * valsize of 0. + */ + if (kprop->pr_valsize == 0) { + err = ENOBUFS; + goto done; + } kprop->pr_perm_flags = MAC_PROP_PERM_RW; err = drv_ioc_getap(linkid, dlap); @@ -866,7 +876,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) return (err); if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2, - dir->dir_link)) != 0) + dir->dir_link, dir->dir_zoneinit)) != 0) return (err); if (dir->dir_linkid2 == DATALINK_INVALID_LINKID) @@ -1321,10 +1331,13 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred, dls_link_t *dlp = NULL; dld_ioc_gettran_t *dgt = karg; - if ((ret = mac_perim_enter_by_linkid(dgt->dgt_linkid, &mph)) != 0) + if ((ret = dls_devnet_hold_tmp(dgt->dgt_linkid, &dlh)) != 0) + goto done; + + if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0) goto done; - if ((ret = dls_devnet_hold_link(dgt->dgt_linkid, &dlh, &dlp)) != 0) + if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; /* @@ -1343,13 +1356,14 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred, } done: - if (dlh != NULL && dlp != NULL) { - dls_devnet_rele_link(dlh, dlp); - } + if (dlp != NULL) + dls_link_rele(dlp); - if (mph != NULL) { + if (mph != NULL) mac_perim_exit(mph); - } + + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); return (ret); } @@ -1373,10 +1387,13 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred, if (dti->dti_nbytes != 256 || dti->dti_off != 0) return (EINVAL); - if ((ret = mac_perim_enter_by_linkid(dti->dti_linkid, &mph)) != 0) + if ((ret = dls_devnet_hold_tmp(dti->dti_linkid, &dlh)) != 0) + goto done; + + if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0) goto done; - if ((ret = dls_devnet_hold_link(dti->dti_linkid, &dlh, &dlp)) != 0) + if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; /* @@ -1396,13 +1413,14 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred, } done: - if (dlh != NULL && dlp != NULL) { - dls_devnet_rele_link(dlh, dlp); - } + if (dlp != NULL) + dls_link_rele(dlp); - if (mph != NULL) { + if (mph != NULL) mac_perim_exit(mph); - } + + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); return (ret); } @@ -1499,7 +1517,6 @@ done: return (ret); } - /* * Note that ioctls that modify links have a NULL di_priv_func(), as * privileges can only be checked after we know the class of the link being @@ -1575,7 +1592,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = { {SIMNET_IOC, "simnet", 0, NULL, 0}, {BRIDGE_IOC, "bridge", 0, NULL, 0}, {IPTUN_IOC, "iptun", 0, NULL, 0}, - {IBPART_IOC, "ibp", -1, NULL, 0} + {IBPART_IOC, "ibp", -1, NULL, 0}, + {OVERLAY_IOC, "overlay", 0, NULL, 0} }; #define DLDIOC_CNT \ (sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t)) diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index 05989dccb2..238dd9bb22 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -42,7 +42,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req, proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req, proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req, - proto_notify_req, proto_passive_req; + proto_notify_req, proto_passive_req, proto_exclusive_req; static void 
proto_capability_advertise(dld_str_t *, mblk_t *); static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *); @@ -122,6 +122,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp) case DL_PASSIVE_REQ: proto_passive_req(dsp, mp); break; + case DL_EXCLUSIVE_REQ: + proto_exclusive_req(dsp, mp); + break; default: proto_req(dsp, mp); break; @@ -606,6 +609,14 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp) new_flags |= DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + new_flags |= DLS_PROMISC_RX_ONLY; + break; + + case DL_PROMISC_FIXUPS: + new_flags |= DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -693,6 +704,22 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp) new_flags &= ~DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + if (!(dsp->ds_promisc & DLS_PROMISC_RX_ONLY)) { + dl_err = DL_NOTENAB; + goto failed2; + } + new_flags &= ~DLS_PROMISC_RX_ONLY; + break; + + case DL_PROMISC_FIXUPS: + if (!(dsp->ds_promisc & DLS_PROMISC_FIXUPS)) { + dl_err = DL_NOTENAB; + goto failed2; + } + new_flags &= ~DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -1184,7 +1211,6 @@ proto_unitdata_req(dld_str_t *dsp, mblk_t *mp) uint16_t sap; uint_t addr_length; mblk_t *bp, *payload; - uint32_t start, stuff, end, value, flags; t_uscalar_t dl_err; uint_t max_sdu; @@ -1253,9 +1279,7 @@ proto_unitdata_req(dld_str_t *dsp, mblk_t *mp) /* * Transfer the checksum offload information if it is present. */ - hcksum_retrieve(payload, NULL, NULL, &start, &stuff, &end, &value, - &flags); - (void) hcksum_assoc(bp, NULL, NULL, start, stuff, end, value, flags, 0); + mac_hcksum_clone(payload, bp); /* * Link the payload onto the new header. @@ -1296,7 +1320,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp) * If we've already become active by issuing an active primitive, * then it's too late to try to become passive. */ - if (dsp->ds_passivestate == DLD_ACTIVE) { + if (dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE) { dl_err = DL_OUTSTATE; goto failed; } @@ -1350,12 +1375,20 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf, direct->di_rx_ch); - direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + if (direct->di_flags & DI_DIRECT_RAW) { + direct->di_tx_df = + (uintptr_t)str_mdata_raw_fastpath_put; + } else { + direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + } direct->di_tx_dh = dsp; direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify; direct->di_tx_cb_dh = dsp->ds_mch; @@ -1377,24 +1410,22 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) } /* - * dld_capab_poll_enable() - * - * This function is misnamed. All polling and fanouts are run out of the - * lower mac (in case of VNIC and the only mac in case of NICs). The - * availability of Rx ring and promiscous mode is all taken care between - * the soft ring set (mac_srs), the Rx ring, and S/W classifier. Any - * fanout necessary is done by the soft rings that are part of the - * mac_srs (by default mac_srs sends the packets up via a TCP and - * non TCP soft ring). + * This function is misnamed. All polling and fanouts are run out of + * the lower MAC for VNICs and out of the MAC for NICs. The + * availability of Rx rings and promiscous mode is taken care of + * between the soft ring set (mac_srs), the Rx ring, and the SW + * classifier. 
Fanout, if necessary, is done by the soft rings that + * are part of the SRS. By default the SRS divvies up the packets + * based on protocol: TCP, UDP, or Other (OTH). * - * The mac_srs (or its associated soft rings) always store the ill_rx_ring + * The SRS (or its associated soft rings) always store the ill_rx_ring * (the cookie returned when they registered with IP during plumb) as their * 2nd argument which is passed up as mac_resource_handle_t. The upcall * function and 1st argument is what the caller registered when they * called mac_rx_classify_flow_add() to register the flow. For VNIC, * the function is vnic_rx and argument is vnic_t. For regular NIC * case, it mac_rx_default and mac_handle_t. As explained above, the - * mac_srs (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t) + * SRS (or its soft ring) will add the ill_rx_ring (mac_resource_handle_t) * from its stored 2nd argument. */ static int @@ -1407,11 +1438,11 @@ dld_capab_poll_enable(dld_str_t *dsp, dld_capab_poll_t *poll) return (ENOTSUP); /* - * Enable client polling if and only if DLS bypass is possible. - * Special cases like VLANs need DLS processing in the Rx data path. - * In such a case we can neither allow the client (IP) to directly - * poll the softring (since DLS processing hasn't been done) nor can - * we allow DLS bypass. + * Enable client polling if and only if DLS bypass is + * possible. Some traffic requires DLS processing in the Rx + * data path. In such a case we can neither allow the client + * (IP) to directly poll the soft ring (since DLS processing + * hasn't been done) nor can we allow DLS bypass. */ if (!mac_rx_bypass_set(dsp->ds_mch, dsp->ds_rx, dsp->ds_rx_arg)) return (ENOTSUP); @@ -1456,6 +1487,9 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: return (dld_capab_poll_enable(dsp, poll)); @@ -1466,12 +1500,34 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) } static int +dld_capab_ipcheck(dld_str_t *dsp, void *data, uint_t flags) +{ + dld_capab_ipcheck_t *ipc = data; + + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + switch (flags) { + case DLD_ENABLE: + ipc->ipc_allowed_df = (uintptr_t)mac_protect_check_addr; + ipc->ipc_allowed_dh = dsp->ds_mch; + return (0); + case DLD_DISABLE: + return (0); + } + + return (ENOTSUP); +} + +static int dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags) { dld_capab_lso_t *lso = data; ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: { mac_capab_lso_t mac_lso; @@ -1517,8 +1573,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) * completes. So we limit the check to DLD_ENABLE case. 
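The new dld_capab_ipcheck() above follows the same ENABLE/DISABLE convention as the other DLD_CAPAB_* handlers: fill in a callback and handle on DLD_ENABLE, accept DLD_DISABLE quietly, reject everything else. Only that convention is taken from the code above; the capability structure, field names, and callback in the sketch below are invented for illustration.

#include <sys/dld_impl.h>

/* Invented for illustration only. */
typedef struct dld_capab_foo {
        uintptr_t       foo_func_df;    /* callback into the provider */
        uintptr_t       foo_func_dh;    /* handle passed back with it */
} dld_capab_foo_t;

extern void some_kernel_callback(void);        /* hypothetical */

static int
dld_capab_foo(dld_str_t *dsp, void *data, uint_t flags)
{
        dld_capab_foo_t *foo = data;

        ASSERT(MAC_PERIM_HELD(dsp->ds_mh));

        switch (flags) {
        case DLD_ENABLE:
                foo->foo_func_df = (uintptr_t)some_kernel_callback;
                foo->foo_func_dh = dsp->ds_mch;
                return (0);
        case DLD_DISABLE:
                return (0);
        }

        return (ENOTSUP);
}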
*/ if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) && - (dsp->ds_sap != ETHERTYPE_IP || - !check_mod_above(dsp->ds_rq, "ip"))) { + (((dsp->ds_sap != ETHERTYPE_IP && dsp->ds_sap != ETHERTYPE_IPV6) || + !check_mod_above(dsp->ds_rq, "ip")) && + !check_mod_above(dsp->ds_rq, "vnd"))) { return (ENOTSUP); } @@ -1539,6 +1596,10 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) err = dld_capab_lso(dsp, data, flags); break; + case DLD_CAPAB_IPCHECK: + err = dld_capab_ipcheck(dsp, data, flags); + break; + default: err = ENOTSUP; break; @@ -1600,9 +1661,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Direct capability negotiation interface between IP and DLD + * Direct capability negotiation interface between IP/VND and DLD. Note + * that for vnd we only allow the case where the media type is the + * native media type so we know that there are no transformations that + * would have to happen to the mac header that it receives. */ - if (dsp->ds_sap == ETHERTYPE_IP && check_mod_above(dsp->ds_rq, "ip")) { + if (((dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) && + check_mod_above(dsp->ds_rq, "ip")) || + (check_mod_above(dsp->ds_rq, "vnd") && + dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) { dld_capable = B_TRUE; subsize += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); @@ -1721,3 +1788,36 @@ dld_capabilities_disable(dld_str_t *dsp) if (dsp->ds_polling) (void) dld_capab_poll_disable(dsp, NULL); } + +static void +proto_exclusive_req(dld_str_t *dsp, mblk_t *mp) +{ + int ret = 0; + t_uscalar_t dl_err; + mac_perim_handle_t mph; + + if (dsp->ds_passivestate != DLD_UNINITIALIZED) { + dl_err = DL_OUTSTATE; + goto failed; + } + + if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) { + dl_err = DL_BADPRIM; + goto failed; + } + + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + ret = dls_exclusive_set(dsp, B_TRUE); + mac_perim_exit(mph); + + if (ret != 0) { + dl_err = DL_SYSERR; + goto failed; + } + + dsp->ds_passivestate = DLD_EXCLUSIVE; + dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ); + return; +failed: + dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)ret); +} diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c index 24b5ab2e72..4cb0f95cf5 100644 --- a/usr/src/uts/common/io/dld/dld_str.c +++ b/usr/src/uts/common/io/dld/dld_str.c @@ -855,6 +855,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid, return (mp); } +static boolean_t +i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp) +{ + mblk_t *mp = *mpp; + mblk_t *newmp; + uint_t pri, vid, dvid; + + dvid = mac_client_vid(dsp->ds_mch); + + /* + * Discard the packet if this is a VLAN stream but the VID in + * the packet is not correct. + */ + vid = VLAN_ID(mhip->mhi_tci); + if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) + return (B_FALSE); + + /* + * Discard the packet if this packet is a tagged packet + * but both pri and VID are 0. + */ + pri = VLAN_PRI(mhip->mhi_tci); + if (mhip->mhi_istagged && !mhip->mhi_ispvid && pri == 0 && + vid == VLAN_ID_NONE) + return (B_FALSE); + + /* + * Update the priority bits to the per-stream priority if + * priority is not set in the packet. Update the VID for + * packets on a VLAN stream. + */ + pri = (pri == 0) ? 
dsp->ds_pri : 0; + if ((pri != 0) || (dvid != VLAN_ID_NONE)) { + if ((newmp = i_dld_ether_header_update_tag(mp, pri, + dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { + return (B_FALSE); + } + *mpp = newmp; + } + + return (B_TRUE); +} + +mac_tx_cookie_t +str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint, + uint16_t flag) +{ + boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); + mac_header_info_t mhi; + mac_tx_cookie_t cookie; + + if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0) + goto discard; + + if (is_ethernet) { + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) + goto discard; + } + + if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) { + DLD_SETQFULL(dsp); + } + return (cookie); +discard: + /* TODO: bump kstat? */ + freemsg(mp); + return (NULL); +} + + + /* * M_DATA put (IP fast-path mode) */ @@ -903,7 +974,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) mblk_t *bp, *newmp; size_t size; mac_header_info_t mhi; - uint_t pri, vid, dvid; uint_t max_sdu; /* @@ -949,38 +1019,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) goto discard; if (is_ethernet) { - dvid = mac_client_vid(dsp->ds_mch); - - /* - * Discard the packet if this is a VLAN stream but the VID in - * the packet is not correct. - */ - vid = VLAN_ID(mhi.mhi_tci); - if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) - goto discard; - - /* - * Discard the packet if this packet is a tagged packet - * but both pri and VID are 0. - */ - pri = VLAN_PRI(mhi.mhi_tci); - if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 && - vid == VLAN_ID_NONE) + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) goto discard; - - /* - * Update the priority bits to the per-stream priority if - * priority is not set in the packet. Update the VID for - * packets on a VLAN stream. - */ - pri = (pri == 0) ? dsp->ds_pri : 0; - if ((pri != 0) || (dvid != VLAN_ID_NONE)) { - if ((newmp = i_dld_ether_header_update_tag(mp, pri, - dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { - goto discard; - } - mp = newmp; - } } if (DLD_TX(dsp, mp, 0, 0) != NULL) { diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index 92993ada58..303ea8eb24 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -171,16 +171,16 @@ dls_bind(dld_str_t *dsp, uint32_t sap) /* * The MAC layer does the VLAN demultiplexing and will only pass up * untagged packets to non-promiscuous primary MAC clients. In order to - * support the binding to the VLAN SAP which is required by DLPI, dls + * support binding to the VLAN SAP, which is required by DLPI, DLS * needs to get a copy of all tagged packets when the client binds to * the VLAN SAP. We do this by registering a separate promiscuous - * callback for each dls client binding to that SAP. + * callback for each DLS client binding to that SAP. * * Note: even though there are two promiscuous handles in dld_str_t, * ds_mph is for the regular promiscuous mode, ds_vlan_mph is the handle - * to receive VLAN pkt when promiscuous mode is not on. Only one of - * them can be non-NULL at the same time, to avoid receiving dup copies - * of pkts. + * to receive VLAN traffic when promiscuous mode is not on. Only one of + * them can be non-NULL at the same time, to avoid receiving duplicate + * copies of packets. 
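i_dld_raw_ether_check() above concentrates the rules for raw Ethernet frames sent on a VLAN stream, and they reduce to a few comparisons on the tag control information. A hedged standalone restatement; 'tagged' stands for "carried a tag that is not simply the port VLAN" (mhi_istagged && !mhi_ispvid in the code above).

#include <sys/types.h>
#include <sys/vlan.h>   /* VLAN_ID(), VLAN_PRI(), VLAN_ID_NONE */

static boolean_t
raw_vlan_ok(uint16_t dvid, uint16_t tci, boolean_t tagged)
{
        uint16_t vid = VLAN_ID(tci);
        uint16_t pri = VLAN_PRI(tci);

        /* A VLAN stream adds its own tag; reject frames already carrying a VID. */
        if (dvid != VLAN_ID_NONE && vid != VLAN_ID_NONE)
                return (B_FALSE);

        /* A tag that conveys neither priority nor a VID is malformed. */
        if (tagged && pri == 0 && vid == VLAN_ID_NONE)
                return (B_FALSE);

        return (B_TRUE);
}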
*/ if (sap == ETHERTYPE_VLAN && dsp->ds_promisc == 0) { int err; @@ -248,19 +248,69 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) { int err = 0; uint32_t old_flags = dsp->ds_promisc; + uint32_t new_type = new_flags & + ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS); mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL; + uint16_t mac_flags = 0; + boolean_t doremove = B_FALSE; ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); ASSERT(!(new_flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI | - DLS_PROMISC_PHYS))); + DLS_PROMISC_PHYS | DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS))); + + /* + * If we only have the non-data receive flags set or are only changing + * them, then there's nothing to do other than update the flags here. + * Basically when we only have something in the set of + * DLS_PROMISC_RX_ONLY and DLS_PROMISC_FIXUPS around, then there's + * nothing else for us to do other than toggle it, as there's no need to + * talk to MAC and we don't have to do anything else. + */ + if ((old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 && + (new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0) { + dsp->ds_promisc = new_flags; + return (0); + } /* * If the user has only requested DLS_PROMISC_MULTI then we need to make * sure that they don't see all packets. */ - if (new_flags == DLS_PROMISC_MULTI) + if (new_type == DLS_PROMISC_MULTI) mptype = MAC_CLIENT_PROMISC_MULTI; + /* + * Look at new flags and figure out the correct mac promisc flags. + * If we've only requested DLS_PROMISC_SAP and not _MULTI or _PHYS, + * don't turn on physical promisc mode. + */ + if (new_flags & DLS_PROMISC_RX_ONLY) + mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP; + if (new_flags & DLS_PROMISC_FIXUPS) + mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS; + if (new_type == DLS_PROMISC_SAP) + mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS; + + /* + * If we're coming in and we're being asked to transition to a state + * where the only DLS flags would be enabled are flags that change what + * we do with promiscuous packets (DLS_PROMISC_RX_ONLY and + * DLS_PROMISC_FIXUPS) and not which packets we should receive, then we + * need to remove the MAC layer promiscuous handler. + */ + if ((new_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) == 0 && + (old_flags & ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS)) != 0 && + new_flags != 0) { + doremove = B_TRUE; + } + + /* + * There are three cases we care about here with respect to MAC. Going + * from nothing to something, something to nothing, something to + * something where we need to change how we're getting stuff from mac. + * In the last case, as long as they're not equal, we need to assume + * something has changed and do something about it. + */ if (dsp->ds_promisc == 0 && new_flags != 0) { /* * If only DLS_PROMISC_SAP, we don't turn on the @@ -268,9 +318,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, - (new_flags != DLS_PROMISC_SAP) ? 
0 : - MAC_PROMISC_FLAGS_NO_PHYS); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) { dsp->ds_promisc = old_flags; return (err); @@ -281,7 +329,8 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) mac_promisc_remove(dsp->ds_vlan_mph); dsp->ds_vlan_mph = NULL; } - } else if (dsp->ds_promisc != 0 && new_flags == 0) { + } else if (dsp->ds_promisc != 0 && + (new_flags == 0 || doremove == B_TRUE)) { ASSERT(dsp->ds_mph != NULL); mac_promisc_remove(dsp->ds_mph); @@ -296,19 +345,13 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp, &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS); } - } else if (dsp->ds_promisc == DLS_PROMISC_SAP && new_flags != 0 && - new_flags != dsp->ds_promisc) { - /* - * If the old flag is PROMISC_SAP, but the current flag has - * changed to some new non-zero value, we need to turn the - * physical promiscuous mode. - */ + } else if (new_flags != 0 && new_flags != old_flags) { ASSERT(dsp->ds_mph != NULL); mac_promisc_remove(dsp->ds_mph); /* Honors both after-remove and before-add semantics! */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, 0); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) dsp->ds_promisc = old_flags; } else { @@ -629,6 +672,22 @@ boolean_t dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, void **ds_rx_arg, boolean_t loopback) { + if (dsp->ds_promisc == 0) { + /* + * If there are active walkers of the mi_promisc_list when + * promiscuousness is disabled, ds_promisc will be cleared, + * but the DLS will remain on the mi_promisc_list until the + * walk is completed. If we do not recognize this case here, + * we won't properly execute the ds_promisc case in the common + * accept routine -- and we will potentially accept a packet + * that has originated with this DLS (which in turn can + * induce recursion and death by stack overflow). If + * ds_promisc is zero, we know that we are in this window -- + * and we refuse to accept the packet. + */ + return (B_FALSE); + } + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, loopback)); } @@ -650,8 +709,8 @@ dls_mac_active_set(dls_link_t *dlp) /* request the primary MAC address */ if ((err = mac_unicast_add(dlp->dl_mch, NULL, MAC_UNICAST_PRIMARY | MAC_UNICAST_TAG_DISABLE | - MAC_UNICAST_DISABLE_TX_VID_CHECK, &dlp->dl_mah, 0, - &diag)) != 0) { + MAC_UNICAST_DISABLE_TX_VID_CHECK, &dlp->dl_mah, + VLAN_ID_NONE, &diag)) != 0) { return (err); } @@ -659,7 +718,10 @@ dls_mac_active_set(dls_link_t *dlp) * Set the function to start receiving packets. */ mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp); + } else if (dlp->dl_exclusive == B_TRUE) { + return (EBUSY); } + dlp->dl_nactive++; return (0); } @@ -685,7 +747,11 @@ dls_active_set(dld_str_t *dsp) if (dsp->ds_passivestate == DLD_PASSIVE) return (0); - /* If we're already active, then there's nothing more to do. */ + if (dsp->ds_dlp->dl_exclusive == B_TRUE && + dsp->ds_passivestate != DLD_EXCLUSIVE) + return (EBUSY); + + /* If we're already active, we need to check the link's exclusivity */ if ((dsp->ds_nactive == 0) && ((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) { /* except for ENXIO all other errors are mapped to EBUSY */ @@ -694,7 +760,8 @@ dls_active_set(dld_str_t *dsp) return (err); } - dsp->ds_passivestate = DLD_ACTIVE; + dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ? 
+ DLD_EXCLUSIVE : DLD_ACTIVE; dsp->ds_nactive++; return (0); } @@ -725,7 +792,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all) if (dsp->ds_nactive != 0) return; - ASSERT(dsp->ds_passivestate == DLD_ACTIVE); + ASSERT(dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE); dls_mac_active_clear(dsp->ds_dlp); + /* + * We verify below to ensure that no other part of DLS has mucked with + * our exclusive state. + */ + if (dsp->ds_passivestate == DLD_EXCLUSIVE) + VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0); dsp->ds_passivestate = DLD_UNINITIALIZED; } + +int +dls_exclusive_set(dld_str_t *dsp, boolean_t enable) +{ + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + if (enable == B_FALSE) { + dsp->ds_dlp->dl_exclusive = B_FALSE; + return (0); + } + + if (dsp->ds_dlp->dl_nactive != 0) + return (EBUSY); + + dsp->ds_dlp->dl_exclusive = B_TRUE; + + return (0); +} diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index 6c8ffcb0a9..c792251052 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -30,11 +30,15 @@ #include <sys/sysmacros.h> #include <sys/strsubr.h> +#include <sys/pattr.h> #include <sys/strsun.h> #include <sys/vlan.h> #include <sys/dld_impl.h> #include <sys/sdt.h> #include <sys/atomic.h> +#include <sys/sysevent.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/datalink.h> static kmem_cache_t *i_dls_link_cachep; mod_hash_t *i_dls_link_hash; @@ -159,6 +163,18 @@ i_dls_link_subchain(dls_link_t *dlp, mblk_t *mp, const mac_header_info_t *mhip, uint16_t cvid, cpri; int err; + /* + * If this message is from a same-machine sender, then + * there may be HW checksum offloads to emulate. + */ + if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) { + mblk_t *tmpnext = mp->b_next; + + mp->b_next = NULL; + mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); + mp->b_next = tmpnext; + } + DLS_PREPARE_PKT(dlp->dl_mh, mp, &cmhi, err); if (err != 0) break; @@ -353,6 +369,22 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, int err, rval; /* + * The mac_hw_emul() function, by design, doesn't predicate on + * HW_LOCAL_MAC. But since we are in Rx context we know that + * any LSO packet must also be from a same-machine sender. We + * take advantage of that and forgoe writing a manual loop to + * predicate on HW_LOCAL_MAC. + * + * But for checksum emulation we need to predicate on + * HW_LOCAL_MAC to avoid calling mac_hw_emul() on packets that + * don't need it (thanks to the fact that HCK_IPV4_HDRCKSUM + * and HCK_IPV4_HDRCKSUM_OK use the same value). Therefore we + * do the checksum emulation in the second loop and in + * subchain matching. + */ + mac_hw_emul(&mp, NULL, NULL, MAC_LSO_EMUL); + + /* * Walk the packet chain. */ for (; mp != NULL; mp = nextp) { @@ -361,6 +393,18 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, */ accepted = B_FALSE; + /* + * If this message is from a same-machine sender, then + * there may be HW checksum offloads to emulate. 
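The dls_promisc() rework earlier in dls.c separates which packets a stream wants to see (SAP/MULTI/PHYS) from how it wants to see them (RX_ONLY, FIXUPS) and translates the latter into MAC-layer promisc flags. The mapping itself is small; a hedged restatement of it, lifted from the logic above into a helper for clarity.

#include <sys/dls.h>
#include <sys/mac_client.h>

static uint16_t
dls_to_mac_promisc_flags(uint32_t new_flags)
{
        uint32_t type = new_flags &
            ~(DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS);
        uint16_t mac_flags = 0;

        if (new_flags & DLS_PROMISC_RX_ONLY)
                mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP;
        if (new_flags & DLS_PROMISC_FIXUPS)
                mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS;
        /* SAP-only promiscuity never enables physical promiscuous mode. */
        if (type == DLS_PROMISC_SAP)
                mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS;

        return (mac_flags);
}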
+ */ + if (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) { + mblk_t *tmpnext = mp->b_next; + + mp->b_next = NULL; + mac_hw_emul(&mp, NULL, NULL, MAC_HWCKSUM_EMUL); + mp->b_next = tmpnext; + } + DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err); if (err != 0) { atomic_inc_32(&(dlp->dl_unknowns)); @@ -379,7 +423,16 @@ i_dls_link_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, vid = VLAN_ID(mhi.mhi_tci); + /* + * This condition is true only when a sun4v vsw client + * is on the scene; as it is the only type of client + * that multiplexes VLANs on a single client instance. + * All other types of clients have one VLAN per client + * instance. In that case, MAC strips the VLAN tag + * before delivering it to DLS (see mac_rx_deliver()). + */ if (mhi.mhi_istagged) { + /* * If it is tagged traffic, send it upstream to * all dld_str_t which are attached to the physical @@ -554,7 +607,13 @@ dls_rx_promisc(void *arg, mac_resource_handle_t mrh, mblk_t *mp, dls_head_t *dhp; mod_hash_key_t key; + /* + * We expect to deal with only a single packet. + */ + ASSERT3P(mp->b_next, ==, NULL); + DLS_PREPARE_PKT(dlp->dl_mh, mp, &mhi, err); + if (err != 0) goto drop; @@ -580,6 +639,67 @@ drop: freemsg(mp); } +/* + * We'd like to notify via sysevents that a link state change has occurred. + * There are a couple of challenges associated with this. The first is that if + * the link is flapping a lot, we may not see an accurate state when we launch + * the notification, we're told it changed, not what it changed to. + * + * The next problem is that all of the information that a user has associated + * with this device is the exact opposite of what we have on the dls_link_t. We + * have the name of the mac device, which has no bearing on what users see. + * Likewise, we don't have the datalink id either. So we're going to have to get + * this from dls. + * + * This is all further complicated by the fact that this could be going on in + * another thread at the same time as someone is tearing down the dls_link_t + * that we're associated with. We need to be careful not to grab the mac + * perimeter, otherwise we stand a good chance of deadlock. + */ +static void +dls_link_notify(void *arg, mac_notify_type_t type) +{ + dls_link_t *dlp = arg; + dls_dl_handle_t dhp; + nvlist_t *nvp; + sysevent_t *event; + sysevent_id_t eid; + + if (type != MAC_NOTE_LINK && type != MAC_NOTE_LOWLINK) + return; + + /* + * If we can't find a devnet handle for this link, then there is no user + * knowable device for this at the moment and there's nothing we can + * really share with them that will make sense. + */ + if (dls_devnet_hold_tmp_by_link(dlp, &dhp) != 0) + return; + + /* + * Because we're attaching this nvlist_t to the sysevent, it'll get + * cleaned up when we call sysevent_free. + */ + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_int32(nvp, DATALINK_EV_LINK_ID, + dls_devnet_linkid(dhp)) == 0); + VERIFY(nvlist_add_string(nvp, DATALINK_EV_LINK_NAME, + dls_devnet_link(dhp)) == 0); + VERIFY(nvlist_add_int32(nvp, DATALINK_EV_ZONE_ID, + dls_devnet_getzid(dhp)) == 0); + + dls_devnet_rele_tmp(dhp); + + event = sysevent_alloc(EC_DATALINK, ESC_DATALINK_LINK_STATE, + ILLUMOS_KERN_PUB"dls", SE_SLEEP); + VERIFY(event != NULL); + (void) sysevent_attach_attributes(event, (sysevent_attr_list_t *)nvp); + + (void) log_sysevent(event, SE_SLEEP, &eid); + sysevent_free(event); + +} + static void i_dls_link_destroy(dls_link_t *dlp) { @@ -590,6 +710,9 @@ i_dls_link_destroy(dls_link_t *dlp) /* * Free the structure back to the cache. 
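dls_link_notify() above publishes EC_DATALINK / ESC_DATALINK_LINK_STATE sysevents carrying the link id, link name, and zone id. A userland consumer could watch for them roughly as follows; this is a hedged sketch using the general-purpose libsysevent interfaces, and it assumes <sys/sysevent/datalink.h> (added alongside this change) supplies the subclass and attribute names used by the publisher.

#include <libsysevent.h>
#include <libnvpair.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/datalink.h>
#include <stdio.h>
#include <unistd.h>

static void
handler(sysevent_t *ev)
{
        nvlist_t *nvl = NULL;
        char *name;

        if (sysevent_get_attr_list(ev, &nvl) != 0)
                return;
        if (nvlist_lookup_string(nvl, DATALINK_EV_LINK_NAME, &name) == 0)
                (void) printf("link state changed: %s\n", name);
        nvlist_free(nvl);
}

int
main(void)
{
        const char *subclasses[] = { ESC_DATALINK_LINK_STATE };
        sysevent_handle_t *shp = sysevent_bind_handle(handler);

        if (shp == NULL)
                return (1);
        if (sysevent_subscribe_event(shp, EC_DATALINK, subclasses, 1) != 0)
                return (1);
        for (;;)
                (void) pause();
}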
*/ + if (dlp->dl_mnh != NULL) + mac_notify_remove(dlp->dl_mnh, B_TRUE); + if (dlp->dl_mch != NULL) mac_client_close(dlp->dl_mch, 0); @@ -601,8 +724,10 @@ i_dls_link_destroy(dls_link_t *dlp) dlp->dl_mh = NULL; dlp->dl_mch = NULL; dlp->dl_mip = NULL; + dlp->dl_mnh = NULL; dlp->dl_unknowns = 0; dlp->dl_nonip_cnt = 0; + dlp->dl_exclusive = B_FALSE; kmem_cache_free(i_dls_link_cachep, dlp); } @@ -641,6 +766,8 @@ i_dls_link_create(const char *name, dls_link_t **dlpp) if (err != 0) goto bail; + dlp->dl_mnh = mac_notify_add(dlp->dl_mh, dls_link_notify, dlp); + DTRACE_PROBE2(dls__primary__client, char *, dlp->dl_name, void *, dlp->dl_mch); diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index 9755aed43f..a910cbe6b1 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2017 Joyent, Inc. */ /* * Copyright (c) 2016 by Delphix. All rights reserved. @@ -85,6 +86,14 @@ static door_handle_t dls_mgmt_dh = NULL; /* dls_devnet_t dd_flags */ #define DD_CONDEMNED 0x1 #define DD_IMPLICIT_IPTUN 0x2 /* Implicitly-created ip*.*tun* tunnel */ +#define DD_INITIALIZING 0x4 + +/* + * If the link is marked as initializing or condemned then it should + * not be visible outside of the DLS framework. + */ +#define DD_NOT_VISIBLE(flags) ( \ + (flags & (DD_CONDEMNED | DD_INITIALIZING)) != 0) /* * This structure is used to keep the <linkid, macname> mapping. @@ -108,13 +117,14 @@ typedef struct dls_devnet_s { zoneid_t dd_zid; /* current zone */ boolean_t dd_prop_loaded; taskqid_t dd_prop_taskid; + boolean_t dd_transient; /* link goes away when zone does */ } dls_devnet_t; static int i_dls_devnet_create_iptun(const char *, const char *, datalink_id_t *); static int i_dls_devnet_destroy_iptun(datalink_id_t); -static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t); -static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t); +static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t); +static int dls_devnet_unset(mac_handle_t, datalink_id_t *, boolean_t); /*ARGSUSED*/ static int @@ -134,9 +144,9 @@ i_dls_devnet_destructor(void *buf, void *arg) { dls_devnet_t *ddp = buf; - ASSERT(ddp->dd_ksp == NULL); - ASSERT(ddp->dd_ref == 0); - ASSERT(ddp->dd_tref == 0); + VERIFY(ddp->dd_ksp == NULL); + VERIFY(ddp->dd_ref == 0); + VERIFY(ddp->dd_tref == 0); mutex_destroy(&ddp->dd_mutex); cv_destroy(&ddp->dd_cv); } @@ -148,7 +158,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg) dls_devnet_t *ddp; if (dls_devnet_hold_tmp(linkid, &ddp) == 0) { - (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID); + /* + * Don't bother moving transient links back to the global zone + * since we will simply delete them in dls_devnet_unset. 
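Stepping back to the dls_link.c hunks just above: the notify callback registered in i_dls_link_create() is paired with a blocking removal in i_dls_link_destroy(). Condensed below from those hunks (not new code); the B_TRUE argument asks mac_notify_remove() to wait for any in-flight callback before the MAC client is closed.

	/* i_dls_link_create() */
	dlp->dl_mnh = mac_notify_add(dlp->dl_mh, dls_link_notify, dlp);

	/* i_dls_link_destroy() */
	if (dlp->dl_mnh != NULL)
		mac_notify_remove(dlp->dl_mnh, B_TRUE);
	if (dlp->dl_mch != NULL)
		mac_client_close(dlp->dl_mch, 0);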
+ */ + if (!ddp->dd_transient) + (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); dls_devnet_rele_tmp(ddp); } return (0); @@ -529,6 +544,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = getzoneid(); + + if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, + sizeof (retval))) == 0) { + *linkid = retval.lr_linkid; + } + return (err); +} + +int +dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid, + zoneid_t zid) +{ + dlmgmt_door_getlinkid_t getlinkid; + dlmgmt_getlinkid_retval_t retval; + int err; + + ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid()); + getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; + (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = zid; if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, sizeof (retval))) == 0) { @@ -537,6 +573,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) return (err); } + datalink_id_t dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class, datalink_media_t dmedia, uint32_t flags) @@ -736,13 +773,24 @@ dls_devnet_stat_update(kstat_t *ksp, int rw) * Create the "link" kstats. */ static void -dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid) +dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid) { kstat_t *ksp; + char *nm; + char kname[MAXLINKNAMELEN]; + + if (zoneid != newzoneid) { + ASSERT(zoneid == GLOBAL_ZONEID); + (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid, + ddp->dd_linkname); + nm = kname; + } else { + nm = ddp->dd_linkname; + } - if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid, + if (dls_stat_create("link", 0, nm, zoneid, dls_devnet_stat_update, (void *)(uintptr_t)ddp->dd_linkid, - &ksp) == 0) { + &ksp, newzoneid) == 0) { ASSERT(ksp != NULL); if (zoneid == ddp->dd_owner_zid) { ASSERT(ddp->dd_ksp == NULL); @@ -762,12 +810,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) { if (zoneid == ddp->dd_owner_zid) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } } else { if (ddp->dd_zone_ksp != NULL) { - kstat_delete(ddp->dd_zone_ksp); + dls_stat_delete(ddp->dd_zone_ksp); ddp->dd_zone_ksp = NULL; } } @@ -778,24 +826,38 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) * and create the new set using the new name. */ static void -dls_devnet_stat_rename(dls_devnet_t *ddp) +dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } - /* We can't rename a link while it's assigned to a non-global zone. */ + if (zoneinit && ddp->dd_zone_ksp != NULL) { + dls_stat_delete(ddp->dd_zone_ksp); + ddp->dd_zone_ksp = NULL; + } + /* + * We can't rename a link while it's assigned to a non-global zone + * unless we're first initializing the zone while readying it. + */ ASSERT(ddp->dd_zone_ksp == NULL); - dls_devnet_stat_create(ddp, ddp->dd_owner_zid); + dls_devnet_stat_create(ddp, ddp->dd_owner_zid, + (zoneinit ? ddp->dd_zid : ddp->dd_owner_zid)); + if (zoneinit) + dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid); } /* - * Associate a linkid with a given link (identified by macname) + * Associate the linkid with the link identified by macname. If this + * is called on behalf of a physical link then linkid may be + * DATALINK_INVALID_LINKID. 
Otherwise, if called on behalf of a + * virtual link, linkid must have a value. */ static int -dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, +dls_devnet_set(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid, dls_devnet_t **ddpp) { + const char *macname = mac_name(mh); dls_devnet_t *ddp = NULL; datalink_class_t class; int err; @@ -828,17 +890,41 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, } /* - * This might be a physical link that has already - * been created, but which does not have a linkid - * because dlmgmtd was not running when it was created. + * If we arrive here we know we are attempting to set + * the linkid on a physical link. A virtual link + * should never arrive here because it should never + * call this function without a linkid. Virtual links + * are created through dlgmtmd and thus we know + * dlmgmtd is alive to assign it a linkid (search for + * uses of dladm_create_datalink_id() to prove this to + * yourself); we don't have the same guarantee for a + * physical link which may perform an upcall for a + * linkid while dlmgmtd is down but will continue + * creating a devnet without the linkid (see + * softmac_create_datalink() to see how physical link + * creation works). That is why there is no entry in + * the id hash but there is one in the macname hash -- + * softmac couldn't acquire a linkid the first time it + * called this function. + * + * Because of the check above, we also know that + * ddp->dd_linkid is not set. Following this, the link + * must still be in the DD_INITIALIZING state because + * that flag is removed IFF dd_linkid is set. This is + * why we can ASSERT the DD_INITIALIZING flag below if + * the call to i_dls_devnet_setzid() fails. */ if (linkid == DATALINK_INVALID_LINKID || class != DATALINK_CLASS_PHYS) { err = EINVAL; goto done; } + + ASSERT(ddp->dd_flags & DD_INITIALIZING); + } else { ddp = kmem_cache_alloc(i_dls_devnet_cachep, KM_SLEEP); + ddp->dd_flags = DD_INITIALIZING; ddp->dd_tref = 0; ddp->dd_ref++; ddp->dd_owner_zid = zoneid; @@ -856,12 +942,6 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, (mod_hash_val_t)ddp) == 0); devnet_need_rebuild = B_TRUE; stat_create = B_TRUE; - mutex_enter(&ddp->dd_mutex); - if (!ddp->dd_prop_loaded && (ddp->dd_prop_taskid == NULL)) { - ddp->dd_prop_taskid = taskq_dispatch(system_taskq, - dls_devnet_prop_task, ddp, TQ_SLEEP); - } - mutex_exit(&ddp->dd_mutex); } err = 0; done: @@ -875,8 +955,19 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) { if (zoneid != GLOBAL_ZONEID && - (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0) - (void) dls_devnet_unset(macname, &linkid, B_TRUE); + (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE, + B_FALSE)) != 0) { + /* + * At this point the link is marked as + * DD_INITIALIZING -- there can be no + * outstanding temp refs and therefore no need + * to wait for them. + */ + ASSERT(ddp->dd_flags & DD_INITIALIZING); + (void) dls_devnet_unset(mh, &linkid, B_FALSE); + return (err); + } + /* * The kstat subsystem holds its own locks (rather perimeter) * before calling the ks_update (dls_devnet_stat_update) entry @@ -884,20 +975,35 @@ done: * lock hierarchy is kstat locks -> i_dls_devnet_lock. 
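Condensing the initialization protocol described above (fragments gathered from the hunks in this file, not new code): a freshly allocated devnet starts out invisible, every hold path treats it as nonexistent, and it is published only once its linkid is known.

	/* dls_devnet_set(): new entries start out invisible. */
	ddp->dd_flags = DD_INITIALIZING;

	/* Every hold path: invisible entries look nonexistent. */
	if (DD_NOT_VISIBLE(ddp->dd_flags))
		return (ENOENT);

	/* dls_devnet_create()/_recreate(): publish once the linkid is set. */
	mutex_enter(&ddp->dd_mutex);
	if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
		ddp->dd_flags &= ~DD_INITIALIZING;
	mutex_exit(&ddp->dd_mutex);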
*/ if (stat_create) - dls_devnet_stat_create(ddp, zoneid); + dls_devnet_stat_create(ddp, zoneid, zoneid); if (ddpp != NULL) *ddpp = ddp; + + mutex_enter(&ddp->dd_mutex); + if (linkid != DATALINK_INVALID_LINKID && + !ddp->dd_prop_loaded && ddp->dd_prop_taskid == NULL) { + ddp->dd_prop_taskid = taskq_dispatch(system_taskq, + dls_devnet_prop_task, ddp, TQ_SLEEP); + } + mutex_exit(&ddp->dd_mutex); + } return (err); } /* - * Disassociate a linkid with a given link (identified by macname) - * This waits until temporary references to the dls_devnet_t are gone. + * Disassociate the linkid from the link identified by macname. If + * wait is B_TRUE, wait until all temporary refs are released and the + * prop task is finished. + * + * If waiting then you SHOULD NOT call this from inside the MAC perim + * as deadlock will ensue. Otherwise, this function is safe to call + * from inside or outside the MAC perim. */ static int -dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) +dls_devnet_unset(mac_handle_t mh, datalink_id_t *id, boolean_t wait) { + const char *macname = mac_name(mh); dls_devnet_t *ddp; int err; mod_hash_val_t val; @@ -918,21 +1024,62 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) * deadlock. Return EBUSY if the asynchronous thread started for * property loading as part of the post attach hasn't yet completed. */ - ASSERT(ddp->dd_ref != 0); + VERIFY(ddp->dd_ref != 0); if ((ddp->dd_ref != 1) || (!wait && (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) { - mutex_exit(&ddp->dd_mutex); - rw_exit(&i_dls_devnet_lock); - return (EBUSY); + int zstatus = 0; + + /* + * There are a couple of alternatives that might be going on + * here; a) the zone is shutting down and it has a transient + * link assigned, in which case we want to clean it up instead + * of moving it back to the global zone, or b) its possible + * that we're trying to clean up an orphaned vnic that was + * delegated to a zone and which wasn't cleaned up properly + * when the zone went away. Check for either of these cases + * before we simply return EBUSY. + * + * zstatus indicates which situation we are dealing with: + * 0 - means return EBUSY + * 1 - means case (a), cleanup transient link + * -1 - means case (b), orphained VNIC + */ + if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) { + zone_t *zp; + + if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) { + zstatus = -1; + } else { + if (ddp->dd_transient) { + zone_status_t s = zone_status_get(zp); + + if (s >= ZONE_IS_SHUTTING_DOWN) + zstatus = 1; + } + zone_rele(zp); + } + } + + if (zstatus == 0) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (EBUSY); + } + + /* + * We want to delete the link, reset ref to 1; + */ + if (zstatus == -1) + /* Log a warning, but continue in this case */ + cmn_err(CE_WARN, "clear orphaned datalink: %s\n", + ddp->dd_linkname); + ddp->dd_ref = 1; } ddp->dd_flags |= DD_CONDEMNED; ddp->dd_ref--; *id = ddp->dd_linkid; - if (ddp->dd_zid != GLOBAL_ZONEID) - (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); - /* * Remove this dls_devnet_t from the hash table. */ @@ -947,18 +1094,40 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) } rw_exit(&i_dls_devnet_lock); + /* + * It is important to call i_dls_devnet_setzid() WITHOUT the + * i_dls_devnet_lock held. The setzid call grabs the MAC + * perim; thus causing DLS -> MAC lock ordering if performed + * with the i_dls_devnet_lock held. 
This forces consumers to + * grab the MAC perim before calling dls_devnet_unset() (the + * locking rules state MAC -> DLS order). By performing the + * setzid outside of the i_dls_devnet_lock consumers can + * safely call dls_devnet_unset() outside the MAC perim. + */ + if (ddp->dd_zid != GLOBAL_ZONEID) { + dls_devnet_stat_destroy(ddp, ddp->dd_zid); + (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE, + B_FALSE); + } + if (wait) { /* * Wait until all temporary references are released. + * The holders of the tref need the MAC perim to + * perform their work and release the tref. To avoid + * deadlock, assert that the perim is never held here. */ + ASSERT0(MAC_PERIM_HELD(mh)); while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != NULL)) cv_wait(&ddp->dd_cv, &ddp->dd_mutex); } else { - ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL); + VERIFY(ddp->dd_tref == 0); + VERIFY(ddp->dd_prop_taskid == NULL); } - if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) { dls_devnet_stat_destroy(ddp, ddp->dd_owner_zid); + } ddp->dd_prop_loaded = B_FALSE; ddp->dd_linkid = DATALINK_INVALID_LINKID; @@ -969,6 +1138,39 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) return (0); } +/* + * This is a private hold routine used when we already have the dls_link_t, thus + * we know that it cannot go away. + */ +int +dls_devnet_hold_tmp_by_link(dls_link_t *dlp, dls_dl_handle_t *ddhp) +{ + int err; + dls_devnet_t *ddp = NULL; + + rw_enter(&i_dls_devnet_lock, RW_WRITER); + if ((err = mod_hash_find(i_dls_devnet_hash, + (mod_hash_key_t)dlp->dl_name, (mod_hash_val_t *)&ddp)) != 0) { + ASSERT(err == MH_ERR_NOTFOUND); + rw_exit(&i_dls_devnet_lock); + return (ENOENT); + } + + mutex_enter(&ddp->dd_mutex); + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (ENOENT); + } + ddp->dd_tref++; + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + + *ddhp = ddp; + return (0); +} + static int dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, boolean_t tmp_hold) @@ -985,8 +1187,8 @@ dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, } mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 0); - if (ddp->dd_flags & DD_CONDEMNED) { + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (ENOENT); @@ -1053,8 +1255,8 @@ dls_devnet_hold_by_dev(dev_t dev, dls_dl_handle_t *ddhp) return (ENOENT); } mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 0); - if (ddp->dd_flags & DD_CONDEMNED) { + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (ENOENT); @@ -1071,7 +1273,7 @@ void dls_devnet_rele(dls_devnet_t *ddp) { mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 1); + VERIFY(ddp->dd_ref > 1); ddp->dd_ref--; if ((ddp->dd_flags & DD_IMPLICIT_IPTUN) && ddp->dd_ref == 1) { mutex_exit(&ddp->dd_mutex); @@ -1083,7 +1285,7 @@ dls_devnet_rele(dls_devnet_t *ddp) } static int -dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) +dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) { char drv[MAXLINKNAMELEN]; uint_t ppa; @@ -1093,7 +1295,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) dls_dev_handle_t ddh; int err; - if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0) + if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0) return 
(dls_devnet_hold(linkid, ddpp)); /* @@ -1236,9 +1438,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp) * * This case does not change the <link name, linkid> mapping, so the link's * kstats need to be updated with using name associated the given id2. + * + * The zoneinit parameter is used to allow us to create a VNIC in the global + * zone which is assigned to a non-global zone. Since there is a race condition + * in the create process if two VNICs have the same name, we need to rename it + * after it has been assigned to the zone. */ int -dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) +dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link, + boolean_t zoneinit) { dls_dev_handle_t ddh = NULL; int err = 0; @@ -1283,10 +1491,12 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) } mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref > 1) { - mutex_exit(&ddp->dd_mutex); - err = EBUSY; - goto done; + if (!zoneinit) { + if (ddp->dd_ref > 1) { + mutex_exit(&ddp->dd_mutex); + err = EBUSY; + goto done; + } } mutex_exit(&ddp->dd_mutex); @@ -1297,7 +1507,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) /* rename mac client name and its flow if exists */ if ((err = mac_open(ddp->dd_mac, &mh)) != 0) goto done; - (void) mac_rename_primary(mh, link); + if (zoneinit) { + char tname[MAXLINKNAMELEN]; + + (void) snprintf(tname, sizeof (tname), "z%d_%s", + ddp->dd_zid, link); + (void) mac_rename_primary(mh, tname); + } else { + (void) mac_rename_primary(mh, link); + } mac_close(mh); goto done; } @@ -1364,7 +1582,7 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) - dls_devnet_stat_rename(ddp); + dls_devnet_stat_rename(ddp, zoneinit); if (mph != NULL) mac_perim_exit(mph); @@ -1373,7 +1591,8 @@ done: } static int -i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) +i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop, + boolean_t transient) { int err; mac_perim_handle_t mph; @@ -1402,10 +1621,18 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) sizeof (retval)); if (err != 0) goto done; + + /* + * We set upcall_done only if the upcall is + * successful. This way, if dls_link_setzid() fails, + * we know another upcall must be done to reset the + * dlmgmtd state. + */ upcall_done = B_TRUE; } if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) { ddp->dd_zid = new_zoneid; + ddp->dd_transient = transient; devnet_need_rebuild = B_TRUE; } @@ -1420,7 +1647,7 @@ done: } int -dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) +dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient) { dls_devnet_t *ddp; int err; @@ -1442,7 +1669,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) refheld = B_TRUE; } - if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) { + if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) { if (refheld) dls_devnet_rele(ddp); return (err); @@ -1459,7 +1686,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) if (old_zid != GLOBAL_ZONEID) dls_devnet_stat_destroy(ddh, old_zid); if (new_zid != GLOBAL_ZONEID) - dls_devnet_stat_create(ddh, new_zid); + dls_devnet_stat_create(ddh, new_zid, new_zid); return (0); } @@ -1497,15 +1724,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid) * Access a vanity naming node. 
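The zoneinit rename above, together with the matching "z%d_%s" kstat naming in dls_devnet_stat_create(), gives a zone-assigned link a zone-qualified name in the global view; per the comment above, the intent is to avoid collisions between same-named VNICs during creation. For illustration only (the zone id and link name below are made up), the same format string produces:

	char tname[MAXLINKNAMELEN];

	/* A VNIC "net0" delegated to zone id 42 ... */
	(void) snprintf(tname, sizeof (tname), "z%d_%s", 42, "net0");
	/*
	 * ... appears in the global view as "z42_net0", so it cannot
	 * collide with another zone's own "net0".
	 */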
*/ int -dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp, + zoneid_t zid) { dls_devnet_t *ddp; dls_link_t *dlp; - zoneid_t zid = getzoneid(); + zoneid_t czid = getzoneid(); int err; mac_perim_handle_t mph; - if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) + if (czid != GLOBAL_ZONEID && czid != zid) + return (ENOENT); + + if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0) return (err); dls_devnet_prop_task_wait(ddp); @@ -1538,6 +1769,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) return (0); } +int +dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +{ + return (dls_devnet_open_in_zone(link, dhp, devp, getzoneid())); +} + /* * Close access to a vanity naming node. */ @@ -1594,13 +1831,32 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid) * we need to use the linkid to get the user name for the link * when we create the MAC client. */ - if ((err = dls_devnet_set(mac_name(mh), linkid, zoneid, &ddp)) == 0) { + if ((err = dls_devnet_set(mh, linkid, zoneid, &ddp)) == 0) { if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) { mac_perim_exit(mph); - (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE); + (void) dls_devnet_unset(mh, &linkid, B_FALSE); return (err); } + + /* + * If dd_linkid is set then the link was successfully + * initialized. In this case we can remove the + * initializing flag and make the link visible to the + * rest of the system. + * + * If not set then we were called by softmac and it + * was unable to obtain a linkid for the physical link + * because dlmgmtd is down. In that case softmac will + * eventually obtain a linkid and call + * dls_devnet_recreate() to complete initialization. + */ + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); + } + mac_perim_exit(mph); return (err); } @@ -1614,8 +1870,19 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid) int dls_devnet_recreate(mac_handle_t mh, datalink_id_t linkid) { - ASSERT(linkid != DATALINK_INVALID_LINKID); - return (dls_devnet_set(mac_name(mh), linkid, GLOBAL_ZONEID, NULL)); + dls_devnet_t *ddp; + int err; + + VERIFY(linkid != DATALINK_INVALID_LINKID); + if ((err = dls_devnet_set(mh, linkid, GLOBAL_ZONEID, &ddp)) == 0) { + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); + } + + return (err); + } int @@ -1625,15 +1892,52 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) mac_perim_handle_t mph; *idp = DATALINK_INVALID_LINKID; - err = dls_devnet_unset(mac_name(mh), idp, wait); - if (err != 0 && err != ENOENT) + err = dls_devnet_unset(mh, idp, wait); + + /* + * We continue on in the face of ENOENT because the devnet + * unset and DLS link release are not atomic and we may have a + * scenario where there is no entry in i_dls_devnet_hash for + * the MAC name but there is an entry in i_dls_link_hash. For + * example, if the following occurred: + * + * 1. dls_devnet_unset() returns success, and + * + * 2. dls_link_rele_by_name() fails with ENOTEMPTY because + * flows still exist, and + * + * 3. dls_devnet_set() fails to set the zone id and calls + * dls_devnet_unset() -- leaving an entry in + * i_dls_link_hash but no corresponding entry in + * i_dls_devnet_hash. 
+ * + * Even if #3 wasn't true the dls_devnet_set() may fail for + * different reasons in the future; the point is that it _can_ + * fail as part of its contract. We can't rely on it working + * so we must assume that these two pieces of state (devnet + * and link hashes), which should always be in sync, can get + * out of sync and thus even if we get ENOENT from the devnet + * hash we should still try to delete from the link hash just + * in case. + * + * We could prevent the ENOTEMPTY from dls_link_rele_by_name() + * by calling mac_disable() before calling + * dls_devnet_destroy() but that's not currently possible due + * to a long-standing bug. OpenSolaris 6791335: The semantics + * of mac_disable() were modified by Crossbow such that + * dls_devnet_destroy() needs to be called before + * mac_disable() can succeed. This is because of the implicit + * reference that dls has on the mac_impl_t. + */ + if (err != 0 && err != ENOENT) { return (err); + } mac_perim_enter_by_mh(mh, &mph); err = dls_link_rele_by_name(mac_name(mh)); - mac_perim_exit(mph); - if (err != 0) { + dls_devnet_t *ddp; + /* * XXX It is a general GLDv3 bug that dls_devnet_set() has to * be called to re-set the link when destroy fails. The @@ -1641,9 +1945,22 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) * called from kernel context or from a zone other than that * which initially created the link. */ - (void) dls_devnet_set(mac_name(mh), *idp, crgetzoneid(CRED()), - NULL); + (void) dls_devnet_set(mh, *idp, crgetzoneid(CRED()), &ddp); + + /* + * You might think dd_linkid should always be set + * here, but in the case where dls_devnet_unset() + * returns ENOENT it will be DATALINK_INVALID_LINKID. + * Stay consistent with the rest of DLS and only + * remove the initializing flag if linkid is set. + */ + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); } + + mac_perim_exit(mph); return (err); } @@ -1717,6 +2034,12 @@ i_dls_devnet_destroy_iptun(datalink_id_t linkid) } const char * +dls_devnet_link(dls_dl_handle_t ddh) +{ + return (ddh->dd_linkname); +} + +const char * dls_devnet_mac(dls_dl_handle_t ddh) { return (ddh->dd_mac); diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 51e4be7260..82dceff278 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c +++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
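Returning to dls_devnet_destroy() above: the comment notes that mac_disable() cannot succeed until the devnet has been destroyed, so callers tear down the devnet first and undo on failure. A trimmed sketch of that caller-side ordering follows; the surrounding context (for example a VNIC delete path) and the undo step are illustrative and not part of this change, and err/linkid are assumed to be declared by the caller.

	if ((err = dls_devnet_destroy(mh, &linkid, B_TRUE)) != 0)
		return (err);	/* still held or still delegated */

	if ((err = mac_disable(mh)) != 0) {
		/* Undo: re-create the devnet so the link remains usable. */
		(void) dls_devnet_create(mh, linkid, crgetzoneid(CRED()));
		return (err);
	}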
*/ /* @@ -30,30 +31,33 @@ #include <sys/dld_impl.h> #include <sys/mac_ether.h> -static mac_stat_info_t i_dls_si[] = { - { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32, - (uint64_t)LINK_STATE_UNKNOWN} -}; - -#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) +/* + * structure for link kstats + */ +typedef struct { + kstat_named_t dk_ifspeed; + kstat_named_t dk_multircv; + kstat_named_t dk_brdcstrcv; + kstat_named_t dk_multixmt; + kstat_named_t dk_brdcstxmt; + kstat_named_t dk_norcvbuf; + kstat_named_t dk_ierrors; + kstat_named_t dk_noxmtbuf; + kstat_named_t dk_oerrors; + kstat_named_t dk_collisions; + kstat_named_t dk_rbytes; + kstat_named_t dk_ipackets; + kstat_named_t dk_obytes; + kstat_named_t dk_opackets; + kstat_named_t dk_rbytes64; + kstat_named_t dk_ipackets64; + kstat_named_t dk_obytes64; + kstat_named_t dk_opackets64; + kstat_named_t dk_link_state; + kstat_named_t dk_link_duplex; + kstat_named_t dk_unknowns; + kstat_named_t dk_zonename; +} dls_kstat_t; /* * Exported functions. 
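The dls_kstat_t layout above is what surfaces under the "link" kstat module, including the link_duplex, unknowns and new zonename entries. One way to inspect it from userland is libkstat; the snippet below is illustrative only and assumes a link named "net0" exists. The same data is visible with kstat -m link -n net0.

#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* Module "link", instance 0 -- as created by dls_stat_create(). */
	if ((ksp = kstat_lookup(kc, "link", 0, "net0")) == NULL ||
	    kstat_read(kc, ksp, NULL) == -1) {
		(void) kstat_close(kc);
		return (1);
	}

	if ((kn = kstat_data_lookup(ksp, "obytes64")) != NULL)
		(void) printf("obytes64 = %llu\n",
		    (u_longlong_t)kn->value.ui64);
	if ((kn = kstat_data_lookup(ksp, "zonename")) != NULL)
		(void) printf("zonename = %s\n", KSTAT_NAMED_STR_PTR(kn));

	(void) kstat_close(kc);
	return (0);
}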
@@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = { int dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - kstat_named_t *knp; - uint_t i; - uint64_t val; + dls_kstat_t *dkp = ksp->ks_data; if (rw != KSTAT_READ) return (EACCES); - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); - - switch (i_dls_si[i].msi_type) { - case KSTAT_DATA_UINT64: - knp->value.ui64 = val; - break; - case KSTAT_DATA_UINT32: - knp->value.ui32 = (uint32_t)val; - break; - default: - ASSERT(B_FALSE); - } - - knp++; - } + dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED); + dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIRCV); + dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTRCV); + dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIXMT); + dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTXMT); + dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NORCVBUF); + dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS); + dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NOXMTBUF); + dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS); + dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_COLLISIONS); + dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_LINK_STATE); /* * Ethernet specific kstat "link_duplex" */ if (dlp->dl_mip->mi_nativemedia != DL_ETHER) { - knp->value.ui32 = LINK_DUPLEX_UNKNOWN; + dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN; } else { - val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); - knp->value.ui32 = (uint32_t)val; + dkp->dk_link_duplex.value.ui32 = + (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); } - knp++; - knp->value.ui32 = dlp->dl_unknowns; + + dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns; return (0); } @@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) int dls_stat_create(const char *module, int instance, const char *name, zoneid_t zoneid, int (*update)(struct kstat *, int), void *private, - kstat_t **kspp) + kstat_t **kspp, zoneid_t newzoneid) { kstat_t *ksp; - kstat_named_t *knp; - uint_t i; + zone_t *zone; + dls_kstat_t *dkp; if ((ksp = kstat_create_zone(module, instance, name, "net", - KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) { + KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) { return (EINVAL); } ksp->ks_update = update; ksp->ks_private = private; + dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP); + if ((zone = zone_find_by_id(newzoneid)) != NULL) { + ksp->ks_data_size += strlen(zone->zone_name) + 1; + } - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - kstat_named_init(knp, i_dls_si[i].msi_name, - 
i_dls_si[i].msi_type); - knp++; + kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING); + + if (zone != NULL) { + kstat_named_setstr(&dkp->dk_zonename, zone->zone_name); + zone_rele(zone); } - kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32); - kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32); kstat_install(ksp); *kspp = ksp; return (0); } + +void +dls_stat_delete(kstat_t *ksp) +{ + void *data; + if (ksp != NULL) { + data = ksp->ks_data; + kstat_delete(ksp); + kmem_free(data, sizeof (dls_kstat_t)); + } +} diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE new file mode 100644 index 0000000000..00aefb6f51 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE @@ -0,0 +1,32 @@ +/* + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..ac6d2d1b15 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +DR_SAS DRIVER diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c new file mode 100644 index 0000000000..02354c9b16 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.c @@ -0,0 +1,5510 @@ +/* + * dr_sas.c: source for dr_sas driver + * + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Arun Chandrashekhar + * Manju R + * Rajesh Prabhakaran + * Seokmann Ju + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/pci.h> +#include <sys/scsi/scsi.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/atomic.h> +#include <sys/signal.h> +#include <sys/fs/dv_node.h> /* devfs_clean */ + +#include "dr_sas.h" + +/* + * FMA header files + */ +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/io/ddi.h> + +/* + * Local static data + */ +static void *drsas_state = NULL; +static int debug_level_g = CL_NONE; + +#pragma weak scsi_hba_open +#pragma weak scsi_hba_close +#pragma weak scsi_hba_ioctl + +static ddi_dma_attr_t drsas_generic_dma_attr = { + DMA_ATTR_V0, /* dma_attr_version */ + 0, /* low DMA address range */ + 0xFFFFFFFFU, /* high DMA address range */ + 0xFFFFFFFFU, /* DMA counter register */ + 8, /* DMA address alignment */ + 0x07, /* DMA burstsizes */ + 1, /* min DMA size */ + 0xFFFFFFFFU, /* max DMA size */ + 0xFFFFFFFFU, /* segment boundary */ + DRSAS_MAX_SGE_CNT, /* dma_attr_sglen */ + 512, /* granularity of device */ + 0 /* bus specific DMA flags */ +}; + +int32_t drsas_max_cap_maxxfer = 0x1000000; + +/* + * cb_ops contains base level routines + */ +static struct cb_ops drsas_cb_ops = { + drsas_open, /* open */ + drsas_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + drsas_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + nodev, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_HOTPLUG, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +/* + * dev_ops contains configuration routines + */ +static struct dev_ops drsas_ops = { + DEVO_REV, /* rev, */ + 0, /* refcnt */ + drsas_getinfo, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + drsas_attach, /* attach */ + drsas_detach, /* detach */ + drsas_reset, /* reset */ + &drsas_cb_ops, /* char/block ops */ + NULL, /* bus ops */ + NULL, /* power */ + ddi_quiesce_not_supported, /* quiesce */ +}; + +char _depends_on[] = "misc/scsi"; + +static struct modldrv modldrv = { + &mod_driverops, /* module type - driver */ + DRSAS_VERSION, + &drsas_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* ml_rev - must be MODREV_1 */ + &modldrv, /* ml_linkage */ + NULL /* end of driver linkage */ +}; + +static struct ddi_device_acc_attr endian_attr = { + DDI_DEVICE_ATTR_V0, + DDI_STRUCTURE_LE_ACC, + DDI_STRICTORDER_ACC +}; + + +/* + * ************************************************************************** * + * * + * common entry points - for loadable kernel modules * + * * + * ************************************************************************** * + */ + +int +_init(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ret = ddi_soft_state_init(&drsas_state, + sizeof (struct drsas_instance), 0); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state")); + return (ret); + } + + if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba")); + ddi_soft_state_fini(&drsas_state); + return (ret); + } + + ret = mod_install(&modlinkage); + + if 
(ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed")); + scsi_hba_fini(&modlinkage); + ddi_soft_state_fini(&drsas_state); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) + return (ret); + + scsi_hba_fini(&modlinkage); + + ddi_soft_state_fini(&drsas_state); + + return (ret); +} + + +/* + * ************************************************************************** * + * * + * common entry points - for autoconfiguration * + * * + * ************************************************************************** * + */ + +static int +drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance_no; + int nregs; + uint8_t added_isr_f = 0; + uint8_t added_soft_isr_f = 0; + uint8_t create_devctl_node_f = 0; + uint8_t create_scsi_node_f = 0; + uint8_t create_ioc_node_f = 0; + uint8_t tran_alloc_f = 0; + uint8_t irq; + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + uint16_t command; + off_t reglength = 0; + int intr_types = 0; + char *data; + int msi_enable = 0; + + scsi_hba_tran_t *tran; + ddi_dma_attr_t tran_dma_attr; + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + /* + * check to see whether this device is in a DMA-capable slot. + */ + if (ddi_slaveonly(dip) == DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Device in slave-only slot, unused", + instance_no)); + return (DDI_FAILURE); + } + + switch (cmd) { + case DDI_ATTACH: + con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH")); + /* allocate the soft state for the instance */ + if (ddi_soft_state_zalloc(drsas_state, instance_no) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to allocate soft state", + instance_no)); + + return (DDI_FAILURE); + } + + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + if (instance == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Bad soft state", instance_no)); + + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + bzero((caddr_t)instance, + sizeof (struct drsas_instance)); + + instance->func_ptr = kmem_zalloc( + sizeof (struct drsas_func_ptr), KM_SLEEP); + ASSERT(instance->func_ptr); + + /* Setup the PCI configuration space handles */ + if (pci_config_setup(dip, &instance->pci_handle) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: pci config setup failed ", + instance_no)); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to get registers.")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + vendor_id = pci_config_get16(instance->pci_handle, + PCI_CONF_VENID); + device_id = pci_config_get16(instance->pci_handle, + PCI_CONF_DEVID); + + subsysvid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBVENID); + subsysid = 
pci_config_get16(instance->pci_handle, + PCI_CONF_SUBSYSID); + + pci_config_put16(instance->pci_handle, PCI_CONF_COMM, + (pci_config_get16(instance->pci_handle, + PCI_CONF_COMM) | PCI_COMM_ME)); + irq = pci_config_get8(instance->pci_handle, + PCI_CONF_ILINE); + + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", + instance_no, vendor_id, device_id, subsysvid, + subsysid, irq, DRSAS_VERSION)); + + /* enable bus-mastering */ + command = pci_config_get16(instance->pci_handle, + PCI_CONF_COMM); + + if (!(command & PCI_COMM_ME)) { + command |= PCI_COMM_ME; + + pci_config_put16(instance->pci_handle, + PCI_CONF_COMM, command); + + con_log(CL_ANN, (CE_CONT, "dr_sas%d: " + "enable bus-mastering", instance_no)); + } else { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "bus-mastering already set", instance_no)); + } + + /* initialize function pointers */ + if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) || + (device_id == PCI_DEVICE_ID_LSI_2108V)) { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "2108V/DE detected", instance_no)); + instance->func_ptr->read_fw_status_reg = + read_fw_status_reg_ppc; + instance->func_ptr->issue_cmd = issue_cmd_ppc; + instance->func_ptr->issue_cmd_in_sync_mode = + issue_cmd_in_sync_mode_ppc; + instance->func_ptr->issue_cmd_in_poll_mode = + issue_cmd_in_poll_mode_ppc; + instance->func_ptr->enable_intr = + enable_intr_ppc; + instance->func_ptr->disable_intr = + disable_intr_ppc; + instance->func_ptr->intr_ack = intr_ack_ppc; + } else { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Invalid device detected")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + instance->baseaddress = pci_config_get32( + instance->pci_handle, PCI_CONF_BASE0); + instance->baseaddress &= 0x0fffc; + + instance->dip = dip; + instance->vendor_id = vendor_id; + instance->device_id = device_id; + instance->subsysvid = subsysvid; + instance->subsysid = subsysid; + instance->instance = instance_no; + + /* Initialize FMA */ + instance->fm_capabilities = ddi_prop_get_int( + DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, + "fm-capable", DDI_FM_EREPORT_CAPABLE | + DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE + | DDI_FM_ERRCB_CAPABLE); + + drsas_fm_init(instance); + + /* Initialize Interrupts */ + if ((ddi_dev_regsize(instance->dip, + REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || + reglength < MINIMUM_MFI_MEM_SZ) { + return (DDI_FAILURE); + } + if (reglength > DEFAULT_MFI_MEM_SZ) { + reglength = DEFAULT_MFI_MEM_SZ; + con_log(CL_DLEVEL1, (CE_NOTE, + "dr_sas: register length to map is " + "0x%lx bytes", reglength)); + } + if (ddi_regs_map_setup(instance->dip, + REGISTER_SET_IO_2108, &instance->regmap, 0, + reglength, &endian_attr, &instance->regmap_handle) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: couldn't map control registers")); + goto fail_attach; + } + + /* + * Disable Interrupt Now. 
+ * Setup Software interrupt + */ + instance->func_ptr->disable_intr(instance); + + msi_enable = 0; + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, + "drsas-enable-msi", &data) == DDI_SUCCESS) { + if (strncmp(data, "yes", 3) == 0) { + msi_enable = 1; + con_log(CL_ANN, (CE_WARN, + "msi_enable = %d ENABLED", + msi_enable)); + } + ddi_prop_free(data); + } + + con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d", + msi_enable)); + + /* Check for all supported interrupt types */ + if (ddi_intr_get_supported_types( + dip, &intr_types) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "ddi_intr_get_supported_types() failed")); + goto fail_attach; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "ddi_intr_get_supported_types() ret: 0x%x", + intr_types)); + + /* Initialize and Setup Interrupt handler */ + if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSIX interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSIX; + } else if (msi_enable && (intr_types & + DDI_INTR_TYPE_MSI)) { + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_MSI) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "MSI interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_MSI; + } else if (intr_types & DDI_INTR_TYPE_FIXED) { + msi_enable = 0; + if (drsas_add_intrs(instance, + DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "FIXED interrupt query failed")); + goto fail_attach; + } + instance->intr_type = DDI_INTR_TYPE_FIXED; + } else { + con_log(CL_ANN, (CE_WARN, "Device cannot " + "suppport either FIXED or MSI/X " + "interrupts")); + goto fail_attach; + } + + added_isr_f = 1; + + /* setup the mfi based low level driver */ + if (init_mfi(instance) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "could not initialize the low level driver")); + + goto fail_attach; + } + + /* Initialize all Mutex */ + INIT_LIST_HEAD(&instance->completed_pool_list); + mutex_init(&instance->completed_pool_mtx, + "completed_pool_mtx", MUTEX_DRIVER, + DDI_INTR_PRI(instance->intr_pri)); + + mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL); + + mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx", + MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri)); + + /* Register our soft-isr for highlevel interrupts. 
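The attach path above only selects an interrupt type (MSI-X, then MSI, then FIXED); the actual registration lives in drsas_add_intrs(), which is outside this hunk. As a reminder of the DDI sequence such a helper typically follows, here is a generic single-vector sketch; it is simplified and is not the driver's actual implementation.

/*
 * Generic sketch: allocate, prime and enable one interrupt of the
 * given type. Error handling is reduced to the minimum.
 */
static int
add_one_intr_sketch(dev_info_t *dip, int intr_type,
    ddi_intr_handle_t *hdlp, uint_t *prip,
    uint_t (*isr)(caddr_t, caddr_t), caddr_t arg)
{
	int count, actual;

	if (ddi_intr_get_nintrs(dip, intr_type, &count) != DDI_SUCCESS ||
	    count < 1)
		return (DDI_FAILURE);

	if (ddi_intr_alloc(dip, hdlp, intr_type, 0, 1, &actual,
	    DDI_INTR_ALLOC_NORMAL) != DDI_SUCCESS || actual < 1)
		return (DDI_FAILURE);

	if (ddi_intr_get_pri(hdlp[0], prip) != DDI_SUCCESS ||
	    ddi_intr_add_handler(hdlp[0], isr, arg, NULL) != DDI_SUCCESS) {
		(void) ddi_intr_free(hdlp[0]);
		return (DDI_FAILURE);
	}

	return (ddi_intr_enable(hdlp[0]) == DDI_SUCCESS ?
	    DDI_SUCCESS : DDI_FAILURE);
}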
*/ + instance->isr_level = instance->intr_pri; + if (instance->isr_level == HIGH_LEVEL_INTR) { + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, + &instance->soft_intr_id, NULL, NULL, + drsas_softintr, (caddr_t)instance) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + " Software ISR did not register")); + + goto fail_attach; + } + + added_soft_isr_f = 1; + } + + /* Allocate a transport structure */ + tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); + + if (tran == NULL) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_tran_alloc failed")); + goto fail_attach; + } + + tran_alloc_f = 1; + + instance->tran = tran; + + tran->tran_hba_private = instance; + tran->tran_tgt_init = drsas_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + tran->tran_tgt_free = drsas_tran_tgt_free; + tran->tran_init_pkt = drsas_tran_init_pkt; + tran->tran_start = drsas_tran_start; + tran->tran_abort = drsas_tran_abort; + tran->tran_reset = drsas_tran_reset; + tran->tran_getcap = drsas_tran_getcap; + tran->tran_setcap = drsas_tran_setcap; + tran->tran_destroy_pkt = drsas_tran_destroy_pkt; + tran->tran_dmafree = drsas_tran_dmafree; + tran->tran_sync_pkt = drsas_tran_sync_pkt; + tran->tran_bus_config = drsas_tran_bus_config; + + tran_dma_attr = drsas_generic_dma_attr; + tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; + + /* Attach this instance of the hba */ + if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_attach failed")); + + goto fail_attach; + } + + /* create devctl node for cfgadm command */ + if (ddi_create_minor_node(dip, "devctl", + S_IFCHR, INST2DEVCTL(instance_no), + DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create devctl node.")); + + goto fail_attach; + } + + create_devctl_node_f = 1; + + /* create scsi node for cfgadm command */ + if (ddi_create_minor_node(dip, "scsi", S_IFCHR, + INST2SCSI(instance_no), + DDI_NT_SCSI_ATTACHMENT_POINT, 0) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create scsi node.")); + + goto fail_attach; + } + + create_scsi_node_f = 1; + + (void) sprintf(instance->iocnode, "%d:lsirdctl", + instance_no); + + /* + * Create a node for applications + * for issuing ioctl to the driver. + */ + if (ddi_create_minor_node(dip, instance->iocnode, + S_IFCHR, INST2LSIRDCTL(instance_no), + DDI_PSEUDO, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create ioctl node.")); + + goto fail_attach; + } + + create_ioc_node_f = 1; + + /* Create a taskq to handle dr events */ + if ((instance->taskq = ddi_taskq_create(dip, + "drsas_dr_taskq", 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create taskq ")); + instance->taskq = NULL; + goto fail_attach; + } + + /* enable interrupt */ + instance->func_ptr->enable_intr(instance); + + /* initiate AEN */ + if (start_mfi_aen(instance)) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to initiate AEN.")); + goto fail_initiate_aen; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "AEN started for instance %d.", instance_no)); + + /* Finally! We are on the air. 
*/ + ddi_report_dev(dip); + + if (drsas_check_acc_handle(instance->regmap_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + if (drsas_check_acc_handle(instance->pci_handle) != + DDI_SUCCESS) { + goto fail_attach; + } + instance->dr_ld_list = + kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld), + KM_SLEEP); + break; + case DDI_PM_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_PM_RESUME")); + break; + case DDI_RESUME: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: DDI_RESUME")); + break; + default: + con_log(CL_ANN, (CE_WARN, + "dr_sas: invalid attach cmd=%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); + +fail_initiate_aen: +fail_attach: + if (create_devctl_node_f) { + ddi_remove_minor_node(dip, "devctl"); + } + + if (create_scsi_node_f) { + ddi_remove_minor_node(dip, "scsi"); + } + + if (create_ioc_node_f) { + ddi_remove_minor_node(dip, instance->iocnode); + } + + if (tran_alloc_f) { + scsi_hba_tran_free(tran); + } + + + if (added_soft_isr_f) { + ddi_remove_softintr(instance->soft_intr_id); + } + + if (added_isr_f) { + drsas_rem_intrs(instance); + } + + if (instance && instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + ddi_soft_state_free(drsas_state, instance_no); + + con_log(CL_ANN, (CE_NOTE, + "dr_sas: return failure from drsas_attach")); + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp) +{ + int rval; + int drsas_minor = getminor((dev_t)arg); + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + instance = (struct drsas_instance *) + ddi_get_soft_state(drsas_state, + MINOR2INST(drsas_minor)); + + if (instance == NULL) { + *resultp = NULL; + rval = DDI_FAILURE; + } else { + *resultp = instance->dip; + rval = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)instance; + rval = DDI_SUCCESS; + break; + default: + *resultp = NULL; + rval = DDI_FAILURE; + } + + return (rval); +} + +static int +drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state, + instance_no); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d could not get instance in detach", + instance_no)); + + return (DDI_FAILURE); + } + + con_log(CL_ANN, (CE_NOTE, + "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x", + instance_no, instance->vendor_id, instance->device_id, + instance->subsysvid, instance->subsysid)); + + switch (cmd) { + case DDI_DETACH: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_DETACH")); + + if (scsi_hba_detach(dip) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas:%d failed to detach", + instance_no)); + + return (DDI_FAILURE); + } + + scsi_hba_tran_free(instance->tran); + + flush_cache(instance); + + if (abort_aen_cmd(instance, instance->aen_cmd)) { + con_log(CL_ANN, (CE_WARN, "drsas_detach: " + "failed to abort prevous AEN command")); + + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + if (instance->isr_level == HIGH_LEVEL_INTR) { + 
ddi_remove_softintr(instance->soft_intr_id); + } + + drsas_rem_intrs(instance); + + if (instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + kmem_free(instance->dr_ld_list, MRDRV_MAX_LD + * sizeof (struct drsas_ld)); + free_space_for_mfi(instance); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + + ddi_soft_state_free(drsas_state, instance_no); + break; + case DDI_PM_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_PM_SUSPEND")); + + break; + case DDI_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_SUSPEND")); + + break; + default: + con_log(CL_ANN, (CE_WARN, + "invalid detach command:0x%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * ************************************************************************** * + * * + * common entry points - for character driver types * + * * + * ************************************************************************** * + */ +static int +drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* Check root permissions */ + if (drv_priv(credp) != 0) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Non-root ioctl access denied!")); + return (EPERM); + } + + /* Verify we are being opened as a character device */ + if (otyp != OTYP_CHR) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: ioctl node must be a char node")); + return (EINVAL); + } + + if (ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev))) + == NULL) { + return (ENXIO); + } + + if (scsi_hba_open) { + rval = scsi_hba_open(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* no need for locks! 
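drsas_open() maps the opened minor number back to a soft-state instance with MINOR2INST(), while attach created several minor nodes (devctl, scsi, ioctl) per instance. The following is a hypothetical, standalone sketch of one way such a minor-number space can be multiplexed; the driver's real INST2DEVCTL/INST2SCSI/INST2LSIRDCTL/MINOR2INST encodings live in its header and may differ:

#include <stdio.h>

/*
 * Hypothetical encoding only: low 2 bits select the node type, the rest
 * hold the instance number. Not the dr_sas header's actual macros.
 */
#define	EX_NODE_DEVCTL	0
#define	EX_NODE_SCSI	1
#define	EX_NODE_IOCTL	2

#define	EX_INST2MINOR(inst, type)	(((inst) << 2) | (type))
#define	EX_MINOR2INST(minor)		((minor) >> 2)
#define	EX_MINOR2TYPE(minor)		((minor) & 0x3)

int
main(void)
{
	int minor = EX_INST2MINOR(5, EX_NODE_IOCTL);

	printf("minor=%d instance=%d type=%d\n",
	    minor, EX_MINOR2INST(minor), EX_MINOR2TYPE(minor));
	return (0);
}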
*/ + + if (scsi_hba_close) { + rval = scsi_hba_close(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int rval = 0; + + struct drsas_instance *instance; + struct drsas_ioctl *ioctl; + struct drsas_aen aen; + int i; + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev))); + + if (instance == NULL) { + /* invalid minor number */ + con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found.")); + return (ENXIO); + } + + ioctl = (struct drsas_ioctl *)kmem_zalloc(sizeof (struct drsas_ioctl), + KM_SLEEP); + ASSERT(ioctl); + + switch ((uint_t)cmd) { + case DRSAS_IOCTL_FIRMWARE: + for (i = 0; i < sizeof (struct drsas_ioctl); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)ioctl+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "drsas_ioctl " + "ERROR IOCTL copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) { + rval = handle_drv_ioctl(instance, ioctl, mode); + } else { + rval = handle_mfi_ioctl(instance, ioctl, mode); + } + for (i = 0; i < sizeof (struct drsas_ioctl) - 1; i++) { + if (ddi_copyout((uint8_t *)ioctl+i, + (uint8_t *)arg+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: ddi_copyout " + "failed")); + rval = 1; + break; + } + } + + break; + case DRSAS_IOCTL_AEN: + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)&aen+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ERROR AEN copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + + rval = handle_mfi_aen(instance, &aen); + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyout((uint8_t *)&aen + i, + (uint8_t *)arg + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ddi_copyout failed")); + rval = 1; + break; + } + } + + break; + default: + rval = scsi_hba_ioctl(dev, cmd, arg, + mode, credp, rvalp); + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: " + "scsi_hba_ioctl called, ret = %x.", rval)); + } + + kmem_free(ioctl, sizeof (struct drsas_ioctl)); + return (rval); +} + +/* + * ************************************************************************** * + * * + * common entry points - for block driver types * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + instance_no = ddi_get_instance(dip); + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter " + "in reset", instance_no)); + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + instance_no)); + + flush_cache(instance); + + return (DDI_SUCCESS); +} + + +/* + * ************************************************************************** * + * * + * entry points (SCSI HBA) * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *tran, struct scsi_device *sd) +{ + struct drsas_instance 
*instance; + uint16_t tgt = sd->sd_address.a_target; + uint8_t lun = sd->sd_address.a_lun; + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d", + tgt, lun)); + + instance = ADDR2MR(&sd->sd_address); + + if (ndi_dev_is_persistent_node(tgt_dip) == 0) { + (void) ndi_merge_node(tgt_dip, drsas_name_node); + ddi_set_name_addr(tgt_dip, NULL); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in " + "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d", + tgt, lun)); + return (DDI_FAILURE); + } + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init dev_dip %p tgt_dip %p", + (void *)instance->dr_ld_list[tgt].dip, (void *)tgt_dip)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == NULL && + strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) { + instance->dr_ld_list[tgt].dip = tgt_dip; + instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN; + } + } + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static void +drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + struct drsas_instance *instance; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + + instance = ADDR2MR(&sd->sd_address); + + con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == tgt_dip) { + instance->dr_ld_list[tgt].dip = NULL; + } + } +} + +static dev_info_t * +drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun) +{ + dev_info_t *child = NULL; + char addr[SCSI_MAXNAMELEN]; + char tmp[MAXNAMELEN]; + + (void) sprintf(addr, "%x,%x", tgt, lun); + for (child = ddi_get_child(instance->dip); child; + child = ddi_get_next_sibling(child)) { + + if (drsas_name_node(child, tmp, MAXNAMELEN) != + DDI_SUCCESS) { + continue; + } + + if (strcmp(addr, tmp) == 0) { + break; + } + } + con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p", + (void *)child)); + return (child); +} + +static int +drsas_name_node(dev_info_t *dip, char *name, int len) +{ + int tgt, lun; + + tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "target", -1); + con_log(CL_ANN1, (CE_NOTE, + "drsas_name_node: dip %p tgt %d", (void *)dip, tgt)); + if (tgt == -1) { + return (DDI_FAILURE); + } + lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "lun", -1); + con_log(CL_ANN1, + (CE_NOTE, "drsas_name_node: tgt %d lun %d", tgt, lun)); + if (lun == -1) { + return (DDI_FAILURE); + } + (void) snprintf(name, len, "%x,%x", tgt, lun); + return (DDI_SUCCESS); +} + +static struct scsi_pkt * +drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) +{ + struct scsa_cmd *acmd; + struct drsas_instance *instance; + struct scsi_pkt *new_pkt; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ADDR2MR(ap); + + /* step #1 : pkt allocation */ + if (pkt == NULL) { + pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen, + tgtlen, sizeof (struct scsa_cmd), callback, arg); + if (pkt == NULL) { + return (NULL); + } + + acmd = PKT2CMD(pkt); + + /* + * Initialize the new pkt - we redundantly initialize + * all the fields for illustrative purposes. 
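drsas_name_node() above formats a child's unit address as hexadecimal "target,lun", and drsas_find_child() locates a child by comparing that string. A small standalone sketch of the round trip, with made-up values:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char addr[32], node[32];
	unsigned int tgt = 0x1a, lun = 0;

	/* what drsas_find_child() builds from the requested tgt/lun */
	(void) snprintf(addr, sizeof (addr), "%x,%x", tgt, lun);

	/* what drsas_name_node() derives from a child's target/lun props */
	(void) snprintf(node, sizeof (node), "%x,%x", 0x1a, 0);

	printf("addr=%s match=%s\n", addr,
	    strcmp(addr, node) == 0 ? "yes" : "no");
	return (0);
}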
+ */ + acmd->cmd_pkt = pkt; + acmd->cmd_flags = 0; + acmd->cmd_scblen = statuslen; + acmd->cmd_cdblen = cmdlen; + acmd->cmd_dmahandle = NULL; + acmd->cmd_ncookies = 0; + acmd->cmd_cookie = 0; + acmd->cmd_cookiecnt = 0; + acmd->cmd_nwin = 0; + + pkt->pkt_address = *ap; + pkt->pkt_comp = (void (*)())NULL; + pkt->pkt_flags = 0; + pkt->pkt_time = 0; + pkt->pkt_resid = 0; + pkt->pkt_state = 0; + pkt->pkt_statistics = 0; + pkt->pkt_reason = 0; + new_pkt = pkt; + } else { + acmd = PKT2CMD(pkt); + new_pkt = NULL; + } + + /* step #2 : dma allocation/move */ + if (bp && bp->b_bcount != 0) { + if (acmd->cmd_dmahandle == NULL) { + if (drsas_dma_alloc(instance, pkt, bp, flags, + callback) == DDI_FAILURE) { + if (new_pkt) { + scsi_hba_pkt_free(ap, new_pkt); + } + return ((struct scsi_pkt *)NULL); + } + } else { + if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) { + return ((struct scsi_pkt *)NULL); + } + } + } + + return (pkt); +} + +static int +drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt) +{ + uchar_t cmd_done = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x", + __func__, __LINE__, pkt->pkt_cdbp[0])); + + pkt->pkt_reason = CMD_CMPLT; + *pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */ + + cmd = build_cmd(instance, ap, pkt, &cmd_done); + + /* + * Check if the command is already completed by the drsas_build_cmd() + * routine. In which case the busy_flag would be clear and scb will be + * NULL and appropriate reason provided in pkt_reason field + */ + if (cmd_done) { + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_scbp[0] = STATUS_GOOD; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + return (TRAN_ACCEPT); + } + + if (cmd == NULL) { + return (TRAN_BUSY); + } + + if ((pkt->pkt_flags & FLAG_NOINTR) == 0) { + if (instance->fw_outstanding > instance->max_fw_cmds) { + con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy")); + return_mfi_pkt(instance, cmd); + return (TRAN_BUSY); + } + + /* Synchronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV); + + instance->func_ptr->issue_cmd(cmd, instance); + + } else { + struct drsas_header *hdr = &cmd->frame->hdr; + + cmd->sync_cmd = DRSAS_TRUE; + + instance->func_ptr-> issue_cmd_in_poll_mode(instance, cmd); + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, + &hdr->cmd_status)) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + + case MFI_STAT_SCSI_DONE_WITH_ERROR: + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + + ((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1; + break; + + case MFI_STAT_DEVICE_NOT_FOUND: + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + + default: + ((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1; + } + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + if (pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + } + + return (TRAN_ACCEPT); +} + +/*ARGSUSED*/ +static int +drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* abort command not supported by H/W */ + + return (DDI_FAILURE); +} + +/*ARGSUSED*/ +static int +drsas_tran_reset(struct 
scsi_address *ap, int level) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* reset command not supported by H/W */ + + return (DDI_FAILURE); + +} + +/*ARGSUSED*/ +static int +drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) +{ + int rval = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* we do allow inquiring about capabilities for other targets */ + if (cap == NULL) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + /* Limit to 16MB max transfer */ + rval = drsas_max_cap_maxxfer; + break; + case SCSI_CAP_MSG_OUT: + rval = 1; + break; + case SCSI_CAP_DISCONNECT: + rval = 0; + break; + case SCSI_CAP_SYNCHRONOUS: + rval = 0; + break; + case SCSI_CAP_WIDE_XFER: + rval = 1; + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_UNTAGGED_QING: + rval = 1; + break; + case SCSI_CAP_PARITY: + rval = 1; + break; + case SCSI_CAP_INITIATOR_ID: + rval = instance->init_id; + break; + case SCSI_CAP_ARQ: + rval = 1; + break; + case SCSI_CAP_LINKED_CMDS: + rval = 0; + break; + case SCSI_CAP_RESET_NOTIFICATION: + rval = 1; + break; + case SCSI_CAP_GEOMETRY: + rval = -1; + + break; + default: + con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x", + scsi_hba_lookup_capstr(cap))); + rval = -1; + break; + } + + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) +{ + int rval = 1; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* We don't allow setting capabilities for other targets */ + if (cap == NULL || whom == 0) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_MSG_OUT: + case SCSI_CAP_PARITY: + case SCSI_CAP_LINKED_CMDS: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_ARQ: + /* + * None of these are settable via + * the capability interface. 
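drsas_tran_getcap() follows the usual getcap convention: -1 for an unknown or unanswerable capability, 0 or 1 for boolean capabilities, and a concrete value for limits such as SCSI_CAP_DMA_MAX. A simplified, string-keyed sketch of that convention (the real code dispatches on scsi_hba_lookup_capstr() indices):

#include <stdio.h>
#include <string.h>

static int
example_getcap(const char *cap)
{
	if (cap == NULL)
		return (-1);
	if (strcmp(cap, "dma-max") == 0)
		return (16 * 1024 * 1024);	/* 16MB transfer limit */
	if (strcmp(cap, "tagged-qing") == 0)
		return (1);			/* supported */
	if (strcmp(cap, "disconnect") == 0)
		return (0);			/* not supported */
	return (-1);				/* unknown capability */
}

int
main(void)
{
	printf("dma-max=%d unknown=%d\n",
	    example_getcap("dma-max"), example_getcap("geometry"));
	return (0);
}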
+ */ + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_SECTOR_SIZE: + rval = 1; + break; + + case SCSI_CAP_TOTAL_SECTORS: + rval = 1; + break; + default: + rval = -1; + break; + } + + return (rval); +} + +static void +drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } + + /* free the pkt */ + scsi_hba_pkt_free(ap, pkt); +} + +/*ARGSUSED*/ +static void +drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } +} + +/*ARGSUSED*/ +static void +drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, acmd->cmd_dma_offset, + acmd->cmd_dma_len, (acmd->cmd_flags & CFLAG_DMASEND) ? + DDI_DMA_SYNC_FORDEV : DDI_DMA_SYNC_FORCPU); + } +} + +/* + * drsas_isr(caddr_t) + * + * The Interrupt Service Routine + * + * Collect status for all completed commands and do callback + * + */ +static uint_t +drsas_isr(struct drsas_instance *instance) +{ + int need_softintr; + uint32_t producer; + uint32_t consumer; + uint32_t context; + + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ASSERT(instance); + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !instance->func_ptr->intr_ack(instance)) { + return (DDI_INTR_UNCLAIMED); + } + + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->producer); + consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer); + + con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ", + producer, consumer)); + if (producer == consumer) { + con_log(CL_ANN1, (CE_WARN, "producer = consumer case")); + return (DDI_INTR_UNCLAIMED); + } + mutex_enter(&instance->completed_pool_mtx); + + while (consumer != producer) { + context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + &instance->reply_queue[consumer]); + cmd = instance->cmd_list[context]; + mlist_add_tail(&cmd->list, &instance->completed_pool_list); + + consumer++; + if (consumer == (instance->max_fw_cmds + 1)) { + consumer = 0; + } + } + + mutex_exit(&instance->completed_pool_mtx); + + ddi_put32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer, consumer); + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + if (instance->softint_running) { + need_softintr = 0; + } else { + need_softintr = 1; + } + + if 
(instance->isr_level == HIGH_LEVEL_INTR) { + if (need_softintr) { + ddi_trigger_softintr(instance->soft_intr_id); + } + } else { + /* + * Not a high-level interrupt, therefore call the soft level + * interrupt explicitly + */ + (void) drsas_softintr(instance); + } + + return (DDI_INTR_CLAIMED); +} + + +/* + * ************************************************************************** * + * * + * libraries * + * * + * ************************************************************************** * + */ +/* + * get_mfi_pkt : Get a command from the free pool + * After successful allocation, the caller of this routine + * must clear the frame buffer (memset to zero) before + * using the packet further. + * + * ***** Note ***** + * After clearing the frame buffer the context id of the + * frame buffer SHOULD be restored back. + */ +static struct drsas_cmd * +get_mfi_pkt(struct drsas_instance *instance) +{ + mlist_t *head = &instance->cmd_pool_list; + struct drsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct drsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) + cmd->pkt = NULL; + mutex_exit(&instance->cmd_pool_mtx); + + return (cmd); +} + +/* + * return_mfi_pkt : Return a cmd to free command pool + */ +static void +return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + mlist_add(&cmd->list, &instance->cmd_pool_list); + + mutex_exit(&instance->cmd_pool_mtx); +} + +/* + * destroy_mfi_frame_pool + */ +static void +destroy_mfi_frame_pool(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + struct drsas_cmd *cmd; + + /* return all frames to pool */ + for (i = 0; i < max_cmd+1; i++) { + + cmd = instance->cmd_list[i]; + + if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED) + (void) drsas_free_dma_obj(instance, cmd->frame_dma_obj); + + cmd->frame_dma_obj_status = DMA_OBJ_FREED; + } + +} + +/* + * create_mfi_frame_pool + */ +static int +create_mfi_frame_pool(struct drsas_instance *instance) +{ + int i = 0; + int cookie_cnt; + uint16_t max_cmd; + uint16_t sge_sz; + uint32_t sgl_sz; + uint32_t tot_frame_size; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + sge_sz = sizeof (struct drsas_sge64); + + /* calculated the number of 64byte frames required for SGL */ + sgl_sz = sge_sz * instance->max_num_sge; + tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH; + + con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: " + "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size)); + + while (i < max_cmd+1) { + cmd = instance->cmd_list[i]; + + cmd->frame_dma_obj.size = tot_frame_size; + cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr; + cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1; + cmd->frame_dma_obj.dma_attr.dma_attr_align = 64; + + + cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC); + + if (cookie_cnt == -1 || cookie_cnt > 1) { + con_log(CL_ANN, (CE_WARN, + "create_mfi_frame_pool: could not alloc.")); + return (DDI_FAILURE); + } + + bzero(cmd->frame_dma_obj.buffer, tot_frame_size); + + cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED; + cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer; + cmd->frame_phys_addr = 
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address; + + cmd->sense = (uint8_t *)(((unsigned long) + cmd->frame_dma_obj.buffer) + + tot_frame_size - SENSE_LENGTH); + cmd->sense_phys_addr = + cmd->frame_dma_obj.dma_cookie[0].dmac_address + + tot_frame_size - SENSE_LENGTH; + + if (!cmd->frame || !cmd->sense) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: pci_pool_alloc failed")); + + return (ENOMEM); + } + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.context, cmd->index); + i++; + + con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x", + cmd->index, cmd->frame_phys_addr)); + } + + return (DDI_SUCCESS); +} + +/* + * free_additional_dma_buffer + */ +static void +free_additional_dma_buffer(struct drsas_instance *instance) +{ + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + + if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_evt_detail_obj); + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + } +} + +/* + * alloc_additional_dma_buffer + */ +static int +alloc_additional_dma_buffer(struct drsas_instance *instance) +{ + uint32_t reply_q_sz; + uint32_t internal_buf_size = PAGESIZE*2; + + /* max cmds plus 1 + producer & consumer */ + reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2); + + instance->mfi_internal_dma_obj.size = internal_buf_size; + instance->mfi_internal_dma_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: could not alloc reply queue")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size); + + instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED; + + instance->producer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer); + instance->consumer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 4); + instance->reply_queue = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 8); + instance->internal_buf = (caddr_t)(((unsigned long) + instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8); + instance->internal_buf_dmac_add = + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + + (reply_q_sz + 8); + instance->internal_buf_size = internal_buf_size - + (reply_q_sz + 8); + + /* allocate evt_detail */ + instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail); + instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_evt_detail_obj.buffer, + sizeof (struct drsas_evt_detail)); + + instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; + + return 
(DDI_SUCCESS); +} + +/* + * free_space_for_mfi + */ +static void +free_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + /* already freed */ + if (instance->cmd_list == NULL) { + return; + } + + free_additional_dma_buffer(instance); + + /* first free the MFI frame pool */ + destroy_mfi_frame_pool(instance); + + /* free all the commands in the cmd_list */ + for (i = 0; i < instance->max_fw_cmds+1; i++) { + kmem_free(instance->cmd_list[i], + sizeof (struct drsas_cmd)); + + instance->cmd_list[i] = NULL; + } + + /* free the cmd_list buffer itself */ + kmem_free(instance->cmd_list, + sizeof (struct drsas_cmd *) * (max_cmd+1)); + + instance->cmd_list = NULL; + + INIT_LIST_HEAD(&instance->cmd_pool_list); +} + +/* + * alloc_space_for_mfi + */ +static int +alloc_space_for_mfi(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd; + size_t sz; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + /* reserve 1 more slot for flush_cache */ + sz = sizeof (struct drsas_cmd *) * (max_cmd+1); + + /* + * instance->cmd_list is an array of struct drsas_cmd pointers. + * Allocate the dynamic array first and then allocate individual + * commands. + */ + instance->cmd_list = kmem_zalloc(sz, KM_SLEEP); + ASSERT(instance->cmd_list); + + for (i = 0; i < max_cmd+1; i++) { + instance->cmd_list[i] = kmem_zalloc(sizeof (struct drsas_cmd), + KM_SLEEP); + ASSERT(instance->cmd_list[i]); + } + + INIT_LIST_HEAD(&instance->cmd_pool_list); + + /* add all the commands to command pool (instance->cmd_pool) */ + for (i = 0; i < max_cmd; i++) { + cmd = instance->cmd_list[i]; + cmd->index = i; + + mlist_add_tail(&cmd->list, &instance->cmd_pool_list); + } + + /* single slot for flush_cache won't be added in command pool */ + cmd = instance->cmd_list[max_cmd]; + cmd->index = i; + + /* create a frame pool and assign one frame to each cmd */ + if (create_mfi_frame_pool(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + /* create a frame pool and assign one frame to each cmd */ + if (alloc_additional_dma_buffer(instance)) { + con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * get_ctrl_info + */ +static int +get_ctrl_info(struct drsas_instance *instance, + struct drsas_ctrl_info *ctrl_info) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + struct drsas_ctrl_info *ci; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + ci = (struct drsas_ctrl_info *)instance->internal_buf; + + if (!ci) { + con_log(CL_ANN, (CE_WARN, + "Failed to alloc mem for ctrl info")); + return_mfi_pkt(instance, cmd); + return (DDI_FAILURE); + } + + (void) memset(ci, 0, sizeof (struct drsas_ctrl_info)); + + /* for( i = 0; i < DCMD_MBOX_SZ; i++ ) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, 
&dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_ctrl_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->internal_buf_dmac_add); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_ctrl_info)); + + cmd->frame_count = 1; + + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)ctrl_info, (uint8_t *)ci, + sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR); + } else { + con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed")); + ret = -1; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = -1; + } + + return (ret); +} + +/* + * abort_aen_cmd + */ +static int +abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_abort_frame *abort_fr; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + abort_fr = &cmd->frame->abort; + + /* prepare and issue the abort frame */ + ddi_put8(cmd->frame_dma_obj.acc_handle, + &abort_fr->cmd, MFI_CMD_OP_ABORT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status, + MFI_CMD_STATUS_SYNC_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context, + cmd_to_abort->index); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_hi, 0); + + instance->aen_cmd->abort_aen = 1; + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "abort_aen_cmd: issue_cmd_in_sync_mode failed")); + ret = -1; + } else { + ret = 0; + } + + instance->aen_cmd->abort_aen = 1; + instance->aen_cmd = 0; + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + return (ret); +} + +/* + * init_mfi + */ +static int +init_mfi(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd; + struct drsas_ctrl_info ctrl_info; + struct drsas_init_frame *init_frame; + struct drsas_init_queue_info *initq_info; + + /* we expect the FW state to be READY */ + if (mfi_state_transition_to_ready(instance)) { + con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready")); + goto fail_ready_state; + } + + /* get various operational parameters from status register */ + instance->max_num_sge = + (instance->func_ptr->read_fw_status_reg(instance) & + 0xFF0000) >> 0x10; + /* + * Reduce the max supported cmds by 1. 
This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + instance->max_fw_cmds = + instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; + instance->max_fw_cmds = instance->max_fw_cmds - 1; + + instance->max_num_sge = + (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ? + DRSAS_MAX_SGE_CNT : instance->max_num_sge; + + /* create a pool of commands */ + if (alloc_space_for_mfi(instance) != DDI_SUCCESS) + goto fail_alloc_fw_space; + + /* + * Prepare a init frame. Note the init frame points to queue info + * structure. Each frame has SGL allocated after first 64 bytes. For + * this frame - since we don't need any SGL - we use SGL's space as + * queue info structure + */ + cmd = get_mfi_pkt(instance); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + init_frame = (struct drsas_init_frame *)cmd->frame; + initq_info = (struct drsas_init_queue_info *) + ((unsigned long)init_frame + 64); + + (void) memset(init_frame, 0, MRMFI_FRAME_SIZE); + (void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info)); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_entries, instance->max_fw_cmds + 1); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8); + + ddi_put8(cmd->frame_dma_obj.acc_handle, + &init_frame->cmd, MFI_CMD_OP_INIT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_lo, + cmd->frame_phys_addr + 64); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_hi, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len, + sizeof (struct drsas_init_queue_info)); + + cmd->frame_count = 1; + + /* issue the init frame in polled mode */ + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "failed to init firmware")); + goto fail_fw_init; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + goto fail_fw_init; + } + + /* gather misc FW related information */ + if (!get_ctrl_info(instance, &ctrl_info)) { + instance->max_sectors_per_req = ctrl_info.max_request_size; + con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d", + ctrl_info.product_name, ctrl_info.ld_present_count)); + } else { + instance->max_sectors_per_req = instance->max_num_sge * + PAGESIZE / 512; + } + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + goto 
fail_fw_init; + } + + return (DDI_SUCCESS); + +fail_fw_init: +fail_alloc_fw_space: + + free_space_for_mfi(instance); + +fail_ready_state: + ddi_regs_map_free(&instance->regmap_handle); + +fail_mfi_reg_setup: + return (DDI_FAILURE); +} + +/* + * mfi_state_transition_to_ready : Move the FW to READY state + * + * @reg_set : MFI register set + */ +static int +mfi_state_transition_to_ready(struct drsas_instance *instance) +{ + int i; + uint8_t max_wait; + uint32_t fw_ctrl; + uint32_t fw_state; + uint32_t cur_state; + + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK; + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW state = 0x%x", fw_state)); + + while (fw_state != MFI_STATE_READY) { + con_log(CL_ANN, (CE_NOTE, + "mfi_state_transition_to_ready:FW state%x", fw_state)); + + switch (fw_state) { + case MFI_STATE_FAULT: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW in FAULT state!!")); + + return (ENODEV); + case MFI_STATE_WAIT_HANDSHAKE: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW waiting for HANDSHAKE")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */ + WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + + max_wait = 2; + cur_state = MFI_STATE_WAIT_HANDSHAKE; + break; + case MFI_STATE_BOOT_MESSAGE_PENDING: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW state boot message pending")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); + + max_wait = 10; + cur_state = MFI_STATE_BOOT_MESSAGE_PENDING; + break; + case MFI_STATE_OPERATIONAL: + /* bring it to READY state; assuming max wait 2 secs */ + instance->func_ptr->disable_intr(instance); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: FW in OPERATIONAL state")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT) + * to be set + */ + /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */ + WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + + max_wait = 10; + cur_state = MFI_STATE_OPERATIONAL; + break; + case MFI_STATE_UNDEFINED: + /* this state should not last for more than 2 seconds */ + con_log(CL_ANN, (CE_NOTE, "FW state undefined")); + + max_wait = 2; + cur_state = MFI_STATE_UNDEFINED; + break; + case MFI_STATE_BB_INIT: + max_wait = 2; + cur_state = MFI_STATE_BB_INIT; + break; + case MFI_STATE_FW_INIT: + max_wait = 2; + cur_state = MFI_STATE_FW_INIT; + break; + case MFI_STATE_DEVICE_SCAN: + max_wait = 10; + cur_state = MFI_STATE_DEVICE_SCAN; + break; + default: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: Unknown state 0x%x", fw_state)); + return (ENODEV); + } + + /* the cur_state should not last for more than max_wait secs */ + for (i = 0; i < (max_wait * MILLISEC); i++) { + /* fw_state = RD_OB_MSG_0(instance) & MFI_STATE_MASK; */ + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & + MFI_STATE_MASK; + + if (fw_state == cur_state) { + delay(1 * drv_usectohz(MILLISEC)); + } else { + break; + } + } + + /* return error if fw_state hasn't changed after max_wait */ + if (fw_state == cur_state) { + con_log(CL_ANN, (CE_NOTE, + "FW state hasn't changed in %d secs", max_wait)); + return (ENODEV); + } + }; + + fw_ctrl = RD_IB_DOORBELL(instance); + + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + + /* + * Write 0xF to the doorbell register to do 
the following. + * - Abort all outstanding commands (bit 0). + * - Transition from OPERATIONAL to READY state (bit 1). + * - Discard (possible) low MFA posted in 64-bit mode (bit-2). + * - Set to release FW to continue running (i.e. BIOS handshake + * (bit 3). + */ + WR_IB_DOORBELL(0xF, instance); + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + return (ENODEV); + } + return (DDI_SUCCESS); +} + +/* + * get_seq_num + */ +static int +get_seq_num(struct drsas_instance *instance, + struct drsas_evt_log_info *eli) +{ + int ret = DDI_SUCCESS; + + dma_obj_t dcmd_dma_obj; + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + cmn_err(CE_WARN, "dr_sas: failed to get a cmd"); + return (ENOMEM); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info); + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "get_seq_num: could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + (void) memset(dcmd_dma_obj.buffer, 0, + sizeof (struct drsas_evt_log_info)); + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + dcmd_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + cmn_err(CE_WARN, "get_seq_num: " + "failed to issue DRSAS_DCMD_CTRL_EVENT_GET_INFO"); + ret = DDI_FAILURE; + } else { + /* copy the data back into callers buffer */ + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)eli, + (uint8_t *)dcmd_dma_obj.buffer, + sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR); + ret = DDI_SUCCESS; + } + + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + ret = DDI_FAILURE; + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = DDI_FAILURE; + } + return (ret); +} + +/* + * start_mfi_aen + */ +static int +start_mfi_aen(struct drsas_instance *instance) +{ + int ret = 0; + + struct drsas_evt_log_info eli; + union drsas_evt_class_locale class_locale; + + /* get the latest sequence number from FW */ + (void) memset(&eli, 0, sizeof (struct drsas_evt_log_info)); + + if (get_seq_num(instance, &eli)) { + cmn_err(CE_WARN, 
"start_mfi_aen: failed to get seq num"); + return (-1); + } + + /* register AEN with FW for latest sequence number plus 1 */ + class_locale.members.reserved = 0; + class_locale.members.locale = DR_EVT_LOCALE_ALL; + class_locale.members.class = DR_EVT_CLASS_INFO; + ret = register_mfi_aen(instance, eli.newest_seq_num + 1, + class_locale.word); + + if (ret) { + cmn_err(CE_WARN, "start_mfi_aen: aen registration failed"); + return (-1); + } + + return (ret); +} + +/* + * flush_cache + */ +static void +flush_cache(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd = NULL; + struct drsas_dcmd_frame *dcmd; + uint32_t max_cmd = instance->max_fw_cmds; + + cmd = instance->cmd_list[max_cmd]; + + if (cmd == NULL) + return; + + dcmd = &cmd->frame->dcmd; + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 0); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_NONE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_CACHE_FLUSH); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.b[0], + DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE); + + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN1, (CE_WARN, + "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH")); + } + con_log(CL_DLEVEL1, (CE_NOTE, "done")); +} + +/* + * service_mfi_aen- Completes an AEN command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + */ +static void +service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + uint32_t seq_num; + struct drsas_evt_detail *evt_detail = + (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer; + int rval = 0; + int tgt = 0; + ddi_acc_handle_t acc_handle; + + acc_handle = cmd->frame_dma_obj.acc_handle; + + cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status); + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + /* + * log the MFI AEN event to the sysevent queue so that + * application will get noticed + */ + if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS", + NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) { + int instance_no = ddi_get_instance(instance->dip); + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to log AEN event", instance_no)); + } + /* + * Check for any ld devices that has changed state. i.e. online + * or offline. 
+ */ + con_log(CL_ANN1, (CE_NOTE, + "AEN: code = %x class = %x locale = %x args = %x", + ddi_get32(acc_handle, &evt_detail->code), + evt_detail->cl.members.class, + ddi_get16(acc_handle, &evt_detail->cl.members.locale), + ddi_get8(acc_handle, &evt_detail->arg_type))); + + switch (ddi_get32(acc_handle, &evt_detail->code)) { + case DR_EVT_CFG_CLEARED: { + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + if (instance->dr_ld_list[tgt].dip != NULL) { + rval = drsas_service_evt(instance, tgt, 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, + "dr_sas: CFG CLEARED AEN rval = %d " + "tgt id = %d", rval, tgt)); + } + } + break; + } + + case DR_EVT_LD_DELETED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_UNCONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_DELETED */ + + case DR_EVT_LD_CREATED: { + rval = drsas_service_evt(instance, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0, + DRSAS_EVT_CONFIG_TGT, NULL); + con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d " + "tgt id = %d index = %d", rval, + ddi_get16(acc_handle, &evt_detail->args.ld.target_id), + ddi_get8(acc_handle, &evt_detail->args.ld.ld_index))); + break; + } /* End of DR_EVT_LD_CREATED */ + } /* End of Main Switch */ + + /* get copy of seq_num and class/locale for re-registration */ + seq_num = ddi_get32(acc_handle, &evt_detail->seq_num); + seq_num++; + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0); + ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num); + + instance->aen_seq_num = seq_num; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + instance->func_ptr->issue_cmd(cmd, instance); +} + +/* + * complete_cmd_in_sync_mode - Completes an internal command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + * The issue_cmd_in_sync_mode() function waits for a command to complete + * after it issues a command. This function wakes up that waiting routine by + * calling wake_up() on the wait queue. 
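complete_cmd_in_sync_mode() below wakes the waiter by broadcasting on int_cmd_cv rather than a Linux-style wait queue; the waiting side in issue_cmd_in_sync_mode() is not shown in this hunk but presumably blocks on the same condition variable. A standalone, user-level sketch of that handshake, using pthreads in place of the kernel cv_* primitives:

#include <pthread.h>
#include <stdio.h>

/* user-level stand-ins for the driver's mutex, int_cmd_cv and cmd_status */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
static int cmd_done;

static void *
completion_path(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	cmd_done = 1;			/* like recording cmd_status */
	pthread_cond_broadcast(&done_cv);
	pthread_mutex_unlock(&lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, completion_path, NULL);

	pthread_mutex_lock(&lock);
	while (!cmd_done)		/* the sync-mode issuer's wait */
		pthread_cond_wait(&done_cv, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	printf("command completed\n");
	return (0);
}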
+ */ +static void +complete_cmd_in_sync_mode(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.cmd_status); + + cmd->sync_cmd = DRSAS_FALSE; + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + cv_broadcast(&instance->int_cmd_cv); +} + +/* + * drsas_softintr - The Software ISR + * @param arg : HBA soft state + * + * called from high-level interrupt if hi-level interrupt are not there, + * otherwise triggered as a soft interrupt + */ +static uint_t +drsas_softintr(struct drsas_instance *instance) +{ + struct scsi_pkt *pkt; + struct scsa_cmd *acmd; + struct drsas_cmd *cmd; + struct mlist_head *pos, *next; + mlist_t process_list; + struct drsas_header *hdr; + struct scsi_arq_status *arqstat; + + con_log(CL_ANN1, (CE_CONT, "drsas_softintr called")); + + ASSERT(instance); + mutex_enter(&instance->completed_pool_mtx); + + if (mlist_empty(&instance->completed_pool_list)) { + mutex_exit(&instance->completed_pool_mtx); + return (DDI_INTR_UNCLAIMED); + } + + instance->softint_running = 1; + + INIT_LIST_HEAD(&process_list); + mlist_splice(&instance->completed_pool_list, &process_list); + INIT_LIST_HEAD(&instance->completed_pool_list); + + mutex_exit(&instance->completed_pool_mtx); + + /* perform all callbacks first, before releasing the SCBs */ + mlist_for_each_safe(pos, next, &process_list) { + cmd = mlist_entry(pos, struct drsas_cmd, list); + + /* syncronize the Cmd frame for the controller */ + (void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + hdr = &cmd->frame->hdr; + + /* remove the internal command from the process list */ + mlist_del_init(&cmd->list); + + switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) { + case MFI_CMD_OP_PD_SCSI: + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_LD_READ: + case MFI_CMD_OP_LD_WRITE: + /* + * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI + * could have been issued either through an + * IO path or an IOCTL path. If it was via IOCTL, + * we will send it to internal completion. + */ + if (cmd->sync_cmd == DRSAS_TRUE) { + complete_cmd_in_sync_mode(instance, cmd); + break; + } + + /* regular commands */ + acmd = cmd->cmd; + pkt = CMD2PKT(acmd); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, + acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA | STATE_GOT_STATUS; + + con_log(CL_ANN1, (CE_CONT, + "CDB[0] = %x completed for %s: size %lx context %x", + pkt->pkt_cdbp[0], ((acmd->islogical) ? 
"LD" : "PD"), + acmd->cmd_dmacount, hdr->context)); + + if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) { + struct scsi_inquiry *inq; + + if (acmd->cmd_dmacount != 0) { + bp_mapin(acmd->cmd_buf); + inq = (struct scsi_inquiry *) + acmd->cmd_buf->b_un.b_addr; + + /* don't expose physical drives to OS */ + if (acmd->islogical && + (hdr->cmd_status == MFI_STAT_OK)) { + display_scsi_inquiry( + (caddr_t)inq); + } else if ((hdr->cmd_status == + MFI_STAT_OK) && inq->inq_dtype == + DTYPE_DIRECT) { + + display_scsi_inquiry( + (caddr_t)inq); + + /* for physical disk */ + hdr->cmd_status = + MFI_STAT_DEVICE_NOT_FOUND; + } + } + } + + switch (hdr->cmd_status) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_CC_IN_PROGRESS: + case MFI_STAT_LD_RECON_IN_PROGRESS: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_INIT_IN_PROGRESS: + con_log(CL_ANN, + (CE_WARN, "Initialization in Progress")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + case MFI_STAT_SCSI_DONE_WITH_ERROR: + con_log(CL_ANN1, (CE_CONT, "scsi_done error")); + + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { + + con_log(CL_ANN, + (CE_WARN, "TEST_UNIT_READY fail")); + + } else { + pkt->pkt_state |= STATE_ARQ_DONE; + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= + STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + ddi_rep_get8( + cmd->frame_dma_obj.acc_handle, + (uint8_t *) + &(arqstat->sts_sensedata), + cmd->sense, + acmd->cmd_scblen - + offsetof(struct scsi_arq_status, + sts_sensedata), DDI_DEV_AUTOINCR); + } + break; + case MFI_STAT_LD_OFFLINE: + case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN1, (CE_CONT, + "device not found error")); + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + case MFI_STAT_LD_LBA_OUT_OF_RANGE: + pkt->pkt_state |= STATE_ARQ_DONE; + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + + arqstat->sts_sensedata.es_valid = 1; + arqstat->sts_sensedata.es_key = + KEY_ILLEGAL_REQUEST; + arqstat->sts_sensedata.es_class = + CLASS_EXTENDED_SENSE; + + /* + * LOGICAL BLOCK ADDRESS OUT OF RANGE: + * ASC: 0x21h; ASCQ: 0x00h; + */ + arqstat->sts_sensedata.es_add_code = 0x21; + arqstat->sts_sensedata.es_qual_code = 0x00; + + break; + + default: + con_log(CL_ANN, (CE_CONT, "Unknown status!")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + } + + atomic_add_16(&instance->fw_outstanding, (-1)); + + return_mfi_pkt(instance, cmd); + + (void) drsas_common_check(instance, cmd); + + if (acmd->cmd_dmahandle) { + if (drsas_check_dma_handle( + acmd->cmd_dmahandle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + } + + /* Call the callback routine */ + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + break; + case MFI_CMD_OP_SMP: + case MFI_CMD_OP_STP: + complete_cmd_in_sync_mode(instance, cmd); + break; + case MFI_CMD_OP_DCMD: + /* see if got an event 
notification */ + if (ddi_get32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->dcmd.opcode) == + DR_DCMD_CTRL_EVENT_WAIT) { + if ((instance->aen_cmd == cmd) && + (instance->aen_cmd->abort_aen)) { + con_log(CL_ANN, (CE_WARN, + "drsas_softintr: " + "aborted_aen returned")); + } else { + atomic_add_16(&instance->fw_outstanding, + (-1)); + service_mfi_aen(instance, cmd); + } + } else { + complete_cmd_in_sync_mode(instance, cmd); + } + + break; + case MFI_CMD_OP_ABORT: + con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete")); + /* + * MFI_CMD_OP_ABORT successfully completed + * in the synchronous mode + */ + complete_cmd_in_sync_mode(instance, cmd); + break; + default: + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + if (cmd->pkt != NULL) { + pkt = cmd->pkt; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + } + con_log(CL_ANN, (CE_WARN, "Cmd type unknown !")); + break; + } + } + + instance->softint_running = 0; + + return (DDI_INTR_CLAIMED); +} + +/* + * drsas_alloc_dma_obj + * + * Allocate the memory and other resources for an dma object. + */ +static int +drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj, + uchar_t endian_flags) +{ + int i; + size_t alen = 0; + uint_t cookie_cnt; + struct ddi_device_acc_attr tmp_endian_attr; + + tmp_endian_attr = endian_attr; + tmp_endian_attr.devacc_attr_endian_flags = endian_flags; + + i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr, + DDI_DMA_SLEEP, NULL, &obj->dma_handle); + if (i != DDI_SUCCESS) { + + switch (i) { + case DDI_DMA_BADATTR : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- Bad attribute")); + break; + case DDI_DMA_NORESOURCES : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- No Resources")); + break; + default : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle: " + "unknown status %d", i)); + break; + } + + return (-1); + } + + if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr, + DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, + &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) || + alen < obj->size) { + + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc")); + + return (-1); + } + + if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer, + obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, + NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) { + + ddi_dma_mem_free(&obj->acc_handle); + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle")); + + return (-1); + } + + if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + if (drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + return (cookie_cnt); +} + +/* + * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t) + * + * De-allocate the memory and other resources for an dma object, which must + * have been alloated by a previous call to drsas_alloc_dma_obj() + */ +static int +drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj) +{ + + if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + if (drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, 
DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + (void) ddi_dma_unbind_handle(obj.dma_handle); + ddi_dma_mem_free(&obj.acc_handle); + ddi_dma_free_handle(&obj.dma_handle); + + return (DDI_SUCCESS); +} + +/* + * drsas_dma_alloc(instance_t *, struct scsi_pkt *, struct buf *, + * int, int (*)()) + * + * Allocate dma resources for a new scsi command + */ +static int +drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp, int flags, int (*callback)()) +{ + int dma_flags; + int (*cb)(caddr_t); + int i; + + ddi_dma_attr_t tmp_dma_attr = drsas_generic_dma_attr; + struct scsa_cmd *acmd = PKT2CMD(pkt); + + acmd->cmd_buf = bp; + + if (bp->b_flags & B_READ) { + acmd->cmd_flags &= ~CFLAG_DMASEND; + dma_flags = DDI_DMA_READ; + } else { + acmd->cmd_flags |= CFLAG_DMASEND; + dma_flags = DDI_DMA_WRITE; + } + + if (flags & PKT_CONSISTENT) { + acmd->cmd_flags |= CFLAG_CONSISTENT; + dma_flags |= DDI_DMA_CONSISTENT; + } + + if (flags & PKT_DMA_PARTIAL) { + dma_flags |= DDI_DMA_PARTIAL; + } + + dma_flags |= DDI_DMA_REDZONE; + + cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP; + + tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge; + tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull; + + if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr, + cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) { + switch (i) { + case DDI_DMA_BADATTR: + bioerror(bp, EFAULT); + return (DDI_FAILURE); + + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + return (DDI_FAILURE); + + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: " + "impossible result (0x%x)", i)); + bioerror(bp, EFAULT); + return (DDI_FAILURE); + } + } + + i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags, + cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies); + + switch (i) { + case DDI_DMA_PARTIAL_MAP: + if ((dma_flags & DDI_DMA_PARTIAL) == 0) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "DDI_DMA_PARTIAL_MAP impossible")); + goto no_dma_cookies; + } + + if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed")); + goto no_dma_cookies; + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + + con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed")); + goto no_dma_cookies; + } + + goto get_dma_cookies; + case DDI_DMA_MAPPED: + acmd->cmd_nwin = 1; + acmd->cmd_dma_len = 0; + acmd->cmd_dma_offset = 0; + +get_dma_cookies: + i = 0; + acmd->cmd_dmacount = 0; + for (;;) { + acmd->cmd_dmacount += + acmd->cmd_dmacookies[i++].dmac_size; + + if (i == instance->max_num_sge || + i == acmd->cmd_ncookies) + break; + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookie = i; + acmd->cmd_cookiecnt = i; + + acmd->cmd_flags |= CFLAG_DMAVALID; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + break; + case DDI_DMA_NOMAPPING: + bioerror(bp, EFAULT); + break; + case DDI_DMA_TOOBIG: + bioerror(bp, EINVAL); + break; + case DDI_DMA_INUSE: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:" + " DDI_DMA_INUSE impossible")); + break; + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "impossible result (0x%x)", i)); + break; + } + +no_dma_cookies: + 
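The get_dma_cookies loop above caps the scatter/gather list at instance->max_num_sge and leaves whatever it could not map in pkt->pkt_resid; drsas_dma_move() repeats the same accounting for each further DMA window. Below is a minimal user-space model (not part of this patch) of that accounting, in plain C with no DDI calls; the segment sizes and sgl_limit are made-up stand-ins for the DMA cookies and max_num_sge.

#include <stdio.h>
#include <stddef.h>

/*
 * Model of the cookie accounting in drsas_dma_alloc()/drsas_dma_move():
 * walk at most sgl_limit segments, sum their sizes into "dmacount",
 * and report the unmapped remainder the way pkt_resid is computed.
 */
static size_t
map_window(const size_t *seg_sizes, size_t nsegs, size_t sgl_limit,
    size_t *consumed)
{
	size_t dmacount = 0;
	size_t i;

	for (i = 0; i < nsegs && i < sgl_limit; i++)
		dmacount += seg_sizes[i];	/* dmac_size of cookie i */

	*consumed = i;		/* cookies placed in this S/G list */
	return (dmacount);	/* bytes covered by this window */
}

int
main(void)
{
	/* made-up cookie sizes for one buffer */
	const size_t segs[] = { 4096, 4096, 8192, 4096, 4096 };
	const size_t nsegs = sizeof (segs) / sizeof (segs[0]);
	const size_t sgl_limit = 3;	/* stand-in for max_num_sge */
	const size_t b_bcount = 24576;	/* total I/O size */
	size_t used, dmacount, resid;

	dmacount = map_window(segs, nsegs, sgl_limit, &used);
	resid = (b_bcount >= dmacount) ? b_bcount - dmacount : 0;

	(void) printf("cookies used %zu, mapped %zu, pkt_resid %zu\n",
	    used, dmacount, resid);
	return (0);
}

Running it prints "cookies used 3, mapped 16384, pkt_resid 8192", matching the path where b_bcount exceeds cmd_dmacount and the residual is handed back for a later window.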
ddi_dma_free_handle(&acmd->cmd_dmahandle); + acmd->cmd_dmahandle = NULL; + acmd->cmd_flags &= ~CFLAG_DMAVALID; + return (DDI_FAILURE); +} + +/* + * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *) + * + * move dma resources to next dma window + * + */ +static int +drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp) +{ + int i = 0; + + struct scsa_cmd *acmd = PKT2CMD(pkt); + + /* + * If there are no more cookies remaining in this window, + * must move to the next window first. + */ + if (acmd->cmd_cookie == acmd->cmd_ncookies) { + if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) { + return (DDI_SUCCESS); + } + + /* at last window, cannot move */ + if (++acmd->cmd_curwin >= acmd->cmd_nwin) { + return (DDI_FAILURE); + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + return (DDI_FAILURE); + } + + acmd->cmd_cookie = 0; + } else { + /* still more cookies in this window - get the next one */ + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[0]); + } + + /* get remaining cookies in this window, up to our maximum */ + for (;;) { + acmd->cmd_dmacount += acmd->cmd_dmacookies[i++].dmac_size; + acmd->cmd_cookie++; + + if (i == instance->max_num_sge || + acmd->cmd_cookie == acmd->cmd_ncookies) { + break; + } + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookiecnt = i; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); +} + +/* + * build_cmd + */ +static struct drsas_cmd * +build_cmd(struct drsas_instance *instance, struct scsi_address *ap, + struct scsi_pkt *pkt, uchar_t *cmd_done) +{ + uint16_t flags = 0; + uint32_t i; + uint32_t context __unused; + uint32_t sge_bytes; + ddi_acc_handle_t acc_handle; + struct drsas_cmd *cmd; + struct drsas_sge64 *mfi_sgl; + struct scsa_cmd *acmd = PKT2CMD(pkt); + struct drsas_pthru_frame *pthru; + struct drsas_io_frame *ldio; + + /* find out if this is logical or physical drive command. 
*/ + acmd->islogical = MRDRV_IS_LOGICAL(ap); + acmd->device_id = MAP_DEVICE_ID(instance, ap); + *cmd_done = 0; + + /* get the command packet */ + if (!(cmd = get_mfi_pkt(instance))) { + return (NULL); + } + + acc_handle = cmd->frame_dma_obj.acc_handle; + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index); + + cmd->pkt = pkt; + cmd->cmd = acmd; + + /* lets get the command directions */ + if (acmd->cmd_flags & CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_WRITE; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORDEV); + } + } else if (acmd->cmd_flags & ~CFLAG_DMASEND) { + flags = MFI_FRAME_DIR_READ; + + if (acmd->cmd_flags & CFLAG_CONSISTENT) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, + acmd->cmd_dma_offset, acmd->cmd_dma_len, + DDI_DMA_SYNC_FORCPU); + } + } else { + flags = MFI_FRAME_DIR_NONE; + } + + flags |= MFI_FRAME_SGL64; + + switch (pkt->pkt_cdbp[0]) { + + /* + * case SCMD_SYNCHRONIZE_CACHE: + * flush_cache(instance); + * return_mfi_pkt(instance, cmd); + * *cmd_done = 1; + * + * return (NULL); + */ + + case SCMD_READ: + case SCMD_WRITE: + case SCMD_READ_G1: + case SCMD_WRITE_G1: + if (acmd->islogical) { + ldio = (struct drsas_io_frame *)cmd->frame; + + /* + * preare the Logical IO frame: + * 2nd bit is zero for all read cmds + */ + ddi_put8(acc_handle, &ldio->cmd, + (pkt->pkt_cdbp[0] & 0x02) ? MFI_CMD_OP_LD_WRITE + : MFI_CMD_OP_LD_READ); + ddi_put8(acc_handle, &ldio->cmd_status, 0x0); + ddi_put8(acc_handle, &ldio->scsi_status, 0x0); + ddi_put8(acc_handle, &ldio->target_id, acmd->device_id); + ddi_put16(acc_handle, &ldio->timeout, 0); + ddi_put8(acc_handle, &ldio->reserved_0, 0); + ddi_put16(acc_handle, &ldio->pad_0, 0); + ddi_put16(acc_handle, &ldio->flags, flags); + + /* Initialize sense Information */ + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + ddi_put32(acc_handle, &ldio->start_lba_hi, 0); + ddi_put8(acc_handle, &ldio->access_byte, + (acmd->cmd_cdblen != 6) ? 
pkt->pkt_cdbp[1] : 0); + ddi_put8(acc_handle, &ldio->sge_count, + acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&ldio->sgl; + + context = ddi_get32(acc_handle, &ldio->context); + + if (acmd->cmd_cdblen == CDB_GROUP0) { + ddi_put32(acc_handle, &ldio->lba_count, ( + (uint16_t)(pkt->pkt_cdbp[4]))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[3])) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 8) | + ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F) + << 16))); + } else if (acmd->cmd_cdblen == CDB_GROUP1) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[8])) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 8))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP2) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[9])) | + ((uint16_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } else if (acmd->cmd_cdblen == CDB_GROUP3) { + ddi_put32(acc_handle, &ldio->lba_count, ( + ((uint16_t)(pkt->pkt_cdbp[13])) | + ((uint16_t)(pkt->pkt_cdbp[12]) << 8) | + ((uint16_t)(pkt->pkt_cdbp[11]) << 16) | + ((uint16_t)(pkt->pkt_cdbp[10]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[9])) | + ((uint32_t)(pkt->pkt_cdbp[8]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[7]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[6]) << 24))); + + ddi_put32(acc_handle, &ldio->start_lba_lo, ( + ((uint32_t)(pkt->pkt_cdbp[5])) | + ((uint32_t)(pkt->pkt_cdbp[4]) << 8) | + ((uint32_t)(pkt->pkt_cdbp[3]) << 16) | + ((uint32_t)(pkt->pkt_cdbp[2]) << 24))); + } + + break; + } + /* fall through */ + default: + + switch (pkt->pkt_cdbp[0]) { + case SCMD_MODE_SENSE: + case SCMD_MODE_SENSE_G1: { + union scsi_cdb *cdbp; + uint16_t page_code; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0]; + switch (page_code) { + case 0x3: + case 0x4: + (void) drsas_mode_sense_build(pkt); + return_mfi_pkt(instance, cmd); + *cmd_done = 1; + return (NULL); + } + break; + } + default: + break; + } + + pthru = (struct drsas_pthru_frame *)cmd->frame; + + /* prepare the DCDB frame */ + ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ? 
+ MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI); + ddi_put8(acc_handle, &pthru->cmd_status, 0x0); + ddi_put8(acc_handle, &pthru->scsi_status, 0x0); + ddi_put8(acc_handle, &pthru->target_id, acmd->device_id); + ddi_put8(acc_handle, &pthru->lun, 0); + ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen); + ddi_put16(acc_handle, &pthru->timeout, 0); + ddi_put16(acc_handle, &pthru->flags, flags); + ddi_put32(acc_handle, &pthru->data_xfer_len, + acmd->cmd_dmacount); + ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&pthru->sgl; + + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + + context = ddi_get32(acc_handle, &pthru->context); + ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp, + (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + break; + } +#ifdef lint + context = context; +#endif + /* prepare the scatter-gather list for the firmware */ + for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) { + ddi_put64(acc_handle, &mfi_sgl->phys_addr, + acmd->cmd_dmacookies[i].dmac_laddress); + ddi_put32(acc_handle, &mfi_sgl->length, + acmd->cmd_dmacookies[i].dmac_size); + } + + sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt; + + cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) + + ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1; + + if (cmd->frame_count >= 8) { + cmd->frame_count = 8; + } + + return (cmd); +} + +/* + * issue_mfi_pthru + */ +static int +issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint_t model; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + dma_obj_t pthru_dma_obj; + struct drsas_pthru_frame *kpthru; + struct drsas_pthru_frame *pthru; + int i; + pthru = &cmd->frame->pthru; + kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + + xferlen = kpthru->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + xferlen = kpthru->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64")); + xferlen = kpthru->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr; +#endif + } + + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + pthru_dma_obj.size = xferlen; + pthru_dma_obj.dma_attr = drsas_generic_dma_attr; + pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_sgllen = 1; + pthru_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &pthru_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kpthru->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf+i, + (uint8_t 
*)pthru_dma_obj.buffer+i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd); + ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len); + ddi_put8(acc_handle, &pthru->cmd_status, 0); + ddi_put8(acc_handle, &pthru->scsi_status, 0); + ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id); + ddi_put8(acc_handle, &pthru->lun, kpthru->lun); + ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len); + ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count); + ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout); + ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len); + + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */ + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); + + ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb, + pthru->cdb_len, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru: fw_ioctl failed")); + } else { + if (xferlen && kpthru->flags & MFI_FRAME_DIR_READ) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)pthru_dma_obj.buffer+i, + (uint8_t *)ubuf+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status); + kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status); + + con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, " + "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status)); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_dcmd + */ +static int +issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint32_t model; + dma_obj_t dcmd_dma_obj; + struct drsas_dcmd_frame *kdcmd; + struct drsas_dcmd_frame *dcmd; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + dcmd = &cmd->frame->dcmd; + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = xferlen; + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + 
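The xferlen/ubuf extraction a few lines up has to honour the caller's data model because the ioctl frame's SGL is a union of 32-bit and 64-bit entries, and reading the wrong member misinterprets both the address and the length. The stand-alone C sketch below (not part of this patch) shows the hazard; the sge32/sge64 layouts are plausible assumptions for illustration, not the exact dr_sas definitions, and the output described assumes a little-endian machine.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* assumed layouts, for illustration only */
struct sge32 {
	uint32_t	phys_addr;
	uint32_t	length;
};

struct sge64 {
	uint64_t	phys_addr;
	uint32_t	length;
};

union sgl {
	struct sge32	sge32[1];
	struct sge64	sge64[1];
};

int
main(void)
{
	union sgl u;

	(void) memset(&u, 0, sizeof (u));

	/* a 32-bit caller fills the first 32-bit entry */
	u.sge32[0].phys_addr = 0x12345678;
	u.sge32[0].length = 4096;

	/* the 64-bit view folds address and length into one bogus address */
	(void) printf("sge32 view: addr 0x%x len %u\n",
	    (unsigned)u.sge32[0].phys_addr, (unsigned)u.sge32[0].length);
	(void) printf("sge64 view: addr 0x%llx len %u\n",
	    (unsigned long long)u.sge64[0].phys_addr,
	    (unsigned)u.sge64[0].length);
	return (0);
}

This mismatch is why every issue_mfi_* routine checks ddi_model_convert_from(mode & FMODELS) and carries the #ifdef _ILP32 branches before it trusts the user-supplied SGE.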
dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kdcmd->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf + i, + (uint8_t *)dcmd_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd); + ddi_put8(acc_handle, &dcmd->cmd_status, 0); + ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count); + ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout); + ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len); + ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode); + + ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b, + (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed")); + } else { + if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)dcmd_dma_obj.buffer + i, + (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_smp + */ +static int +issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *request_ubuf; + void *response_ubuf; + uint32_t request_xferlen = 0; + uint32_t response_xferlen = 0; + uint_t model; + dma_obj_t request_dma_obj; + dma_obj_t response_dma_obj; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + struct drsas_smp_frame *ksmp; + struct drsas_smp_frame *smp; + struct drsas_sge32 *sge32; +#ifndef _ILP32 + struct drsas_sge64 *sge64; +#endif + int i; + uint64_t tmp_sas_addr; + + smp = &cmd->frame->smp; + ksmp = (struct drsas_smp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, 
request_ubuf)); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, request_ubuf)); +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64")); + + sge64 = &ksmp->sgl[0].sge64[0]; + response_xferlen = sge64[0].length; + request_xferlen = sge64[1].length; + + response_ubuf = (void *)(ulong_t)sge64[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge64[1].phys_addr; +#endif + } + if (request_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + request_dma_obj.size = request_xferlen; + request_dma_obj.dma_attr = drsas_generic_dma_attr; + request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_sgllen = 1; + request_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &request_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyin((uint8_t *)request_ubuf + i, + (uint8_t *)request_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + response_dma_obj.size = response_xferlen; + response_dma_obj.dma_attr = drsas_generic_dma_attr; + response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_sgllen = 1; + response_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &response_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyin((uint8_t *)response_ubuf + i, + (uint8_t *)response_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &smp->cmd, ksmp->cmd); + ddi_put8(acc_handle, &smp->cmd_status, 0); + ddi_put8(acc_handle, &smp->connection_status, 0); + ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count); + /* smp->context = ksmp->context; */ + ddi_put16(acc_handle, &smp->timeout, ksmp->timeout); + ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len); + + bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr, + sizeof (uint64_t)); + ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr); + + ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64); + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { 
+ con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + sge32 = &smp->sgl[0].sge32[0]; + ddi_put32(acc_handle, &sge32[0].length, response_xferlen); + ddi_put32(acc_handle, &sge32[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge32[1].length, request_xferlen); + ddi_put32(acc_handle, &sge32[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#else + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: DDI_MODEL_LP64")); + sge64 = &smp->sgl[0].sge64[0]; + ddi_put32(acc_handle, &sge64[0].length, response_xferlen); + ddi_put64(acc_handle, &sge64[0].phys_addr, + response_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &sge64[1].length, request_xferlen); + ddi_put64(acc_handle, &sge64[1].phys_addr, + request_dma_obj.dma_cookie[0].dmac_address); +#endif + } + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : " + "smp->response_xferlen = %d, smp->request_xferlen = %d " + "smp->data_xfer_len = %d", ddi_get32(acc_handle, &sge32[0].length), + ddi_get32(acc_handle, &sge32[1].length), + ddi_get32(acc_handle, &smp->data_xfer_len))); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp: fw_ioctl failed")); + } else { + con_log(CL_ANN1, (CE_NOTE, + "issue_mfi_smp: copy to user space")); + + if (request_xferlen) { + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)request_dma_obj.buffer + + i, (uint8_t *)request_ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to user space" + " failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)response_dma_obj.buffer + + i, (uint8_t *)response_ubuf + + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_smp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + + ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status); + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d", + ddi_get8(acc_handle, &smp->cmd_status))); + + + if (request_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, request_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (response_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, response_dma_obj) != + DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_stp + */ +static int +issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *fis_ubuf; + void *data_ubuf; + uint32_t fis_xferlen = 0; + uint32_t data_xferlen = 0; + uint_t model; + dma_obj_t fis_dma_obj; + dma_obj_t data_dma_obj; + struct drsas_stp_frame *kstp; + struct drsas_stp_frame *stp; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + + stp = &cmd->frame->stp; + kstp = (struct drsas_stp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, 
(CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; + } + else + { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64")); + + fis_xferlen = kstp->sgl.sge64[0].length; + data_xferlen = kstp->sgl.sge64[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr; +#endif + } + + + if (fis_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: " + "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + fis_dma_obj.size = fis_xferlen; + fis_dma_obj.dma_attr = drsas_generic_dma_attr; + fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_sgllen = 1; + fis_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &fis_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyin((uint8_t *)fis_ubuf + i, + (uint8_t *)fis_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (data_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p " + "data_xferlen = %x", data_ubuf, data_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + data_dma_obj.size = data_xferlen; + data_dma_obj.dma_attr = drsas_generic_dma_attr; + data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_sgllen = 1; + data_dma_obj.dma_attr.dma_attr_align = 1; + +/* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &data_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyin((uint8_t *)data_ubuf + i, + (uint8_t *)data_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &stp->cmd, kstp->cmd); + ddi_put8(acc_handle, &stp->cmd_status, 0); + ddi_put8(acc_handle, &stp->connection_status, 0); + ddi_put8(acc_handle, &stp->target_id, kstp->target_id); + ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count); + + ddi_put16(acc_handle, &stp->timeout, kstp->timeout); + ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len); + + ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10, + DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64); + 
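This path, like the other ioctl paths, fixes cmd->frame_count at 1 further down because only two 32-bit SGEs are needed; the regular I/O path in build_cmd() instead derives frame_count from the size of the scatter/gather list and caps it at 8. A small C model of that arithmetic follows (not part of this patch); the 64-byte MFI frame size and 12-byte SGE64 size are assumptions chosen for illustration.

#include <stdio.h>

/*
 * Model of the frame_count computation in build_cmd(): the first MFI
 * frame holds the command header, and the scatter/gather list spills
 * into additional frames in MRMFI_FRAME_SIZE chunks, capped at 8.
 */
#define	MRMFI_FRAME_SIZE	64	/* assumed frame size */
#define	SGE64_SIZE		12	/* assumed sizeof (sge64) */
#define	MAX_FRAME_COUNT		8

static unsigned int
frame_count(unsigned int cookiecnt)
{
	unsigned int sge_bytes = SGE64_SIZE * cookiecnt;
	unsigned int count;

	count = (sge_bytes / MRMFI_FRAME_SIZE) +
	    ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1;
	if (count >= MAX_FRAME_COUNT)
		count = MAX_FRAME_COUNT;

	return (count);
}

int
main(void)
{
	unsigned int n;

	/* 1 cookie -> 2 frames, 16 -> 4 frames, 64 -> hits the cap of 8 */
	for (n = 1; n <= 64; n *= 4)
		(void) printf("%2u cookies -> %u frames\n",
		    n, frame_count(n));
	return (0);
}

With these numbers even a single-cookie transfer is counted as two frames: the header frame plus one spill-over chunk for the SGL.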
ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags); + ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[0].phys_addr, + fis_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr, + data_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed")); + } else { + + if (fis_xferlen) { + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)fis_dma_obj.buffer + i, + (uint8_t *)fis_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + if (data_xferlen) { + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)data_dma_obj.buffer + i, + (uint8_t *)data_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to" + " user space failed")); + return (DDI_FAILURE); + } + } + } + + kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status); + + if (fis_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (data_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * fill_up_drv_ver + */ +static void +fill_up_drv_ver(struct drsas_drv_ver *dv) +{ + (void) memset(dv, 0, sizeof (struct drsas_drv_ver)); + + (void) memcpy(dv->signature, "$LSI LOGIC$", strlen("$LSI LOGIC$")); + (void) memcpy(dv->os_name, "Solaris", strlen("Solaris")); + (void) memcpy(dv->drv_name, "dr_sas", strlen("dr_sas")); + (void) memcpy(dv->drv_ver, DRSAS_VERSION, strlen(DRSAS_VERSION)); + (void) memcpy(dv->drv_rel_date, DRSAS_RELDATE, + strlen(DRSAS_RELDATE)); +} + +/* + * handle_drv_ioctl + */ +static int +handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int i; + int rval = DDI_SUCCESS; + int *props = NULL; + void *ubuf; + + uint8_t *pci_conf_buf; + uint32_t xferlen; + uint32_t num_props; + uint_t model; + struct drsas_dcmd_frame *kdcmd; + struct drsas_drv_ver dv; + struct drsas_pci_information pi; + + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "dataBuf=%p size=%d bytes", ubuf, xferlen)); + + switch (kdcmd->opcode) { + case DRSAS_DRIVER_IOCTL_DRIVER_VERSION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_DRIVER_VERSION")); + + fill_up_drv_ver(&dv); + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&dv + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + 
"DRSAS_DRIVER_IOCTL_DRIVER_VERSION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + if (i == xferlen) + kdcmd->cmd_status = 0; + break; + case DRSAS_DRIVER_IOCTL_PCI_INFORMATION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMAITON")); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip, + 0, "reg", &props, &num_props)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : " + "ddi_prop_look_int_array failed")); + rval = DDI_FAILURE; + } else { + + pi.busNumber = (props[0] >> 16) & 0xFF; + pi.deviceNumber = (props[0] >> 11) & 0x1f; + pi.functionNumber = (props[0] >> 8) & 0x7; + ddi_prop_free((void *)props); + } + + pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo; + + for (i = 0; i < (sizeof (struct drsas_pci_information) - + offsetof(struct drsas_pci_information, pciHeaderInfo)); + i++) { + pci_conf_buf[i] = + pci_config_get8(instance->pci_handle, i); + } + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&pi + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + + if (i == xferlen) + kdcmd->cmd_status = 0; + + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "invalid driver specific IOCTL opcode = 0x%x", + kdcmd->opcode)); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + + return (rval); +} + +/* + * handle_mfi_ioctl + */ +static int +handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int rval = DDI_SUCCESS; + + struct drsas_header *hdr; + struct drsas_cmd *cmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "failed to get a cmd packet")); + return (DDI_FAILURE); + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + hdr = (struct drsas_header *)&ioctl->frame[0]; + + switch (hdr->cmd) { + case MFI_CMD_OP_DCMD: + rval = issue_mfi_dcmd(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_SMP: + rval = issue_mfi_smp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_STP: + rval = issue_mfi_stp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_PD_SCSI: + rval = issue_mfi_pthru(instance, ioctl, cmd, mode); + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: " + "invalid mfi ioctl hdr->cmd = %d", hdr->cmd)); + rval = DDI_FAILURE; + break; + } + + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) + rval = DDI_FAILURE; + return (rval); +} + +/* + * AEN + */ +static int +handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen) +{ + int rval = 0; + + rval = register_mfi_aen(instance, instance->aen_seq_num, + aen->class_locale_word); + + aen->cmd_status = (uint8_t)rval; + + return (rval); +} + +static int +register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num, + uint32_t class_locale_word) +{ + int ret_val; + + struct drsas_cmd *cmd, *aen_cmd; + struct drsas_dcmd_frame *dcmd; + union drsas_evt_class_locale curr_aen; + union drsas_evt_class_locale prev_aen; + + /* + * If there an AEN pending already (aen_cmd), check if the + * class_locale of that pending AEN is inclusive of the 
new + * AEN request we currently have. If it is, then we don't have + * to do anything. In other words, whichever events the current + * AEN request is subscribing to, have already been subscribed + * to. + * + * If the old_cmd is _not_ inclusive, then we have to abort + * that command, form a class_locale that is superset of both + * old and current and re-issue to the FW + */ + + curr_aen.word = class_locale_word; + aen_cmd = instance->aen_cmd; + if (aen_cmd) { + prev_aen.word = ddi_get32(aen_cmd->frame_dma_obj.acc_handle, + &aen_cmd->frame->dcmd.mbox.w[1]); + + /* + * A class whose enum value is smaller is inclusive of all + * higher values. If a PROGRESS (= -1) was previously + * registered, then a new registration requests for higher + * classes need not be sent to FW. They are automatically + * included. + * + * Locale numbers don't have such hierarchy. They are bitmap + * values + */ + if ((prev_aen.members.class <= curr_aen.members.class) && + !((prev_aen.members.locale & curr_aen.members.locale) ^ + curr_aen.members.locale)) { + /* + * Previously issued event registration includes + * current request. Nothing to do. + */ + + return (0); + } else { + curr_aen.members.locale |= prev_aen.members.locale; + + if (prev_aen.members.class < curr_aen.members.class) + curr_aen.members.class = prev_aen.members.class; + + ret_val = abort_aen_cmd(instance, aen_cmd); + + if (ret_val) { + con_log(CL_ANN, (CE_WARN, "register_mfi_aen: " + "failed to abort prevous AEN command")); + + return (ret_val); + } + } + } else { + curr_aen.word = class_locale_word; + } + + cmd = get_mfi_pkt(instance); + + if (!cmd) + return (ENOMEM); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* for(i = 0; i < DCMD_MBOX_SZ; i++) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + /* Prepare DCMD for aen registration */ + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_detail)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_WAIT); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], seq_num); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1], + curr_aen.word); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_detail)); + + instance->aen_seq_num = seq_num; + + + /* + * Store reference to the cmd used to register for AEN. 
When an + * application wants us to register for AEN, we have to abort this + * cmd and re-register with a new EVENT LOCALE supplied by that app + */ + instance->aen_cmd = cmd; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + /* atomic_add_16 (&instance->fw_outstanding, 1); */ + instance->func_ptr->issue_cmd(cmd, instance); + + return (0); +} + +static void +display_scsi_inquiry(caddr_t scsi_inq) +{ +#define MAX_SCSI_DEVICE_CODE 14 + int i; + char inquiry_buf[256] = {0}; + int len; + const char *const scsi_device_types[] = { + "Direct-Access ", + "Sequential-Access", + "Printer ", + "Processor ", + "WORM ", + "CD-ROM ", + "Scanner ", + "Optical Device ", + "Medium Changer ", + "Communications ", + "Unknown ", + "Unknown ", + "Unknown ", + "Enclosure ", + }; + + len = 0; + + len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); + for (i = 8; i < 16; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Model: "); + + for (i = 16; i < 32; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); + + for (i = 32; i < 36; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + + + i = scsi_inq[0] & 0x1f; + + + len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", + i < MAX_SCSI_DEVICE_CODE ? scsi_device_types[i] : + "Unknown "); + + + len += snprintf(inquiry_buf + len, 265 - len, + " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); + + if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) { + len += snprintf(inquiry_buf + len, 265 - len, " CCS\n"); + } else { + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + } + + con_log(CL_ANN1, (CE_CONT, inquiry_buf)); +} + +static int +read_fw_status_reg_ppc(struct drsas_instance *instance) +{ + return ((int)RD_OB_SCRATCH_PAD_0(instance)); +} + +static void +issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance) +{ + atomic_add_16(&instance->fw_outstanding, 1); + + /* Issue the command to the FW */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); +} + +/* + * issue_cmd_in_sync_mode + */ +static int +issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called")); + + cmd->cmd_status = ENODATA; + + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + mutex_enter(&instance->int_cmd_mtx); + + for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { + cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); + } + + mutex_exit(&instance->int_cmd_mtx); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done")); + + if (i < (msecs -1)) { + return (DDI_SUCCESS); + } else { + return (DDI_FAILURE); + } +} + +/* + * issue_cmd_in_poll_mode + */ +static int +issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint16_t flags; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + struct drsas_header *frame_hdr; + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called")); + + frame_hdr = (struct drsas_header *)cmd->frame; + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + flags = 
ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + /* issue the frame using inbound queue port */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + /* wait for cmd_status to change from 0xFF */ + for (i = 0; i < msecs && ( + ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE); i++) { + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE) { + con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: " + "cmd polling timed out")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static void +enable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called")); + + /* WR_OB_DOORBELL_CLEAR(0xFFFFFFFF, instance); */ + WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance); + + /* WR_OB_INTR_MASK(~0x80000000, instance); */ + WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: " + "outbound_intr_mask = 0x%x", mask)); +} + +static void +disable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask __unused; + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called")); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* WR_OB_INTR_MASK(0xFFFFFFFF, instance); */ + WR_OB_INTR_MASK(OB_INTR_MASK, instance); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); +#ifdef lint + mask = mask; +#endif +} + +static int +intr_ack_ppc(struct drsas_instance *instance) +{ + uint32_t status; + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called")); + + /* check if it is our interrupt */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x", status)); + + if (!(status & MFI_REPLY_2108_MESSAGE_INTR)) { + return (DDI_INTR_UNCLAIMED); + } + + /* clear the interrupt by writing back the same value */ + WR_OB_DOORBELL_CLEAR(status, instance); + + /* dummy READ */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared")); + + return (DDI_INTR_CLAIMED); +} + +static int +drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int ret = DDI_SUCCESS; + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if 
(drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + + ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0); + + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + + return (ret); +} + +/*ARGSUSED*/ +static int +drsas_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data) +{ + /* + * as the driver can always deal with an error in any dma or + * access handle, we can just return the fme_status value. + */ + pci_ereport_post(dip, err, NULL); + return (err->fme_status); +} + +static void +drsas_fm_init(struct drsas_instance *instance) +{ + /* Need to change iblock to priority for new MSI intr */ + ddi_iblock_cookie_t fm_ibc; + + /* Only register with IO Fault Services if we have some capability */ + if (instance->fm_capabilities) { + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_FLAGERR_ACC; + drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR; + + /* + * Register capabilities with IO Fault Services. + * fm_capabilities will be updated to indicate + * capabilities actually supported (not requested.) + */ + + ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc); + + /* + * Initialize pci ereport capabilities if ereport + * capable (should always be.) + */ + + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_setup(instance->dip); + } + + /* + * Register error callback if error callback capable. + */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_register(instance->dip, + drsas_fm_error_cb, (void*) instance); + } + } else { + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +static void +drsas_fm_fini(struct drsas_instance *instance) +{ + /* Only unregister FMA capabilities if registered */ + if (instance->fm_capabilities) { + /* + * Un-register error callback if error callback capable. 
+ */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_unregister(instance->dip); + } + + /* + * Release any resources allocated by pci_ereport_setup() + */ + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_teardown(instance->dip); + } + + /* Unregister from IO Fault Services */ + ddi_fm_fini(instance->dip); + + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +int +drsas_check_acc_handle(ddi_acc_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +int +drsas_check_dma_handle(ddi_dma_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +void +drsas_fm_ereport(struct drsas_instance *instance, char *detail) +{ + uint64_t ena; + char buf[FM_MAX_CLASS]; + + (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); + ena = fm_ena_generate(0, FM_ENA_FMT1); + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities)) { + ddi_fm_ereport_post(instance->dip, buf, ena, DDI_NOSLEEP, + FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL); + } +} + +static int +drsas_add_intrs(struct drsas_instance *instance, int intr_type) +{ + + dev_info_t *dip = instance->dip; + int avail, actual, count; + int i, flag, ret; + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x", + intr_type)); + + /* Get number of interrupts */ + ret = ddi_intr_get_nintrs(dip, intr_type, &count); + if ((ret != DDI_SUCCESS) || (count == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:" + "ret %d count %d", ret, count)); + + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count)); + + /* Get number of available interrupts */ + ret = ddi_intr_get_navail(dip, intr_type, &avail); + if ((ret != DDI_SUCCESS) || (avail == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:" + "ret %d avail %d", ret, avail)); + + return (DDI_FAILURE); + } + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail)); + + /* Only one interrupt routine. So limit the count to 1 */ + if (count > 1) { + count = 1; + } + + /* + * Allocate an array of interrupt handlers. Currently we support + * only one interrupt. The framework can be extended later. + */ + instance->intr_size = count * sizeof (ddi_intr_handle_t); + instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP); + ASSERT(instance->intr_htable); + + flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type == + DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL; + + /* Allocate interrupt */ + ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0, + count, &actual, flag); + + if ((ret != DDI_SUCCESS) || (actual == 0)) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "avail = %d", avail)); + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + if (actual < count) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "Requested = %d Received = %d", count, actual)); + } + instance->intr_cnt = actual; + + /* + * Get the priority of the interrupt allocated. 
+ */ + if ((ret = ddi_intr_get_pri(instance->intr_htable[0], + &instance->intr_pri)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "get priority call failed")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + /* + * Test for high level mutex. we don't support them. + */ + if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "High level interrupts not supported.")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ", + instance->intr_pri)); + + /* Call ddi_intr_add_handler() */ + for (i = 0; i < actual; i++) { + ret = ddi_intr_add_handler(instance->intr_htable[i], + (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance, + (caddr_t)(uintptr_t)i); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:" + "failed %d", ret)); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + } + + con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done")); + + if ((ret = ddi_intr_get_cap(instance->intr_htable[0], + &instance->intr_cap)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d", + ret)); + + /* Free already allocated intr */ + for (i = 0; i < actual; i++) { + (void) ddi_intr_remove_handler( + instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block _enable")); + + (void) ddi_intr_block_enable(instance->intr_htable, + instance->intr_cnt); + } else { + con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable")); + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_enable(instance->intr_htable[i]); + con_log(CL_ANN, (CE_NOTE, "ddi intr enable returns " + "%d", i)); + } + } + + return (DDI_SUCCESS); + +} + + +static void +drsas_rem_intrs(struct drsas_instance *instance) +{ + int i; + + con_log(CL_ANN, (CE_NOTE, "drsas_rem_intrs called")); + + /* Disable all interrupts first */ + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + (void) ddi_intr_block_disable(instance->intr_htable, + instance->intr_cnt); + } else { + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_disable(instance->intr_htable[i]); + } + } + + /* Remove all the handlers */ + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_remove_handler(instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + + kmem_free(instance->intr_htable, instance->intr_size); +} + +static int +drsas_tran_bus_config(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg, dev_info_t **childp) +{ + struct drsas_instance *instance; + int config; + int rval; + + char *ptr = NULL; + int tgt, lun; + + con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op)); + + if ((instance = ddi_get_soft_state(drsas_state, + ddi_get_instance(parent))) == NULL) { + return (NDI_FAILURE); + } + + /* Hold nexus during bus_config */ + ndi_devi_enter(parent, &config); + switch (op) { + case BUS_CONFIG_ONE: { + + /* parse wwid/target name out 
of name given */ + if ((ptr = strchr((char *)arg, '@')) == NULL) { + rval = NDI_FAILURE; + break; + } + ptr++; + + if (drsas_parse_devname(arg, &tgt, &lun) != 0) { + rval = NDI_FAILURE; + break; + } + + if (lun == 0) { + rval = drsas_config_ld(instance, tgt, lun, childp); + } else { + rval = NDI_FAILURE; + } + + break; + } + case BUS_CONFIG_DRIVER: + case BUS_CONFIG_ALL: { + + rval = drsas_config_all_devices(instance); + + rval = NDI_SUCCESS; + break; + } + } + + if (rval == NDI_SUCCESS) { + rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0); + + } + ndi_devi_exit(parent, config); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x", + rval)); + return (rval); +} + +static int +drsas_config_all_devices(struct drsas_instance *instance) +{ + int rval, tgt; + + for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) { + (void) drsas_config_ld(instance, tgt, 0, NULL); + + } + + rval = NDI_SUCCESS; + return (rval); +} + +static int +drsas_parse_devname(char *devnm, int *tgt, int *lun) +{ + char devbuf[SCSI_MAXNAMELEN]; + char *addr; + char *p, *tp, *lp; + long num; + + /* Parse dev name and address */ + (void) strcpy(devbuf, devnm); + addr = ""; + for (p = devbuf; *p != '\0'; p++) { + if (*p == '@') { + addr = p + 1; + *p = '\0'; + } else if (*p == ':') { + *p = '\0'; + break; + } + } + + /* Parse target and lun */ + for (p = tp = addr, lp = NULL; *p != '\0'; p++) { + if (*p == ',') { + lp = p + 1; + *p = '\0'; + break; + } + } + if (tgt && tp) { + if (ddi_strtol(tp, NULL, 0x10, &num)) { + return (DDI_FAILURE); /* Can declare this as constant */ + } + *tgt = (int)num; + } + if (lun && lp) { + if (ddi_strtol(lp, NULL, 0x10, &num)) { + return (DDI_FAILURE); + } + *lun = (int)num; + } + return (DDI_SUCCESS); /* Success case */ +} + +static int +drsas_config_ld(struct drsas_instance *instance, uint16_t tgt, + uint8_t lun, dev_info_t **ldip) +{ + struct scsi_device *sd; + dev_info_t *child; + int rval; + + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d", + tgt, lun)); + + if ((child = drsas_find_child(instance, tgt, lun)) != NULL) { + if (ldip) { + *ldip = child; + } + con_log(CL_ANN1, (CE_NOTE, + "drsas_config_ld: Child = %p found t = %d l = %d", + (void *)child, tgt, lun)); + return (NDI_SUCCESS); + } + + sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP); + sd->sd_address.a_hba_tran = instance->tran; + sd->sd_address.a_target = (uint16_t)tgt; + sd->sd_address.a_lun = (uint8_t)lun; + + if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS) + rval = drsas_config_scsi_device(instance, sd, ldip); + else + rval = NDI_FAILURE; + + /* sd_unprobe is blank now. 
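As a quick illustration of the parsing contract implemented by drsas_parse_devname() above: the unit address following '@' is read as hexadecimal "<target>,<lun>". The child name used here is hypothetical.

	char name[] = "sd@a,0";		/* hypothetical child name */
	int tgt, lun;

	if (drsas_parse_devname(name, &tgt, &lun) == DDI_SUCCESS) {
		/* tgt == 0x0a, lun == 0 */
	}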
Free buffer manually */ + if (sd->sd_inq) { + kmem_free(sd->sd_inq, SUN_INQSIZE); + sd->sd_inq = (struct scsi_inquiry *)NULL; + } + + kmem_free(sd, sizeof (struct scsi_device)); + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d", + rval)); + return (rval); +} + +static int +drsas_config_scsi_device(struct drsas_instance *instance, + struct scsi_device *sd, dev_info_t **dipp) +{ + char *nodename = NULL; + char **compatible = NULL; + int ncompatible = 0; + char *childname; + dev_info_t *ldip = NULL; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK; + int rval; + + con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun)); + scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype, + NULL, &nodename, &compatible, &ncompatible); + + if (nodename == NULL) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver " + "for t%dL%d", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename; + con_log(CL_ANN1, (CE_WARN, + "dr_sas: Childname = %2s nodename = %s", childname, nodename)); + + /* Create a dev node */ + rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip); + con_log(CL_ANN1, (CE_WARN, + "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval)); + if (rval == NDI_SUCCESS) { + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d target", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d lun", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip, + "compatible", compatible, ncompatible) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d compatible", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH); + if (rval != NDI_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online " + "t%dl%d", tgt, lun)); + ndi_prop_remove_all(ldip); + (void) ndi_devi_free(ldip); + } else { + con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :" + "0 t%dl%d", tgt, lun)); + } + + } +finish: + if (dipp) { + *dipp = ldip; + } + + con_log(CL_DLEVEL1, (CE_WARN, + "dr_sas: config_scsi_device rval = %d t%dL%d", + rval, tgt, lun)); + scsi_hba_nodename_compatible_free(nodename, compatible); + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event, + uint64_t wwn) +{ + struct drsas_eventinfo *mrevt = NULL; + + con_log(CL_ANN1, (CE_NOTE, + "drsas_service_evt called for t%dl%d event = %d", + tgt, lun, event)); + + if ((instance->taskq == NULL) || (mrevt = + kmem_zalloc(sizeof (struct drsas_eventinfo), KM_NOSLEEP)) == NULL) { + return (ENOMEM); + } + + mrevt->instance = instance; + mrevt->tgt = tgt; + mrevt->lun = lun; + mrevt->event = event; + + if ((ddi_taskq_dispatch(instance->taskq, + (void (*)(void *))drsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) != + DDI_SUCCESS) { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: Event task failed for t%dl%d event = %d", + tgt, lun, event)); + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + return (DDI_FAILURE); + } + return (DDI_SUCCESS); +} + +static void +drsas_issue_evt_taskq(struct drsas_eventinfo 
*mrevt) +{ + struct drsas_instance *instance = mrevt->instance; + dev_info_t *dip, *pdip; + int circ1 = 0; + char *devname; + + con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for" + " tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) { + dip = instance->dr_ld_list[mrevt->tgt].dip; + } else { + return; + } + + ndi_devi_enter(instance->dip, &circ1); + switch (mrevt->event) { + case DRSAS_EVT_CONFIG_TGT: + if (dip == NULL) { + + if (mrevt->lun == 0) { + (void) drsas_config_ld(instance, mrevt->tgt, + 0, NULL); + } + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_CONFIG_TGT dip != NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + case DRSAS_EVT_UNCONFIG_TGT: + if (dip) { + if (i_ddi_devi_attached(dip)) { + + pdip = ddi_get_parent(dip); + + devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP); + (void) ddi_deviname(dip, devname); + + (void) devfs_clean(pdip, devname + 1, + DV_CLEAN_FORCE); + kmem_free(devname, MAXNAMELEN + 1); + } + (void) ndi_devi_offline(dip, NDI_DEVI_REMOVE); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT called:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } else { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: EVT_UNCONFIG_TGT dip == NULL:" + " for tgt %d lun %d event %d", + mrevt->tgt, mrevt->lun, mrevt->event)); + } + break; + } + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + ndi_devi_exit(instance->dip, circ1); +} + +static int +drsas_mode_sense_build(struct scsi_pkt *pkt) +{ + union scsi_cdb *cdbp; + uint16_t page_code; + struct scsa_cmd *acmd; + struct buf *bp; + struct mode_header *modehdrp; + + cdbp = (void *)pkt->pkt_cdbp; + page_code = cdbp->cdb_un.sg.scsi[0]; + acmd = PKT2CMD(pkt); + bp = acmd->cmd_buf; + if ((!bp) && bp->b_un.b_addr && bp->b_bcount && acmd->cmd_dmacount) { + con_log(CL_ANN1, (CE_WARN, "Failing MODESENSE Command")); + /* ADD pkt statistics as Command failed. */ + return (NULL); + } + + bp_mapin(bp); + bzero(bp->b_un.b_addr, bp->b_bcount); + + switch (page_code) { + case 0x3: { + struct mode_format *page3p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page3p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page3p->mode_page.code = 0x3; + page3p->mode_page.length = + (uchar_t)(sizeof (struct mode_format)); + page3p->data_bytes_sect = 512; + page3p->sect_track = 63; + break; + } + case 0x4: { + struct mode_geometry *page4p = NULL; + modehdrp = (struct mode_header *)(bp->b_un.b_addr); + modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH; + + page4p = (void *)((caddr_t)modehdrp + + MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH); + page4p->mode_page.code = 0x4; + page4p->mode_page.length = + (uchar_t)(sizeof (struct mode_geometry)); + page4p->heads = 255; + page4p->rpm = 10000; + break; + } + default: + break; + } + return (NULL); +} diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf new file mode 100644 index 0000000000..3792f43ca4 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf @@ -0,0 +1,15 @@ +# +# Copyright (c) 2008-2009, LSI Logic Corporation. +# All rights reserved. +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
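For context, a driver property declared in this configuration file is normally retrieved at attach time through the DDI property interfaces. A minimal sketch, assuming the drsas-enable-msi flag described below is read as a string (the actual dr_sas attach path may use a different property routine):

	char *prop = NULL;
	int enable_msi = 0;

	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "drsas-enable-msi", &prop) == DDI_PROP_SUCCESS) {
		enable_msi = (strcmp(prop, "yes") == 0);
		ddi_prop_free(prop);
	}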
+# + +# +# dr_sas.conf for sol 10 (and later) for all supported architectures +# +# global definitions + +# MSI specific flag. user can uncomment this line and set flag "yes" to enable MSI +#drsas-enable-msi="yes"; diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h new file mode 100644 index 0000000000..8f78658edf --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.h @@ -0,0 +1,1766 @@ +/* + * dr_sas.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_H_ +#define _DR_SAS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/scsi/scsi.h> +#include "dr_sas_list.h" + +/* + * MegaRAID SAS2.0 Driver meta data + */ +#define DRSAS_VERSION "LSIv2.0" +#define DRSAS_RELDATE "Jan 9, 2009" + +#define DRSAS_TRUE 1 +#define DRSAS_FALSE 0 + +/* + * MegaRAID SAS2.0 device id conversion definitions. + */ +#define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT) + +/* + * MegaRAID SAS2.0 supported controllers + */ +#define PCI_DEVICE_ID_LSI_2108VDE 0x0078 +#define PCI_DEVICE_ID_LSI_2108V 0x0079 + +/* + * Register Index for 2108 Controllers. + */ +#define REGISTER_SET_IO_2108 (2) + +#define DRSAS_MAX_SGE_CNT 0x50 + +#define DRSAS_IOCTL_DRIVER 0x12341234 +#define DRSAS_IOCTL_FIRMWARE 0x12345678 +#define DRSAS_IOCTL_AEN 0x87654321 + +#define DRSAS_1_SECOND 1000000 + +/* Dynamic Enumeration Flags */ +#define DRSAS_PD_LUN 1 +#define DRSAS_LD_LUN 0 +#define DRSAS_PD_TGT_MAX 255 +#define DRSAS_GET_PD_MAX(s) ((s)->dr_pd_max) +#define WWN_STRLEN 17 + +/* + * ===================================== + * MegaRAID SAS2.0 MFI firmware definitions + * ===================================== + */ +/* + * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for + * protocol between the software and firmware. 
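The two PCI device IDs above are the only controllers this driver claims. A minimal, hypothetical probe-time check (the function name is illustrative and not taken from the patch):

static int
example_device_supported(uint16_t device_id)
{
	switch (device_id) {
	case PCI_DEVICE_ID_LSI_2108VDE:		/* 0x0078 */
	case PCI_DEVICE_ID_LSI_2108V:		/* 0x0079 */
		return (1);	/* maps registers via REGISTER_SET_IO_2108 */
	default:
		return (0);
	}
}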
Commands are issued using + * "message frames" + */ + +/* + * FW posts its state in upper 4 bits of outbound_msg_0 register + */ +#define MFI_STATE_SHIFT 28 +#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT) +#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT) +#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT) +#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT) +#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT) +#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT) +#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT) +#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT) +#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT) +#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT) + +#define MRMFI_FRAME_SIZE 64 + +/* + * During FW init, clear pending cmds & reset state using inbound_msg_0 + * + * ABORT : Abort all pending cmds + * READY : Move from OPERATIONAL to READY state; discard queue info + * MFIMODE : Discard (possible) low MFA posted in 64-bit mode (??) + * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver + */ +#define MFI_INIT_ABORT 0x00000001 +#define MFI_INIT_READY 0x00000002 +#define MFI_INIT_MFIMODE 0x00000004 +#define MFI_INIT_CLEAR_HANDSHAKE 0x00000008 +#define MFI_INIT_HOTPLUG 0x00000010 +#define MFI_STOP_ADP 0x00000020 +#define MFI_RESET_FLAGS MFI_INIT_READY|MFI_INIT_MFIMODE|MFI_INIT_ABORT + +/* + * MFI frame flags + */ +#define MFI_FRAME_POST_IN_REPLY_QUEUE 0x0000 +#define MFI_FRAME_DONT_POST_IN_REPLY_QUEUE 0x0001 +#define MFI_FRAME_SGL32 0x0000 +#define MFI_FRAME_SGL64 0x0002 +#define MFI_FRAME_SENSE32 0x0000 +#define MFI_FRAME_SENSE64 0x0004 +#define MFI_FRAME_DIR_NONE 0x0000 +#define MFI_FRAME_DIR_WRITE 0x0008 +#define MFI_FRAME_DIR_READ 0x0010 +#define MFI_FRAME_DIR_BOTH 0x0018 + +/* + * Definition for cmd_status + */ +#define MFI_CMD_STATUS_POLL_MODE 0xFF +#define MFI_CMD_STATUS_SYNC_MODE 0xFF + +/* + * MFI command opcodes + */ +#define MFI_CMD_OP_INIT 0x00 +#define MFI_CMD_OP_LD_READ 0x01 +#define MFI_CMD_OP_LD_WRITE 0x02 +#define MFI_CMD_OP_LD_SCSI 0x03 +#define MFI_CMD_OP_PD_SCSI 0x04 +#define MFI_CMD_OP_DCMD 0x05 +#define MFI_CMD_OP_ABORT 0x06 +#define MFI_CMD_OP_SMP 0x07 +#define MFI_CMD_OP_STP 0x08 + +#define DR_DCMD_CTRL_GET_INFO 0x01010000 + +#define DR_DCMD_CTRL_CACHE_FLUSH 0x01101000 +#define DR_FLUSH_CTRL_CACHE 0x01 +#define DR_FLUSH_DISK_CACHE 0x02 + +#define DR_DCMD_CTRL_SHUTDOWN 0x01050000 +#define DRSAS_ENABLE_DRIVE_SPINDOWN 0x01 + +#define DR_DCMD_CTRL_EVENT_GET_INFO 0x01040100 +#define DR_DCMD_CTRL_EVENT_GET 0x01040300 +#define DR_DCMD_CTRL_EVENT_WAIT 0x01040500 +#define DR_DCMD_LD_GET_PROPERTIES 0x03030000 +#define DR_DCMD_PD_GET_INFO 0x02020000 + +/* + * Solaris Specific MAX values + */ +#define MAX_SGL 24 +/* + * MFI command completion codes + */ +enum MFI_STAT { + MFI_STAT_OK = 0x00, + MFI_STAT_INVALID_CMD = 0x01, + MFI_STAT_INVALID_DCMD = 0x02, + MFI_STAT_INVALID_PARAMETER = 0x03, + MFI_STAT_INVALID_SEQUENCE_NUMBER = 0x04, + MFI_STAT_ABORT_NOT_POSSIBLE = 0x05, + MFI_STAT_APP_HOST_CODE_NOT_FOUND = 0x06, + MFI_STAT_APP_IN_USE = 0x07, + MFI_STAT_APP_NOT_INITIALIZED = 0x08, + MFI_STAT_ARRAY_INDEX_INVALID = 0x09, + MFI_STAT_ARRAY_ROW_NOT_EMPTY = 0x0a, + MFI_STAT_CONFIG_RESOURCE_CONFLICT = 0x0b, + MFI_STAT_DEVICE_NOT_FOUND = 0x0c, + MFI_STAT_DRIVE_TOO_SMALL = 0x0d, + MFI_STAT_FLASH_ALLOC_FAIL = 0x0e, + MFI_STAT_FLASH_BUSY = 
0x0f, + MFI_STAT_FLASH_ERROR = 0x10, + MFI_STAT_FLASH_IMAGE_BAD = 0x11, + MFI_STAT_FLASH_IMAGE_INCOMPLETE = 0x12, + MFI_STAT_FLASH_NOT_OPEN = 0x13, + MFI_STAT_FLASH_NOT_STARTED = 0x14, + MFI_STAT_FLUSH_FAILED = 0x15, + MFI_STAT_HOST_CODE_NOT_FOUNT = 0x16, + MFI_STAT_LD_CC_IN_PROGRESS = 0x17, + MFI_STAT_LD_INIT_IN_PROGRESS = 0x18, + MFI_STAT_LD_LBA_OUT_OF_RANGE = 0x19, + MFI_STAT_LD_MAX_CONFIGURED = 0x1a, + MFI_STAT_LD_NOT_OPTIMAL = 0x1b, + MFI_STAT_LD_RBLD_IN_PROGRESS = 0x1c, + MFI_STAT_LD_RECON_IN_PROGRESS = 0x1d, + MFI_STAT_LD_WRONG_RAID_LEVEL = 0x1e, + MFI_STAT_MAX_SPARES_EXCEEDED = 0x1f, + MFI_STAT_MEMORY_NOT_AVAILABLE = 0x20, + MFI_STAT_MFC_HW_ERROR = 0x21, + MFI_STAT_NO_HW_PRESENT = 0x22, + MFI_STAT_NOT_FOUND = 0x23, + MFI_STAT_NOT_IN_ENCL = 0x24, + MFI_STAT_PD_CLEAR_IN_PROGRESS = 0x25, + MFI_STAT_PD_TYPE_WRONG = 0x26, + MFI_STAT_PR_DISABLED = 0x27, + MFI_STAT_ROW_INDEX_INVALID = 0x28, + MFI_STAT_SAS_CONFIG_INVALID_ACTION = 0x29, + MFI_STAT_SAS_CONFIG_INVALID_DATA = 0x2a, + MFI_STAT_SAS_CONFIG_INVALID_PAGE = 0x2b, + MFI_STAT_SAS_CONFIG_INVALID_TYPE = 0x2c, + MFI_STAT_SCSI_DONE_WITH_ERROR = 0x2d, + MFI_STAT_SCSI_IO_FAILED = 0x2e, + MFI_STAT_SCSI_RESERVATION_CONFLICT = 0x2f, + MFI_STAT_SHUTDOWN_FAILED = 0x30, + MFI_STAT_TIME_NOT_SET = 0x31, + MFI_STAT_WRONG_STATE = 0x32, + MFI_STAT_LD_OFFLINE = 0x33, + /* UNUSED: 0x34 to 0xfe */ + MFI_STAT_INVALID_STATUS = 0xFF +}; + +enum DR_EVT_CLASS { + DR_EVT_CLASS_DEBUG = -2, + DR_EVT_CLASS_PROGRESS = -1, + DR_EVT_CLASS_INFO = 0, + DR_EVT_CLASS_WARNING = 1, + DR_EVT_CLASS_CRITICAL = 2, + DR_EVT_CLASS_FATAL = 3, + DR_EVT_CLASS_DEAD = 4 +}; + +enum DR_EVT_LOCALE { + DR_EVT_LOCALE_LD = 0x0001, + DR_EVT_LOCALE_PD = 0x0002, + DR_EVT_LOCALE_ENCL = 0x0004, + DR_EVT_LOCALE_BBU = 0x0008, + DR_EVT_LOCALE_SAS = 0x0010, + DR_EVT_LOCALE_CTRL = 0x0020, + DR_EVT_LOCALE_CONFIG = 0x0040, + DR_EVT_LOCALE_CLUSTER = 0x0080, + DR_EVT_LOCALE_ALL = 0xffff +}; + +#define DR_EVT_CFG_CLEARED 0x0004 +#define DR_EVT_LD_CREATED 0x008a +#define DR_EVT_LD_DELETED 0x008b +#define DR_EVT_PD_REMOVED_EXT 0x00f8 +#define DR_EVT_PD_INSERTED_EXT 0x00f7 + +enum LD_STATE { + LD_OFFLINE = 0, + LD_PARTIALLY_DEGRADED = 1, + LD_DEGRADED = 2, + LD_OPTIMAL = 3, + LD_INVALID = 0xFF +}; + +enum DRSAS_EVT { + DRSAS_EVT_CONFIG_TGT = 0, + DRSAS_EVT_UNCONFIG_TGT = 1, + DRSAS_EVT_UNCONFIG_SMP = 2 +}; + +#define DMA_OBJ_ALLOCATED 1 +#define DMA_OBJ_REALLOCATED 2 +#define DMA_OBJ_FREED 3 + +/* + * dma_obj_t - Our DMA object + * @param buffer : kernel virtual address + * @param size : size of the data to be allocated + * @param acc_handle : access handle + * @param dma_handle : dma handle + * @param dma_cookie : scatter-gather list + * @param dma_attr : dma attributes for this buffer + * Our DMA object. The caller must initialize the size and dma attributes + * (dma_attr) fields before allocating the resources. 
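A hypothetical caller sketch of that contract follows; the endian argument, the attribute values, and the return-value check are assumptions for illustration, not taken from this driver's actual callers.

static int
example_alloc_internal_buf(struct drsas_instance *instance)
{
	dma_obj_t obj;

	bzero(&obj, sizeof (obj));
	obj.size = 4096;				/* caller-chosen size */
	obj.dma_attr = drsas_generic_dma_attr;		/* then tweak as needed */
	obj.dma_attr.dma_attr_sgllen = 1;

	/* the endian flag argument is an assumption for illustration */
	if (drsas_alloc_dma_obj(instance, &obj, DDI_STRUCTURE_LE_ACC)
	    != DDI_SUCCESS)
		return (DDI_FAILURE);

	/* ... use obj.buffer and obj.dma_cookie[0].dmac_address ... */

	(void) drsas_free_dma_obj(instance, obj);
	return (DDI_SUCCESS);
}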
+ */ +typedef struct { + caddr_t buffer; + uint32_t size; + ddi_acc_handle_t acc_handle; + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie[DRSAS_MAX_SGE_CNT]; + ddi_dma_attr_t dma_attr; + uint8_t status; + uint8_t reserved[3]; +} dma_obj_t; + +struct drsas_eventinfo { + struct drsas_instance *instance; + int tgt; + int lun; + int event; +}; + +struct drsas_ld { + dev_info_t *dip; + uint8_t lun_type; + uint8_t reserved[3]; +}; + +struct drsas_pd { + dev_info_t *dip; + uint8_t lun_type; + uint8_t dev_id; + uint8_t flags; + uint8_t reserved; +}; + +struct drsas_pd_info { + uint16_t deviceId; + uint16_t seqNum; + uint8_t inquiryData[96]; + uint8_t vpdPage83[64]; + uint8_t notSupported; + uint8_t scsiDevType; + uint8_t a; + uint8_t device_speed; + uint32_t mediaerrcnt; + uint32_t other; + uint32_t pred; + uint32_t lastpred; + uint16_t fwState; + uint8_t disabled; + uint8_t linkspwwd; + uint32_t ddfType; + struct { + uint8_t count; + uint8_t isPathBroken; + uint8_t connectorIndex[2]; + uint8_t reserved[4]; + uint64_t sasAddr[2]; + uint8_t reserved2[16]; + } pathInfo; +}; + +typedef struct drsas_instance { + uint32_t *producer; + uint32_t *consumer; + + uint32_t *reply_queue; + dma_obj_t mfi_internal_dma_obj; + + uint8_t init_id; + uint8_t reserved[3]; + + uint16_t max_num_sge; + uint16_t max_fw_cmds; + uint32_t max_sectors_per_req; + + struct drsas_cmd **cmd_list; + + mlist_t cmd_pool_list; + kmutex_t cmd_pool_mtx; + + mlist_t cmd_pend_list; + kmutex_t cmd_pend_mtx; + + dma_obj_t mfi_evt_detail_obj; + struct drsas_cmd *aen_cmd; + + uint32_t aen_seq_num; + uint32_t aen_class_locale_word; + + scsi_hba_tran_t *tran; + + kcondvar_t int_cmd_cv; + kmutex_t int_cmd_mtx; + + kcondvar_t aen_cmd_cv; + kmutex_t aen_cmd_mtx; + + kcondvar_t abort_cmd_cv; + kmutex_t abort_cmd_mtx; + + dev_info_t *dip; + ddi_acc_handle_t pci_handle; + + timeout_id_t timeout_id; + uint32_t unique_id; + uint16_t fw_outstanding; + caddr_t regmap; + ddi_acc_handle_t regmap_handle; + uint8_t isr_level; + ddi_iblock_cookie_t iblock_cookie; + ddi_iblock_cookie_t soft_iblock_cookie; + ddi_softintr_t soft_intr_id; + uint8_t softint_running; + kmutex_t completed_pool_mtx; + mlist_t completed_pool_list; + + caddr_t internal_buf; + uint32_t internal_buf_dmac_add; + uint32_t internal_buf_size; + + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + int instance; + int baseaddress; + char iocnode[16]; + + int fm_capabilities; + + struct drsas_func_ptr *func_ptr; + /* MSI interrupts specific */ + ddi_intr_handle_t *intr_htable; + int intr_type; + int intr_cnt; + size_t intr_size; + uint_t intr_pri; + int intr_cap; + + ddi_taskq_t *taskq; + struct drsas_ld *dr_ld_list; +} drsas_t; + +struct drsas_func_ptr { + int (*read_fw_status_reg)(struct drsas_instance *); + void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *); + int (*issue_cmd_in_sync_mode)(struct drsas_instance *, + struct drsas_cmd *); + int (*issue_cmd_in_poll_mode)(struct drsas_instance *, + struct drsas_cmd *); + void (*enable_intr)(struct drsas_instance *); + void (*disable_intr)(struct drsas_instance *); + int (*intr_ack)(struct drsas_instance *); +}; + +/* + * ### Helper routines ### + */ + +/* + * con_log() - console log routine + * @param level : indicates the severity of the message. + * @fparam mt : format string + * + * con_log displays the error messages on the console based on the current + * debug level. Also it attaches the appropriate kernel severity level with + * the message. 
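Usage-wise, the second argument to con_log() is a complete, parenthesized cmn_err(9F) argument list, which is why calls take double parentheses; the message is emitted only when the global debug_level_g is at or above the requested level. For example:

	con_log(CL_ANN, (CE_WARN, "drsas: attach failed"));
	con_log(CL_DLEVEL1, (CE_NOTE, "drsas: %d cmds outstanding", 8));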
+ * + * + * console messages debug levels + */ +#define CL_NONE 0 /* No debug information */ +#define CL_ANN 1 /* print unconditionally, announcements */ +#define CL_ANN1 2 /* No o/p */ +#define CL_DLEVEL1 3 /* debug level 1, informative */ +#define CL_DLEVEL2 4 /* debug level 2, verbose */ +#define CL_DLEVEL3 5 /* debug level 3, very verbose */ + +#ifdef __SUNPRO_C +#define __func__ "" +#endif + +#define con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; } + +/* + * ### SCSA definitions ### + */ +#define PKT2TGT(pkt) ((pkt)->pkt_address.a_target) +#define PKT2LUN(pkt) ((pkt)->pkt_address.a_lun) +#define PKT2TRAN(pkt) ((pkt)->pkt_adress.a_hba_tran) +#define ADDR2TRAN(ap) ((ap)->a_hba_tran) + +#define TRAN2MR(tran) (struct drsas_instance *)(tran)->tran_hba_private) +#define ADDR2MR(ap) (TRAN2MR(ADDR2TRAN(ap)) + +#define PKT2CMD(pkt) ((struct scsa_cmd *)(pkt)->pkt_ha_private) +#define CMD2PKT(sp) ((sp)->cmd_pkt) +#define PKT2REQ(pkt) (&(PKT2CMD(pkt)->request)) + +#define CMD2ADDR(cmd) (&CMD2PKT(cmd)->pkt_address) +#define CMD2TRAN(cmd) (CMD2PKT(cmd)->pkt_address.a_hba_tran) +#define CMD2MR(cmd) (TRAN2MR(CMD2TRAN(cmd))) + +#define CFLAG_DMAVALID 0x0001 /* requires a dma operation */ +#define CFLAG_DMASEND 0x0002 /* Transfer from the device */ +#define CFLAG_CONSISTENT 0x0040 /* consistent data transfer */ + +/* + * ### Data structures for ioctl inteface and internal commands ### + */ + +/* + * Data direction flags + */ +#define UIOC_RD 0x00001 +#define UIOC_WR 0x00002 + +#define SCP2HOST(scp) (scp)->device->host /* to host */ +#define SCP2HOSTDATA(scp) SCP2HOST(scp)->hostdata /* to soft state */ +#define SCP2CHANNEL(scp) (scp)->device->channel /* to channel */ +#define SCP2TARGET(scp) (scp)->device->id /* to target */ +#define SCP2LUN(scp) (scp)->device->lun /* to LUN */ + +#define SCSIHOST2ADAP(host) (((caddr_t *)(host->hostdata))[0]) +#define SCP2ADAPTER(scp) \ + (struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp)) + +#define MRDRV_IS_LOGICAL_SCSA(instance, acmd) \ + (acmd->device_id < MRDRV_MAX_LD) ? 1 : 0 +#define MRDRV_IS_LOGICAL(ap) \ + ((ap->a_target < MRDRV_MAX_LD) && (ap->a_lun == 0)) ? 
1 : 0 +#define MAP_DEVICE_ID(instance, ap) \ + (ap->a_target) + +#define HIGH_LEVEL_INTR 1 +#define NORMAL_LEVEL_INTR 0 + +/* + * scsa_cmd - Per-command mr private data + * @param cmd_dmahandle : dma handle + * @param cmd_dmacookies : current dma cookies + * @param cmd_pkt : scsi_pkt reference + * @param cmd_dmacount : dma count + * @param cmd_cookie : next cookie + * @param cmd_ncookies : cookies per window + * @param cmd_cookiecnt : cookies per sub-win + * @param cmd_nwin : number of dma windows + * @param cmd_curwin : current dma window + * @param cmd_dma_offset : current window offset + * @param cmd_dma_len : current window length + * @param cmd_flags : private flags + * @param cmd_cdblen : length of cdb + * @param cmd_scblen : length of scb + * @param cmd_buf : command buffer + * @param channel : channel for scsi sub-system + * @param target : target for scsi sub-system + * @param lun : LUN for scsi sub-system + * + * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E) + * - Pointed to by pkt_ha_private field in scsi_pkt + */ +struct scsa_cmd { + ddi_dma_handle_t cmd_dmahandle; + ddi_dma_cookie_t cmd_dmacookies[DRSAS_MAX_SGE_CNT]; + struct scsi_pkt *cmd_pkt; + ulong_t cmd_dmacount; + uint_t cmd_cookie; + uint_t cmd_ncookies; + uint_t cmd_cookiecnt; + uint_t cmd_nwin; + uint_t cmd_curwin; + off_t cmd_dma_offset; + ulong_t cmd_dma_len; + ulong_t cmd_flags; + uint_t cmd_cdblen; + uint_t cmd_scblen; + struct buf *cmd_buf; + ushort_t device_id; + uchar_t islogical; + uchar_t lun; + struct drsas_device *drsas_dev; +}; + + +struct drsas_cmd { + union drsas_frame *frame; + uint32_t frame_phys_addr; + uint8_t *sense; + uint32_t sense_phys_addr; + dma_obj_t frame_dma_obj; + uint8_t frame_dma_obj_status; + + uint32_t index; + uint8_t sync_cmd; + uint8_t cmd_status; + uint16_t abort_aen; + mlist_t list; + uint32_t frame_count; + struct scsa_cmd *cmd; + struct scsi_pkt *pkt; +}; + +#define MAX_MGMT_ADAPTERS 1024 +#define IOC_SIGNATURE "MR-SAS" + +#define IOC_CMD_FIRMWARE 0x0 +#define DRSAS_DRIVER_IOCTL_COMMON 0xF0010000 +#define DRSAS_DRIVER_IOCTL_DRIVER_VERSION 0xF0010100 +#define DRSAS_DRIVER_IOCTL_PCI_INFORMATION 0xF0010200 +#define DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS 0xF0010300 + + +#define DRSAS_MAX_SENSE_LENGTH 32 + +struct drsas_mgmt_info { + + uint16_t count; + struct drsas_instance *instance[MAX_MGMT_ADAPTERS]; + uint16_t map[MAX_MGMT_ADAPTERS]; + int max_index; +}; + +#pragma pack(1) + +/* + * SAS controller properties + */ +struct drsas_ctrl_prop { + uint16_t seq_num; + uint16_t pred_fail_poll_interval; + uint16_t intr_throttle_count; + uint16_t intr_throttle_timeouts; + + uint8_t rebuild_rate; + uint8_t patrol_read_rate; + uint8_t bgi_rate; + uint8_t cc_rate; + uint8_t recon_rate; + + uint8_t cache_flush_interval; + + uint8_t spinup_drv_count; + uint8_t spinup_delay; + + uint8_t cluster_enable; + uint8_t coercion_mode; + uint8_t disk_write_cache_disable; + uint8_t alarm_enable; + + uint8_t reserved[44]; +}; + +/* + * SAS controller information + */ +struct drsas_ctrl_info { + /* PCI device information */ + struct { + uint16_t vendor_id; + uint16_t device_id; + uint16_t sub_vendor_id; + uint16_t sub_device_id; + uint8_t reserved[24]; + } pci; + + /* Host interface information */ + struct { + uint8_t PCIX : 1; + uint8_t PCIE : 1; + uint8_t iSCSI : 1; + uint8_t SAS_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } host_interface; + + /* Device (backend) interface information */ + struct { + uint8_t SPI : 1; + uint8_t 
SAS_3G : 1; + uint8_t SATA_1_5G : 1; + uint8_t SATA_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } device_interface; + + /* List of components residing in flash. All str are null terminated */ + uint32_t image_check_word; + uint32_t image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char built_time[16]; + } image_component[8]; + + /* + * List of flash components that have been flashed on the card, but + * are not in use, pending reset of the adapter. This list will be + * empty if a flash operation has not occurred. All stings are null + * terminated + */ + uint32_t pending_image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char build_time[16]; + } pending_image_component[8]; + + uint8_t max_arms; + uint8_t max_spans; + uint8_t max_arrays; + uint8_t max_lds; + + char product_name[80]; + char serial_no[32]; + + /* + * Other physical/controller/operation information. Indicates the + * presence of the hardware + */ + struct { + uint32_t bbu : 1; + uint32_t alarm : 1; + uint32_t nvram : 1; + uint32_t uart : 1; + uint32_t reserved : 28; + } hw_present; + + uint32_t current_fw_time; + + /* Maximum data transfer sizes */ + uint16_t max_concurrent_cmds; + uint16_t max_sge_count; + uint32_t max_request_size; + + /* Logical and physical device counts */ + uint16_t ld_present_count; + uint16_t ld_degraded_count; + uint16_t ld_offline_count; + + uint16_t pd_present_count; + uint16_t pd_disk_present_count; + uint16_t pd_disk_pred_failure_count; + uint16_t pd_disk_failed_count; + + /* Memory size information */ + uint16_t nvram_size; + uint16_t memory_size; + uint16_t flash_size; + + /* Error counters */ + uint16_t mem_correctable_error_count; + uint16_t mem_uncorrectable_error_count; + + /* Cluster information */ + uint8_t cluster_permitted; + uint8_t cluster_active; + uint8_t reserved_1[2]; + + /* Controller capabilities structures */ + struct { + uint32_t raid_level_0 : 1; + uint32_t raid_level_1 : 1; + uint32_t raid_level_5 : 1; + uint32_t raid_level_1E : 1; + uint32_t reserved : 28; + } raid_levels; + + struct { + uint32_t rbld_rate : 1; + uint32_t cc_rate : 1; + uint32_t bgi_rate : 1; + uint32_t recon_rate : 1; + uint32_t patrol_rate : 1; + uint32_t alarm_control : 1; + uint32_t cluster_supported : 1; + uint32_t bbu : 1; + uint32_t spanning_allowed : 1; + uint32_t dedicated_hotspares : 1; + uint32_t revertible_hotspares : 1; + uint32_t foreign_config_import : 1; + uint32_t self_diagnostic : 1; + uint32_t reserved : 19; + } adapter_operations; + + struct { + uint32_t read_policy : 1; + uint32_t write_policy : 1; + uint32_t io_policy : 1; + uint32_t access_policy : 1; + uint32_t reserved : 28; + } ld_operations; + + struct { + uint8_t min; + uint8_t max; + uint8_t reserved[2]; + } stripe_size_operations; + + struct { + uint32_t force_online : 1; + uint32_t force_offline : 1; + uint32_t force_rebuild : 1; + uint32_t reserved : 29; + } pd_operations; + + struct { + uint32_t ctrl_supports_sas : 1; + uint32_t ctrl_supports_sata : 1; + uint32_t allow_mix_in_encl : 1; + uint32_t allow_mix_in_ld : 1; + uint32_t allow_sata_in_cluster : 1; + uint32_t reserved : 27; + } pd_mix_support; + + /* Include the controller properties (changeable items) */ + uint8_t reserved_2[12]; + struct drsas_ctrl_prop properties; + + uint8_t pad[0x800 - 0x640]; +}; + +/* + * ================================== + * MegaRAID SAS2.0 driver definitions + * 
================================== + */ +#define MRDRV_MAX_NUM_CMD 1024 + +#define MRDRV_MAX_PD_CHANNELS 2 +#define MRDRV_MAX_LD_CHANNELS 2 +#define MRDRV_MAX_CHANNELS (MRDRV_MAX_PD_CHANNELS + \ + MRDRV_MAX_LD_CHANNELS) +#define MRDRV_MAX_DEV_PER_CHANNEL 128 +#define MRDRV_DEFAULT_INIT_ID -1 +#define MRDRV_MAX_CMD_PER_LUN 1000 +#define MRDRV_MAX_LUN 1 +#define MRDRV_MAX_LD 64 + +#define MRDRV_RESET_WAIT_TIME 300 +#define MRDRV_RESET_NOTICE_INTERVAL 5 + +#define DRSAS_IOCTL_CMD 0 + +/* + * FW can accept both 32 and 64 bit SGLs. We want to allocate 32/64 bit + * SGLs based on the size of dma_addr_t + */ +#define IS_DMA64 (sizeof (dma_addr_t) == 8) + +#define IB_MSG_0_OFF 0x10 /* XScale */ +#define OB_MSG_0_OFF 0x18 /* XScale */ +#define IB_DOORBELL_OFF 0x20 /* XScale & ROC */ +#define OB_INTR_STATUS_OFF 0x30 /* XScale & ROC */ +#define OB_INTR_MASK_OFF 0x34 /* XScale & ROC */ +#define IB_QPORT_OFF 0x40 /* XScale & ROC */ +#define OB_DOORBELL_CLEAR_OFF 0xA0 /* ROC */ +#define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */ +#define OB_INTR_MASK 0xFFFFFFFF +#define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF + +/* + * All MFI register set macros accept drsas_register_set* + */ +#define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v)) + +#define RD_OB_MSG_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF)) + +#define WR_IB_DOORBELL(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v)) + +#define RD_IB_DOORBELL(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF)) + +#define WR_OB_INTR_STATUS(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v)) + +#define RD_OB_INTR_STATUS(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF)) + +#define WR_OB_INTR_MASK(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v)) + +#define RD_OB_INTR_MASK(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF)) + +#define WR_IB_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v)) + +#define WR_OB_DOORBELL_CLEAR(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \ + (v)) + +#define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF)) + +/* + * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data + * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs + * supported per cmd and if 64-bit MFAs (M64) is enabled or disabled. + */ +#define MFI_OB_INTR_STATUS_MASK 0x00000002 + +/* + * This MFI_REPLY_2108_MESSAGE_INTR flag is used also + * in enable_intr_ppc also. Hence bit 2, i.e. 0x4 has + * been set in this flag along with bit 1. 
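To show how the register macros above and the interrupt-status mask defined just below fit together, here is a minimal interrupt-acknowledge sketch. It is an illustration only; the driver's actual intr_ack_ppc() may differ in ordering and in which clear register it writes.

static uint_t
example_intr_ack(struct drsas_instance *instance)
{
	uint32_t status = RD_OB_INTR_STATUS(instance);

	if (!(status & MFI_REPLY_2108_MESSAGE_INTR_MASK))
		return (DDI_INTR_UNCLAIMED);	/* not our interrupt */

	/* clear the cause, then read back to flush the write */
	WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance);
	(void) RD_OB_SCRATCH_PAD_0(instance);

	return (DDI_INTR_CLAIMED);
}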
+ */ +#define MFI_REPLY_2108_MESSAGE_INTR 0x00000001 +#define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005 + +#define MFI_POLL_TIMEOUT_SECS 60 + +#define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1) +#define MFI_DISABLE_INTR(instance) \ +{ \ + uint32_t disable = 1; \ + uint32_t mask = ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\ + mask &= ~disable; \ + ddi_put32((instance)->regmap_handle, (uint32_t *) \ + (uintptr_t)((instance)->regmap + OB_INTR_MASK_OFF), mask); \ +} + +/* By default, the firmware programs for 8 Kbytes of memory */ +#define DEFAULT_MFI_MEM_SZ 8192 +#define MINIMUM_MFI_MEM_SZ 4096 + +/* DCMD Message Frame MAILBOX0-11 */ +#define DCMD_MBOX_SZ 12 + + +struct drsas_register_set { + uint32_t reserved_0[4]; + + uint32_t inbound_msg_0; + uint32_t inbound_msg_1; + uint32_t outbound_msg_0; + uint32_t outbound_msg_1; + + uint32_t inbound_doorbell; + uint32_t inbound_intr_status; + uint32_t inbound_intr_mask; + + uint32_t outbound_doorbell; + uint32_t outbound_intr_status; + uint32_t outbound_intr_mask; + + uint32_t reserved_1[2]; + + uint32_t inbound_queue_port; + uint32_t outbound_queue_port; + + uint32_t reserved_2[22]; + + uint32_t outbound_doorbell_clear; + + uint32_t reserved_3[3]; + + uint32_t outbound_scratch_pad; + + uint32_t reserved_4[3]; + + uint32_t inbound_low_queue_port; + + uint32_t inbound_high_queue_port; + + uint32_t reserved_5; + uint32_t index_registers[820]; +}; + +struct drsas_sge32 { + uint32_t phys_addr; + uint32_t length; +}; + +struct drsas_sge64 { + uint64_t phys_addr; + uint32_t length; +}; + +union drsas_sgl { + struct drsas_sge32 sge32[1]; + struct drsas_sge64 sge64[1]; +}; + +struct drsas_header { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xferlen; +}; + +union drsas_sgl_frame { + struct drsas_sge32 sge32[8]; + struct drsas_sge64 sge64[5]; +}; + +struct drsas_init_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t data_xfer_len; + + uint32_t queue_info_new_phys_addr_lo; + uint32_t queue_info_new_phys_addr_hi; + uint32_t queue_info_old_phys_addr_lo; + uint32_t queue_info_old_phys_addr_hi; + + uint32_t reserved_4[6]; +}; + +struct drsas_init_queue_info { + uint32_t init_flags; + uint32_t reply_queue_entries; + + uint32_t reply_queue_start_phys_addr_lo; + uint32_t reply_queue_start_phys_addr_hi; + uint32_t producer_index_phys_addr_lo; + uint32_t producer_index_phys_addr_hi; + uint32_t consumer_index_phys_addr_lo; + uint32_t consumer_index_phys_addr_hi; +}; + +struct drsas_io_frame { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t access_byte; + uint8_t reserved_0; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t lba_count; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint32_t start_lba_lo; + uint32_t start_lba_hi; + + union drsas_sgl sgl; +}; + +struct drsas_pthru_frame { + 
uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xfer_len; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint8_t cdb[16]; + union drsas_sgl sgl; +}; + +struct drsas_dcmd_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + uint8_t reserved_1[4]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + uint32_t opcode; + + union { + uint8_t b[DCMD_MBOX_SZ]; + uint16_t s[6]; + uint32_t w[3]; + } mbox; + + union drsas_sgl sgl; +}; + +struct drsas_abort_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t reserved_4; + + uint32_t abort_context; + uint32_t pad_1; + + uint32_t abort_mfi_phys_addr_lo; + uint32_t abort_mfi_phys_addr_hi; + + uint32_t reserved_5[6]; +}; + +struct drsas_smp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t reserved_2[3]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint64_t sas_addr; + + union drsas_sgl sgl[2]; +}; + +struct drsas_stp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t target_id; + uint8_t reserved_2[2]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint16_t fis[10]; + uint32_t stp_flags; + union drsas_sgl sgl; +}; + +union drsas_frame { + struct drsas_header hdr; + struct drsas_init_frame init; + struct drsas_io_frame io; + struct drsas_pthru_frame pthru; + struct drsas_dcmd_frame dcmd; + struct drsas_abort_frame abort; + struct drsas_smp_frame smp; + struct drsas_stp_frame stp; + + uint8_t raw_bytes[64]; +}; + +typedef struct drsas_pd_address { + uint16_t device_id; + uint16_t encl_id; + + union { + struct { + uint8_t encl_index; + uint8_t slot_number; + } pd_address; + struct { + uint8_t encl_position; + uint8_t encl_connector_index; + } encl_address; + }address; + + uint8_t scsi_dev_type; + + union { + uint8_t port_bitmap; + uint8_t port_numbers; + } connected; + + uint64_t sas_addr[2]; +} drsas_pd_address_t; + +union drsas_evt_class_locale { + struct { + uint16_t locale; + uint8_t reserved; + int8_t class; + } members; + + uint32_t word; +}; + +struct drsas_evt_log_info { + uint32_t newest_seq_num; + uint32_t oldest_seq_num; + uint32_t clear_seq_num; + uint32_t shutdown_seq_num; + uint32_t boot_seq_num; +}; + +struct drsas_progress { + uint16_t progress; + uint16_t elapsed_seconds; +}; + +struct drsas_evtarg_ld { + uint16_t target_id; + uint8_t ld_index; + uint8_t reserved; +}; + +struct drsas_evtarg_pd { + uint16_t device_id; + uint8_t encl_index; + uint8_t slot_number; +}; + +struct drsas_evt_detail { + uint32_t seq_num; + uint32_t time_stamp; + uint32_t code; + union drsas_evt_class_locale cl; + uint8_t arg_type; + uint8_t reserved1[15]; + + union { + struct { + struct drsas_evtarg_pd pd; + uint8_t 
cdb_length; + uint8_t sense_length; + uint8_t reserved[2]; + uint8_t cdb[16]; + uint8_t sense[64]; + } cdbSense; + + struct drsas_evtarg_ld ld; + + struct { + struct drsas_evtarg_ld ld; + uint64_t count; + } ld_count; + + struct { + uint64_t lba; + struct drsas_evtarg_ld ld; + } ld_lba; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prevOwner; + uint32_t newOwner; + } ld_owner; + + struct { + uint64_t ld_lba; + uint64_t pd_lba; + struct drsas_evtarg_ld ld; + struct drsas_evtarg_pd pd; + } ld_lba_pd_lba; + + struct { + struct drsas_evtarg_ld ld; + struct drsas_progress prog; + } ld_prog; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prev_state; + uint32_t new_state; + } ld_state; + + struct { + uint64_t strip; + struct drsas_evtarg_ld ld; + } ld_strip; + + struct drsas_evtarg_pd pd; + + struct { + struct drsas_evtarg_pd pd; + uint32_t err; + } pd_err; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + } pd_lba; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + struct drsas_evtarg_ld ld; + } pd_lba_ld; + + struct { + struct drsas_evtarg_pd pd; + struct drsas_progress prog; + } pd_prog; + + struct { + struct drsas_evtarg_pd pd; + uint32_t prevState; + uint32_t newState; + } pd_state; + + struct { + uint16_t vendorId; + uint16_t deviceId; + uint16_t subVendorId; + uint16_t subDeviceId; + } pci; + + uint32_t rate; + char str[96]; + + struct { + uint32_t rtc; + uint32_t elapsedSeconds; + } time; + + struct { + uint32_t ecar; + uint32_t elog; + char str[64]; + } ecc; + + drsas_pd_address_t pd_addr; + + uint8_t b[96]; + uint16_t s[48]; + uint32_t w[24]; + uint64_t d[12]; + } args; + + char description[128]; + +}; + +/* only 63 are usable by the application */ +#define MAX_LOGICAL_DRIVES 64 +/* only 255 physical devices may be used */ +#define MAX_PHYSICAL_DEVICES 256 +#define MAX_PD_PER_ENCLOSURE 64 +/* maximum disks per array */ +#define MAX_ROW_SIZE 32 +/* maximum spans per logical drive */ +#define MAX_SPAN_DEPTH 8 +/* maximum number of arrays a hot spare may be dedicated to */ +#define MAX_ARRAYS_DEDICATED 16 +/* maximum number of arrays which may exist */ +#define MAX_ARRAYS 128 +/* maximum number of foreign configs that may ha managed at once */ +#define MAX_FOREIGN_CONFIGS 8 +/* maximum spares (global and dedicated combined) */ +#define MAX_SPARES_FOR_THE_CONTROLLER MAX_PHYSICAL_DEVICES +/* maximum possible Target IDs (i.e. 
0 to 63) */ +#define MAX_TARGET_ID 63 +/* maximum number of supported enclosures */ +#define MAX_ENCLOSURES 32 +/* maximum number of PHYs per controller */ +#define MAX_PHYS_PER_CONTROLLER 16 +/* maximum number of LDs per array (due to DDF limitations) */ +#define MAX_LDS_PER_ARRAY 16 + +/* + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + * + * Logical Drive commands + * + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + */ +#define DR_DCMD_LD 0x03000000, /* Logical Device (LD) opcodes */ + +/* + * Input: dcmd.opcode - DR_DCMD_LD_GET_LIST + * dcmd.mbox - reserved + * dcmd.sge IN - ptr to returned DR_LD_LIST structure + * Desc: Return the logical drive list structure + * Status: No error + */ + +/* + * defines the logical drive reference structure + */ +typedef union _DR_LD_REF { /* LD reference structure */ + struct { + uint8_t targetId; /* LD target id (0 to MAX_TARGET_ID) */ + uint8_t reserved; /* reserved for in line with DR_PD_REF */ + uint16_t seqNum; /* Sequence Number */ + } ld_ref; + uint32_t ref; /* shorthand reference to full 32-bits */ +} DR_LD_REF; /* 4 bytes */ + +/* + * defines the logical drive list structure + */ +typedef struct _DR_LD_LIST { + uint32_t ldCount; /* number of LDs */ + uint32_t reserved; /* pad to 8-byte boundary */ + struct { + DR_LD_REF ref; /* LD reference */ + uint8_t state; /* current LD state (DR_LD_STATE) */ + uint8_t reserved[3]; /* pad to 8-byte boundary */ + uint64_t size; /* LD size */ + } ldList[MAX_LOGICAL_DRIVES]; +} DR_LD_LIST; + +struct drsas_drv_ver { + uint8_t signature[12]; + uint8_t os_name[16]; + uint8_t os_ver[12]; + uint8_t drv_name[20]; + uint8_t drv_ver[32]; + uint8_t drv_rel_date[20]; +}; + +#define PCI_TYPE0_ADDRESSES 6 +#define PCI_TYPE1_ADDRESSES 2 +#define PCI_TYPE2_ADDRESSES 5 + +struct drsas_pci_common_header { + uint16_t vendorID; /* (ro) */ + uint16_t deviceID; /* (ro) */ + uint16_t command; /* Device control */ + uint16_t status; + uint8_t revisionID; /* (ro) */ + uint8_t progIf; /* (ro) */ + uint8_t subClass; /* (ro) */ + uint8_t baseClass; /* (ro) */ + uint8_t cacheLineSize; /* (ro+) */ + uint8_t latencyTimer; /* (ro+) */ + uint8_t headerType; /* (ro) */ + uint8_t bist; /* Built in self test */ + + union { + struct { + uint32_t baseAddresses[PCI_TYPE0_ADDRESSES]; + uint32_t cis; + uint16_t subVendorID; + uint16_t subSystemID; + uint32_t romBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t reserved2; + uint8_t interruptLine; + uint8_t interruptPin; /* (ro) */ + uint8_t minimumGrant; /* (ro) */ + uint8_t maximumLatency; /* (ro) */ + } type_0; + + struct { + uint32_t baseAddresses[PCI_TYPE1_ADDRESSES]; + uint8_t primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + uint8_t ioBase; + uint8_t ioLimit; + uint16_t secondaryStatus; + uint16_t memoryBase; + uint16_t memoryLimit; + uint16_t prefetchBase; + uint16_t prefetchLimit; + uint32_t prefetchBaseUpper32; + uint32_t prefetchLimitUpper32; + uint16_t ioBaseUpper16; + uint16_t ioLimitUpper16; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t romBaseAddress; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_1; + + struct { + uint32_t socketRegistersBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved; + uint16_t secondaryStatus; + uint8_t 
primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + struct { + uint32_t base; + uint32_t limit; + } range[PCI_TYPE2_ADDRESSES-1]; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_2; + } header; +}; + +struct drsas_pci_link_capability { + union { + struct { + uint32_t linkSpeed :4; + uint32_t linkWidth :6; + uint32_t aspmSupport :2; + uint32_t losExitLatency :3; + uint32_t l1ExitLatency :3; + uint32_t rsvdp :6; + uint32_t portNumber :8; + } bits; + + uint32_t asUlong; + } cap; + +}; + +struct drsas_pci_link_status_capability { + union { + struct { + uint16_t linkSpeed :4; + uint16_t negotiatedLinkWidth :6; + uint16_t linkTrainingError :1; + uint16_t linkTraning :1; + uint16_t slotClockConfig :1; + uint16_t rsvdZ :3; + } bits; + + uint16_t asUshort; + } stat_cap; + + uint16_t reserved; + +}; + +struct drsas_pci_capabilities { + struct drsas_pci_link_capability linkCapability; + struct drsas_pci_link_status_capability linkStatusCapability; +}; + +struct drsas_pci_information +{ + uint32_t busNumber; + uint8_t deviceNumber; + uint8_t functionNumber; + uint8_t interruptVector; + uint8_t reserved; + struct drsas_pci_common_header pciHeaderInfo; + struct drsas_pci_capabilities capability; + uint8_t reserved2[32]; +}; + +struct drsas_ioctl { + uint16_t version; + uint16_t controller_id; + uint8_t signature[8]; + uint32_t reserved_1; + uint32_t control_code; + uint32_t reserved_2[2]; + uint8_t frame[64]; + union drsas_sgl_frame sgl_frame; + uint8_t sense_buff[DRSAS_MAX_SENSE_LENGTH]; + uint8_t data[1]; +}; + +struct drsas_aen { + uint16_t host_no; + uint16_t cmd_status; + uint32_t seq_num; + uint32_t class_locale_word; +}; +#pragma pack() + +#ifndef DDI_VENDOR_LSI +#define DDI_VENDOR_LSI "LSI" +#endif /* DDI_VENDOR_LSI */ + +static int drsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int drsas_attach(dev_info_t *, ddi_attach_cmd_t); +static int drsas_reset(dev_info_t *, ddi_reset_cmd_t); +static int drsas_detach(dev_info_t *, ddi_detach_cmd_t); +static int drsas_open(dev_t *, int, int, cred_t *); +static int drsas_close(dev_t, int, int, cred_t *); +static int drsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +static int drsas_tran_tgt_init(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register + struct scsi_pkt *, struct buf *, int, int, int, int, + int (*)(), caddr_t); +static int drsas_tran_start(struct scsi_address *, + register struct scsi_pkt *); +static int drsas_tran_abort(struct scsi_address *, struct scsi_pkt *); +static int drsas_tran_reset(struct scsi_address *, int); +static int drsas_tran_getcap(struct scsi_address *, char *, int); +static int drsas_tran_setcap(struct scsi_address *, char *, int, int); +static void drsas_tran_destroy_pkt(struct scsi_address *, + struct scsi_pkt *); +static void drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); +static void drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); +static uint_t drsas_isr(); +static uint_t drsas_softintr(); + +static int init_mfi(struct drsas_instance *); +static int drsas_free_dma_obj(struct drsas_instance *, dma_obj_t); +static int drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *, + uchar_t); +static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *); +static void return_mfi_pkt(struct drsas_instance *, + struct drsas_cmd *); + +static void free_space_for_mfi(struct drsas_instance *); +static 
void free_additional_dma_buffer(struct drsas_instance *); +static int alloc_additional_dma_buffer(struct drsas_instance *); +static int read_fw_status_reg_ppc(struct drsas_instance *); +static void issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *); +static int issue_cmd_in_poll_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static int issue_cmd_in_sync_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static void enable_intr_ppc(struct drsas_instance *); +static void disable_intr_ppc(struct drsas_instance *); +static int intr_ack_ppc(struct drsas_instance *); +static int mfi_state_transition_to_ready(struct drsas_instance *); +static void destroy_mfi_frame_pool(struct drsas_instance *); +static int create_mfi_frame_pool(struct drsas_instance *); +static int drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *, + struct buf *, int, int (*)()); +static int drsas_dma_move(struct drsas_instance *, + struct scsi_pkt *, struct buf *); +static void flush_cache(struct drsas_instance *instance); +static void display_scsi_inquiry(caddr_t); +static int start_mfi_aen(struct drsas_instance *instance); +static int handle_drv_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_aen(struct drsas_instance *instance, + struct drsas_aen *aen); +static void fill_up_drv_ver(struct drsas_drv_ver *dv); +static struct drsas_cmd *build_cmd(struct drsas_instance *instance, + struct scsi_address *ap, struct scsi_pkt *pkt, + uchar_t *cmd_done); +static int register_mfi_aen(struct drsas_instance *instance, + uint32_t seq_num, uint32_t class_locale_word); +static int issue_mfi_pthru(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_dcmd(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_smp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_stp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort); + +static int drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd); +static void drsas_fm_init(struct drsas_instance *instance); +static void drsas_fm_fini(struct drsas_instance *instance); +static int drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *, + const void *); +static void drsas_fm_ereport(struct drsas_instance *instance, + char *detail); +static int drsas_check_dma_handle(ddi_dma_handle_t handle); +static int drsas_check_acc_handle(ddi_acc_handle_t handle); + +static void drsas_rem_intrs(struct drsas_instance *instance); +static int drsas_add_intrs(struct drsas_instance *instance, int intr_type); + +static void drsas_tran_tgt_free(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static int drsas_tran_bus_config(dev_info_t *, uint_t, + ddi_bus_config_op_t, void *, dev_info_t **); +static int drsas_parse_devname(char *, int *, int *); +static int drsas_config_all_devices(struct drsas_instance *); +static int drsas_config_scsi_device(struct drsas_instance *, + struct scsi_device *, dev_info_t **); +static int drsas_config_ld(struct drsas_instance *, uint16_t, + uint8_t, dev_info_t **); +static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t, + 
uint8_t); +static int drsas_name_node(dev_info_t *, char *, int); +static void drsas_issue_evt_taskq(struct drsas_eventinfo *); +static int drsas_service_evt(struct drsas_instance *, int, int, int, + uint64_t); +static int drsas_mode_sense_build(struct scsi_pkt *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_H_ */ diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h new file mode 100644 index 0000000000..4154a77796 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h @@ -0,0 +1,212 @@ +/* + * dr_sas_list.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_LIST_H_ +#define _DR_SAS_LIST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct mlist_head { + struct mlist_head *next, *prev; +}; + +typedef struct mlist_head mlist_t; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct mlist_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} + + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
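The helpers defined below are used by embedding an mlist_t in a structure and recovering the container with mlist_entry(). A small, self-contained usage sketch; the example_* names are hypothetical and mirror how the driver strings commands onto its pool lists.

struct example_cmd {
	int	index;
	mlist_t	list;			/* linkage into a pool */
};

static LIST_HEAD(example_pool);		/* empty head points at itself */

static void
example_put(struct example_cmd *cmd)
{
	mlist_add_tail(&cmd->list, &example_pool);
}

static struct example_cmd *
example_get(void)
{
	struct example_cmd *cmd;

	if (mlist_empty(&example_pool))
		return (NULL);

	cmd = mlist_entry(example_pool.next, struct example_cmd, list);
	mlist_del_init(&cmd->list);
	return (cmd);
}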
+ */ +static void __list_add(struct mlist_head *new, + struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + + +/* + * mlist_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static void mlist_add(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head, head->next); +} + + +/* + * mlist_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head->prev, head); +} + + + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static void __list_del(struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = prev; + prev->next = next; +} + + +/* + * mlist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static void mlist_del_init(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + + +/* + * mlist_empty - tests whether a list is empty + * @head: the list to test. + */ +static int mlist_empty(struct mlist_head *head) +{ + return (head->next == head); +} + + +/* + * mlist_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static void mlist_splice(struct mlist_head *list, struct mlist_head *head) +{ + struct mlist_head *first = list->next; + + if (first != list) { + struct mlist_head *last = list->prev; + struct mlist_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; + } +} + + +/* + * mlist_entry - get the struct for this entry + * @ptr: the &struct mlist_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define mlist_entry(ptr, type, member) \ + ((type *)((size_t)(ptr) - offsetof(type, member))) + + +/* + * mlist_for_each - iterate over a list + * @pos: the &struct mlist_head to use as a loop counter. + * @head: the head for your list. + */ +#define mlist_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + + +/* + * mlist_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct mlist_head to use as a loop counter. + * @n: another &struct mlist_head to use as temporary storage + * @head: the head for your list. + */ +#define mlist_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_LIST_H_ */ diff --git a/usr/src/uts/common/io/elxl/elxl.c b/usr/src/uts/common/io/elxl/elxl.c index 2ffe96aff3..42552225f8 100644 --- a/usr/src/uts/common/io/elxl/elxl.c +++ b/usr/src/uts/common/io/elxl/elxl.c @@ -1,6 +1,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
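As a brief illustration of how these macros compose, the following hedged sketch shows a structure embedding an mlist_t plus enqueue and drain helpers; the drsas_node names are invented for illustration only, and the sketch assumes the usual kernel definition of offsetof() that the header itself relies on.

struct drsas_node {
	int			dn_id;
	struct mlist_head	dn_link;	/* linkage into the list below */
};

static struct mlist_head drsas_node_list = LIST_HEAD_INIT(drsas_node_list);

/* Append a node at the tail; mlist_add_tail() yields FIFO ordering. */
static void
drsas_node_enqueue(struct drsas_node *np)
{
	mlist_add_tail(&np->dn_link, &drsas_node_list);
}

/*
 * Walk and empty the list. The _safe variant caches the next pointer,
 * so entries may be unlinked while iterating.
 */
static void
drsas_node_drain(void (*cb)(struct drsas_node *))
{
	struct mlist_head *pos, *n;

	mlist_for_each_safe(pos, n, &drsas_node_list) {
		struct drsas_node *np =
		    mlist_entry(pos, struct drsas_node, dn_link);

		mlist_del_init(&np->dn_link);
		cb(np);
	}
}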
*/ /* @@ -1163,8 +1164,7 @@ elxl_m_tx(void *arg, mblk_t *mp) cflags = 0; if ((sc->ex_conf & CONF_90XB) != 0) { uint32_t pflags; - hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, - &pflags); + mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags); if (pflags & HCK_IPV4_HDRCKSUM) { cflags |= EX_DPD_IPCKSUM; } @@ -1327,7 +1327,7 @@ elxl_recv(elxl_t *sc, ex_desc_t *rxd, uint32_t stat) if (stat & (EX_UPD_TCPCHECKED | EX_UPD_UDPCHECKED)) { pflags |= (HCK_FULLCKSUM | HCK_FULLCKSUM_OK); } - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, pflags, 0); + mac_hcksum_set(mp, 0, 0, 0, 0, pflags); } return (mp); diff --git a/usr/src/uts/common/io/eventfd.c b/usr/src/uts/common/io/eventfd.c index 9b0840aa8b..e26cdfc78f 100644 --- a/usr/src/uts/common/io/eventfd.c +++ b/usr/src/uts/common/io/eventfd.c @@ -141,37 +141,39 @@ eventfd_read(dev_t dev, uio_t *uio, cred_t *cr) * transitions from EVENTFD_VALMAX to a lower value. At all other * times, it is already considered writable by poll. */ - if (oval == EVENTFD_VALMAX) { + if (oval >= EVENTFD_VALMAX) { pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT); } return (err); } -/*ARGSUSED*/ static int -eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) +eventfd_post(eventfd_state_t *state, uint64_t val, boolean_t is_async, + boolean_t file_nonblock) { - eventfd_state_t *state; - minor_t minor = getminor(dev); - uint64_t val, oval; - int err; - - if (uio->uio_resid < sizeof (val)) - return (EINVAL); - - if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) - return (err); - - if (val > EVENTFD_VALMAX) - return (EINVAL); - - state = ddi_get_soft_state(eventfd_softstate, minor); + uint64_t oval; + boolean_t overflow = B_FALSE; mutex_enter(&state->efd_lock); while (val > EVENTFD_VALMAX - state->efd_value) { - if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + + /* + * When called from (LX) AIO, expectations about overflow and + * blocking are different than normal operation. If the + * incoming value would cause overflow, it is clamped to reach + * the overflow value exactly. This is added to the existing + * value without blocking. Any pollers of the eventfd will see + * POLLERR asserted when this occurs. + */ + if (is_async) { + val = EVENTFD_VALOVERFLOW - state->efd_value; + overflow = B_TRUE; + break; + } + + if (file_nonblock) { mutex_exit(&state->efd_lock); return (EAGAIN); } @@ -186,7 +188,7 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) } /* - * We now know that we can add the value without overflowing. + * We now know that we can safely add the value. */ state->efd_value = (oval = state->efd_value) + val; @@ -200,10 +202,13 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) mutex_exit(&state->efd_lock); /* - * Notify pollers as well if the eventfd is now readable. + * Notify pollers as well if the eventfd has become readable or has + * transitioned into overflow. 
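The value semantics of eventfd_post() above can be summarized with a small user-level sketch. SKETCH_VALMAX and SKETCH_VALOVERFLOW are assumed stand-ins for the driver's EVENTFD_VALMAX and EVENTFD_VALOVERFLOW, and the helper only models the clamp-to-overflow path taken for async (AIO) posts versus the would-block answer given to ordinary writers; it is not the driver's implementation.

#include <stdint.h>
#include <stdbool.h>

#define	SKETCH_VALMAX		(UINT64_MAX - 1)	/* assumed stand-in */
#define	SKETCH_VALOVERFLOW	UINT64_MAX		/* assumed stand-in */

typedef struct sketch_state {
	uint64_t	value;		/* accumulated counter */
	bool		overflowed;	/* pollers should see POLLERR */
} sketch_state_t;

/*
 * Apply a post of 'val'. Async posts never block: if the addition would
 * overflow, the value is clamped so the counter lands exactly on the
 * overflow sentinel. Synchronous posts simply report "would block".
 */
static int
sketch_post(sketch_state_t *sp, uint64_t val, bool is_async)
{
	if (val > SKETCH_VALMAX - sp->value) {
		if (!is_async)
			return (-1);	/* caller blocks or returns EAGAIN */
		val = SKETCH_VALOVERFLOW - sp->value;
		sp->overflowed = true;
	}
	sp->value += val;
	return (0);
}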
*/ if (oval == 0) { pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN); + } else if (overflow && val != 0) { + pollwakeup(&state->efd_pollhd, POLLERR); } return (0); @@ -211,6 +216,29 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) /*ARGSUSED*/ static int +eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) +{ + eventfd_state_t *state; + boolean_t file_nonblock; + uint64_t val; + int err; + + if (uio->uio_resid < sizeof (val)) + return (EINVAL); + + if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) + return (err); + + if (val > EVENTFD_VALMAX) + return (EINVAL); + + file_nonblock = (uio->uio_fmode & (FNDELAY|FNONBLOCK)) != 0; + state = ddi_get_soft_state(eventfd_softstate, getminor(dev)); + return (eventfd_post(state, val, B_FALSE, file_nonblock)); +} + +/*ARGSUSED*/ +static int eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { @@ -228,6 +256,9 @@ eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, if (state->efd_value < EVENTFD_VALMAX) revents |= POLLWRNORM | POLLOUT; + if (state->efd_value == EVENTFD_VALOVERFLOW) + revents |= POLLERR; + *reventsp = revents & events; if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { *phpp = &state->efd_pollhd; @@ -244,17 +275,28 @@ eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) { eventfd_state_t *state; minor_t minor = getminor(dev); + uint64_t *valp; state = ddi_get_soft_state(eventfd_softstate, minor); switch (cmd) { - case EVENTFDIOC_SEMAPHORE: { + case EVENTFDIOC_SEMAPHORE: mutex_enter(&state->efd_lock); state->efd_semaphore ^= 1; mutex_exit(&state->efd_lock); + return (0); + case EVENTFDIOC_POST: + /* + * This ioctl is expected to be kernel-internal, used only by + * the AIO emulation in LX. + */ + if ((md & FKIOCTL) == 0) { + break; + } + valp = (uint64_t *)arg; + VERIFY(eventfd_post(state, *valp, B_TRUE, B_FALSE) == 0); return (0); - } default: break; diff --git a/usr/src/uts/common/io/fibre-channel/impl/fctl.c b/usr/src/uts/common/io/fibre-channel/impl/fctl.c index 4c2a39013a..eb2a0c2ec5 100644 --- a/usr/src/uts/common/io/fibre-channel/impl/fctl.c +++ b/usr/src/uts/common/io/fibre-channel/impl/fctl.c @@ -24,6 +24,7 @@ */ /* * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ /* * Fibre channel Transport Library (fctl) @@ -5500,6 +5501,11 @@ fc_ulp_get_adapter_paths(char *pathList, int count) maxPorts ++; } + if (maxPorts == 0) { + mutex_exit(&fctl_port_lock); + return (0); + } + /* Now allocate a buffer to store all the pointers for comparisons */ portList = kmem_zalloc(sizeof (fc_local_port_t *) * maxPorts, KM_SLEEP); diff --git a/usr/src/uts/common/io/gld.c b/usr/src/uts/common/io/gld.c index c6c6b65900..5502ea54af 100644 --- a/usr/src/uts/common/io/gld.c +++ b/usr/src/uts/common/io/gld.c @@ -22,6 +22,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. 
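Continuing the eventfd example, the poll state above is derived purely from the counter value; a minimal sketch, again using assumed stand-ins for EVENTFD_VALMAX and EVENTFD_VALOVERFLOW rather than the driver's real constants:

#include <poll.h>
#include <stdint.h>

#define	SKETCH_VALMAX		(UINT64_MAX - 1)	/* assumed stand-in */
#define	SKETCH_VALOVERFLOW	UINT64_MAX		/* assumed stand-in */

/* Mirrors the revents computation in eventfd_poll() above. */
static short
sketch_revents(uint64_t value)
{
	short revents = 0;

	if (value > 0)
		revents |= POLLRDNORM | POLLIN;		/* read won't block */
	if (value < SKETCH_VALMAX)
		revents |= POLLWRNORM | POLLOUT;	/* write can be absorbed */
	if (value == SKETCH_VALOVERFLOW)
		revents |= POLLERR;			/* async post overflowed */
	return (revents);
}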
*/ /* @@ -4550,8 +4551,7 @@ gld_unitdata(queue_t *q, mblk_t *mp) ifp = ((gld_mac_pvt_t *)macinfo->gldm_mac_pvt)->interfacep; /* grab any checksum information that may be present */ - hcksum_retrieve(mp->b_cont, NULL, NULL, &start, &stuff, &end, - &value, &flags); + mac_hcksum_get(mp->b_cont, &start, &stuff, &end, &value, &flags); /* * Prepend a valid header for transmission @@ -4567,8 +4567,7 @@ gld_unitdata(queue_t *q, mblk_t *mp) } /* apply any checksum information to the first block in the chain */ - (void) hcksum_assoc(nmp, NULL, NULL, start, stuff, end, value, - flags, 0); + mac_hcksum_set(nmp, start, stuff, end, value, flags); GLD_CLEAR_MBLK_VTAG(nmp); if (gld_start(q, nmp, GLD_WSRV, upri) == GLD_NORESOURCES) { diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c new file mode 100644 index 0000000000..03bb799499 --- /dev/null +++ b/usr/src/uts/common/io/gsqueue/gsqueue.c @@ -0,0 +1,608 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Serialization queues are a technique used in illumos to provide what's + * commonly known as a 'vertical' perimeter. The idea (described a bit in + * uts/common/inet/squeue.c) is to provide a means to make sure that message + * blocks (mblk_t) are processed in a specific order. Subsystems like ip and vnd + * consume these on different policies, ip on a conn_t basis, vnd on a per + * device basis, and use this to ensure that only one packet is being processed + * at a given time. + * + * Serialization queues were originally used by ip. As part of that + * implementation, many of the details of ip were baked into it. That includes + * things like conn_t, ip receive attributes, and the notion of sets. While an + * individual serialization queue, or gsqueue_t, is a useful level of + * abstraction, it isn't the basis on which most consumers want to manage them. + * Instead, we have the notion of a set of serialization queues. These sets are + * DR (CPU Dynamic reconfiguration) aware, and allow consumers to have a + * gsqueue_t per CPU to fanout on without managing them all themselves. In the + * original implementation, this existed, but they were heavily tied into the + * infrastructure of IP, and its notion of polling on the underlying MAC + * devices. + * + * The result of that past is a new interface to serialization queues and a + * similar, but slightly different, abstraction to sets of these + * (gsqueue_set_t). When designing this there are two different approaches that + * one could consider. The first is that the system has one gsqueue_set_t that + * the entire world shares, whether IP or some other consumer. The other is that + * every consumer has their own set. + * + * The trade-offs between these two approaches lie in their pathological failure + * modes. There is no guarantee that any two consumers here are equivalent. In + * fact, they very likely have very different latency profiles. If they are + * being processed in the same queue, that can lead to very odd behaviors.
More + * generally, if we have a series of processing functions from one consumer + * which are generally short, and another which are generally long, that'll + * cause undue latency that's harder to observe. If we instead take the approach + * that each consumer should have its own set that it fans out over then we + * won't end up with the problem that a given serialization queue will have + * multiple latency profiles, but instead we'll see cpu contention for the bound + * gsqueue_t worker thread. Keep in mind though, that only the gsqueue_t worker + * thread is bound and it is in fact possible for it to be processed by other + * threads on other CPUs. + * + * We've opted to go down the second path, so each consumer has its own + * independent set of serialization queues that it is bound over. + * + * Structure Hierarchies + * --------------------- + * + * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t + * encapsulates all the per-CPU gsqueue_t that exist in the form of + * gsqueue_cpu_t. The gsqueue_cpu_t has been designed such that it could + * accommodate more than one gsqueue_t, but today there is a one to one mapping. + * + * We maintain two different lists of gsqueue_cpu_t, the active and defunct + * sets. The active set is maintained in the array `gs_cpus`. There are NCPU + * entries available in `gs_cpus` with the total number of currently active cpus + * described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant. When + * there is no longer a need for a given binding (see the following section for + * more explanation on when this is the case) then we move the entry to the + * `gs_defunct` list which is just a list_t of gsqueue_cpu_t. + * + * In addition, each gsqueue_set_t can have a series of callbacks registered + * with it. These are described in the following section. Graphically, a given + * gsqueue_set_t looks roughly like the following: + * + * +---------------+ + * | gsqueue_set_t | + * +---------------+ + * | | | + * | | * . . . gs_cpus + * | | | + * | | | +-------------------------------------------------+ + * | | +--->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |... + * | | +-------------------------------------------------+ + * | | + * | * . . . gs_defunct + * | | + * | | +---------------+ +---------------+ +---------------+ + * | +--->| gsqueue_cpu_t |-->| gsqueue_cpu_t |-->| gsqueue_cpu_t |... + * | +---------------+ +---------------+ +---------------+ + * * . . . gs_cbs + * | + * | +--------------+ +--------------+ +--------------+ + * +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |... + * +--------------+ +--------------+ +--------------+ + * + * CPU DR, gsqueue_t, and gsqueue_t + * -------------------------------- + * + * Recall, that every serialization queue (gsqueue_t or squeue_t) has a worker + * thread that may end up doing work. As part of supporting fanout, we have one + * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of + * this binding, we need to deal with CPU DR changes. + * + * The gsqueue driver maintains a single CPU DR callback that is used for the + * entire sub-system. We break down CPU DR events into three groups. Offline + * events, online events, and events we can ignore. When the first group occurs, + * we need to go through every gsqueue_t, find the gsqueue_cpu_t that + * corresponds to that processor id, and unbind all of its gsqueue_t's. It's + * rather important that we only unbind the gsqueue_t's and not actually destroy + * them. 
When this happens, they could very easily have data queued inside of + * them and it's unreasonable to just throw out everything in them at this + * point. The data remains intact and service continues uinterrupted. + * + * When we receive an online event, we do the opposite. We try to find a + * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid + * field intact) in the defunct list. If we find one, we remove it from the + * defunct list and add it to the active list as well as binding the gsqueue_t + * to the CPU in question. If we don't find one, then we create a new one. + * + * To deal with these kinds of situations, we allow a consumer to register + * callbacks for the gsqueue_t that they are interested in. These callbacks will + * fire whenever we are handling a topology change. The design of the callbacks + * is not that the user can take any administrative action during them, but + * rather set something for them to do asynchronously. It is illegal to make any + * calls into the gsqueue system while you are in a callback. + * + * Locking + * ------- + * + * The lock ordering here is fairly straightforward. Due to our use of CPU + * binding and the CPU DR callbacks, we have an additional lock to consider + * cpu_lock. Because of that, the following are the rules for locking: + * + * + * o If performing binding operations, you must grab cpu_lock. cpu_lock is + * also at the top of the order. + * + * o cpu_lock > gsqueue_lock > gsqueue_t`gs_lock > squeue_t`sq_lock + * If you need to take multiple locks, you must take the greatest + * (left-most) one first. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/stream.h> +#include <sys/modctl.h> +#include <sys/cpuvar.h> +#include <sys/list.h> +#include <sys/sysmacros.h> + +#include <sys/gsqueue.h> +#include <sys/squeue_impl.h> + +typedef struct gsqueue_cb { + struct gsqueue_cb *gcb_next; + gsqueue_cb_f gcb_func; + void *gcb_arg; +} gsqueue_cb_t; + +typedef struct gsqueue_cpu { + list_node_t gqc_lnode; + squeue_t *gqc_head; + processorid_t gqc_cpuid; +} gsqueue_cpu_t; + +struct gsqueue_set { + list_node_t gs_next; + pri_t gs_wpri; + kmutex_t gs_lock; + int gs_ncpus; + gsqueue_cpu_t **gs_cpus; + list_t gs_defunct; + gsqueue_cb_t *gs_cbs; +}; + +static kmutex_t gsqueue_lock; +static list_t gsqueue_list; +static kmem_cache_t *gsqueue_cb_cache; +static kmem_cache_t *gsqueue_cpu_cache; +static kmem_cache_t *gsqueue_set_cache; + +static gsqueue_cpu_t * +gsqueue_cpu_create(pri_t wpri, processorid_t cpuid) +{ + gsqueue_cpu_t *scp; + + scp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP); + + list_link_init(&scp->gqc_lnode); + scp->gqc_cpuid = cpuid; + scp->gqc_head = squeue_create(wpri, B_FALSE); + scp->gqc_head->sq_state = SQS_DEFAULT; + squeue_bind(scp->gqc_head, cpuid); + + return (scp); +} + +static void +gsqueue_cpu_destroy(gsqueue_cpu_t *scp) +{ + squeue_destroy(scp->gqc_head); + kmem_cache_free(gsqueue_cpu_cache, scp); +} + +gsqueue_set_t * +gsqueue_set_create(pri_t wpri) +{ + int i; + gsqueue_set_t *gssp; + + gssp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP); + gssp->gs_wpri = wpri; + gssp->gs_ncpus = 0; + + /* + * We're grabbing CPU lock. Once we let go of it we have to ensure all + * set up of the gsqueue_set_t is complete, as it'll be in there for the + * various CPU DR bits. 
+ */ + mutex_enter(&cpu_lock); + + for (i = 0; i < NCPU; i++) { + gsqueue_cpu_t *scp; + cpu_t *cp = cpu_get(i); + if (cp != NULL && CPU_ACTIVE(cp) && + cp->cpu_flags & CPU_EXISTS) { + scp = gsqueue_cpu_create(wpri, cp->cpu_id); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + } + } + + /* Finally we can add it to our global list and be done */ + mutex_enter(&gsqueue_lock); + list_insert_tail(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + mutex_exit(&cpu_lock); + + return (gssp); +} + +void +gsqueue_set_destroy(gsqueue_set_t *gssp) +{ + int i; + gsqueue_cpu_t *scp; + + /* + * Go through and unbind all of the squeues while cpu_lock is held and + * move them to the defunct list. Once that's done, we don't need to do + * anything else with cpu_lock. + */ + mutex_enter(&cpu_lock); + mutex_enter(&gsqueue_lock); + list_remove(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + + mutex_enter(&gssp->gs_lock); + + for (i = 0; i < gssp->gs_ncpus; i++) { + scp = gssp->gs_cpus[i]; + squeue_unbind(scp->gqc_head); + list_insert_tail(&gssp->gs_defunct, scp); + gssp->gs_cpus[i] = NULL; + } + gssp->gs_ncpus = 0; + + mutex_exit(&gssp->gs_lock); + mutex_exit(&cpu_lock); + + while ((scp = list_remove_head(&gssp->gs_defunct)) != NULL) { + gsqueue_cpu_destroy(scp); + } + + while (gssp->gs_cbs != NULL) { + gsqueue_cb_t *cbp; + + cbp = gssp->gs_cbs; + gssp->gs_cbs = cbp->gcb_next; + kmem_cache_free(gsqueue_cb_cache, cbp); + } + + ASSERT3U(gssp->gs_ncpus, ==, 0); + ASSERT3P(list_head(&gssp->gs_defunct), ==, NULL); + ASSERT3P(gssp->gs_cbs, ==, NULL); + kmem_cache_free(gsqueue_set_cache, gssp); +} + +gsqueue_t * +gsqueue_set_get(gsqueue_set_t *gssp, uint_t index) +{ + squeue_t *sqp; + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + scp = gssp->gs_cpus[index % gssp->gs_ncpus]; + sqp = scp->gqc_head; + mutex_exit(&gssp->gs_lock); + return ((gsqueue_t *)sqp); +} + +uintptr_t +gsqueue_set_cb_add(gsqueue_set_t *gssp, gsqueue_cb_f cb, void *arg) +{ + gsqueue_cb_t *cbp; + + cbp = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP); + cbp->gcb_func = cb; + cbp->gcb_arg = arg; + + mutex_enter(&gssp->gs_lock); + cbp->gcb_next = gssp->gs_cbs; + gssp->gs_cbs = cbp; + mutex_exit(&gssp->gs_lock); + return ((uintptr_t)cbp); +} + +int +gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id) +{ + gsqueue_cb_t *cbp, *prev; + mutex_enter(&gssp->gs_lock); + cbp = gssp->gs_cbs; + prev = NULL; + while (cbp != NULL) { + if ((uintptr_t)cbp != id) { + prev = cbp; + cbp = cbp->gcb_next; + continue; + } + + if (prev == NULL) { + gssp->gs_cbs = cbp->gcb_next; + } else { + prev->gcb_next = cbp->gcb_next; + } + + mutex_exit(&gssp->gs_lock); + kmem_cache_free(gsqueue_cb_cache, cbp); + return (0); + } + mutex_exit(&gssp->gs_lock); + return (-1); +} + +void +gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg, + int flags, uint8_t tag) +{ + squeue_t *sqp = (squeue_t *)gsp; + + ASSERT(mp->b_next == NULL); + ASSERT(mp->b_prev == NULL); + mp->b_queue = (queue_t *)func; + mp->b_prev = arg; + sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag); +} + +static void +gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online) +{ + gsqueue_cb_t *cbp; + + ASSERT(MUTEX_HELD(&gssp->gs_lock)); + cbp = gssp->gs_cbs; + while (cbp != NULL) { + cbp->gcb_func(gssp, (gsqueue_t *)sqp, cbp->gcb_arg, online); + cbp = cbp->gcb_next; + } + +} + +/* + * When we online a processor we need to go through and either bind a defunct + * squeue or create a new one. 
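Putting the pieces above together, here is a hedged consumer-side sketch: create a set, register a callback that merely records that the topology changed (since calling back into gsqueue from a callback is illegal), and fan incoming packets out by a flow hash. All my_* names are invented, and the gsqueue_proc_f signature and GSQUEUE_FILL flag are assumed to match what <sys/gsqueue.h> exposes to existing consumers such as vnd.

typedef struct my_consumer {
	gsqueue_set_t	*mc_set;
	volatile uint_t	mc_topo_changed;	/* examined asynchronously */
} my_consumer_t;

/*
 * Topology callback: only take note of the change; act on it later,
 * outside of the callback, per the rules above.
 */
static void
my_topo_cb(gsqueue_set_t *gssp, gsqueue_t *gsp, void *arg, boolean_t online)
{
	my_consumer_t *mc = arg;

	mc->mc_topo_changed = 1;
}

/* Packet handler; runs with the queue's serialization guarantee. */
static void
my_proc(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy)
{
	freemsg(mp);	/* only one mblk for this flow is in here at a time */
}

static void
my_consumer_init(my_consumer_t *mc, pri_t pri)
{
	mc->mc_set = gsqueue_set_create(pri);
	(void) gsqueue_set_cb_add(mc->mc_set, my_topo_cb, mc);
}

static void
my_consumer_rx(my_consumer_t *mc, mblk_t *mp, uint_t flowhash)
{
	gsqueue_t *gsp = gsqueue_set_get(mc->mc_set, flowhash);

	gsqueue_enter_one(gsp, mp, my_proc, mc, GSQUEUE_FILL, 0);
}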
We'll try to reuse a gsqueue_cpu_t from the + * defunct list that used to be on that processor. If no such gsqueue_cpu_t + * exists, then we'll create a new one. We'd rather avoid taking over an + * existing defunct one that used to be on another CPU, as its not unreasonable + * to believe that its CPU will come back. More CPUs are offlined and onlined by + * the administrator or by creating cpu sets than actually get offlined by FMA. + */ +static void +gsqueue_handle_online(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + for (scp = list_head(&gssp->gs_defunct); scp != NULL; + scp = list_next(&gssp->gs_defunct, scp)) { + if (scp->gqc_cpuid == id) { + list_remove(&gssp->gs_defunct, scp); + break; + } + } + + if (scp == NULL) { + scp = gsqueue_cpu_create(gssp->gs_wpri, id); + } else { + squeue_bind(scp->gqc_head, id); + } + + ASSERT(gssp->gs_ncpus < NCPU); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + gsqueue_notify(gssp, scp->gqc_head, B_TRUE); + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +static void +gsqueue_handle_offline(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + int i; + gsqueue_cpu_t *scp = NULL; + + mutex_enter(&gssp->gs_lock); + for (i = 0; i < gssp->gs_ncpus; i++) { + if (gssp->gs_cpus[i]->gqc_cpuid == id) { + scp = gssp->gs_cpus[i]; + break; + } + } + + if (scp != NULL) { + squeue_unbind(scp->gqc_head); + list_insert_tail(&gssp->gs_defunct, scp); + gssp->gs_cpus[i] = gssp->gs_cpus[gssp->gs_ncpus-1]; + gssp->gs_ncpus--; + gsqueue_notify(gssp, scp->gqc_head, B_FALSE); + } + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +/* ARGSUSED */ +static int +gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + cp = cpu_get(id); + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + case CPU_CPUPART_IN: + if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS) + gsqueue_handle_online(cp->cpu_id); + break; + case CPU_UNCONFIG: + case CPU_OFF: + case CPU_CPUPART_OUT: + gsqueue_handle_offline(cp->cpu_id); + break; + default: + break; + } + + return (0); +} + + +/* ARGSUSED */ +static int +gsqueue_set_cache_construct(void *buf, void *arg, int kmflags) +{ + gsqueue_set_t *gssp = buf; + + gssp->gs_cpus = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags); + if (gssp->gs_cpus == NULL) + return (-1); + + mutex_init(&gssp->gs_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&gssp->gs_defunct, sizeof (gsqueue_cpu_t), + offsetof(gsqueue_cpu_t, gqc_lnode)); + gssp->gs_ncpus = 0; + gssp->gs_cbs = NULL; + + return (0); +} + +/* ARGSUSED */ +static void +gsqueue_set_cache_destruct(void *buf, void *arg) +{ + gsqueue_set_t *gssp = buf; + + kmem_free(gssp->gs_cpus, sizeof (gsqueue_cpu_t *) * NCPU); + gssp->gs_cpus = NULL; + list_destroy(&gssp->gs_defunct); + mutex_destroy(&gssp->gs_lock); +} + +static void +gsqueue_ddiinit(void) +{ + list_create(&gsqueue_list, sizeof (gsqueue_set_t), + offsetof(gsqueue_set_t, gs_next)); + mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL); + + gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache", + sizeof (gsqueue_cb_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + 
gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache", + sizeof (gsqueue_cpu_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + gsqueue_set_cache = kmem_cache_create("squeue_set_cache", + sizeof (gsqueue_set_t), + 0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct, + NULL, NULL, NULL, 0); + + + mutex_enter(&cpu_lock); + register_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); +} + +static int +gsqueue_ddifini(void) +{ + mutex_enter(&gsqueue_lock); + if (list_is_empty(&gsqueue_list) == 0) { + mutex_exit(&gsqueue_lock); + return (EBUSY); + } + list_destroy(&gsqueue_list); + mutex_exit(&gsqueue_lock); + + mutex_enter(&cpu_lock); + register_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + kmem_cache_destroy(gsqueue_set_cache); + kmem_cache_destroy(gsqueue_cpu_cache); + kmem_cache_destroy(gsqueue_cb_cache); + + mutex_destroy(&gsqueue_lock); + + return (0); +} + +static struct modlmisc gsqueue_modmisc = { + &mod_miscops, + "gsqueue" +}; + +static struct modlinkage gsqueue_modlinkage = { + MODREV_1, + &gsqueue_modmisc, + NULL +}; + +int +_init(void) +{ + int ret; + + gsqueue_ddiinit(); + if ((ret = mod_install(&gsqueue_modlinkage)) != 0) { + VERIFY(gsqueue_ddifini() == 0); + return (ret); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&gsqueue_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + if ((ret = gsqueue_ddifini()) != 0) + return (ret); + + if ((ret = mod_remove(&gsqueue_modlinkage)) != 0) + return (ret); + + return (0); +} diff --git a/usr/src/uts/common/io/hook.c b/usr/src/uts/common/io/hook.c index c3ebfa0e47..6726f72147 100644 --- a/usr/src/uts/common/io/hook.c +++ b/usr/src/uts/common/io/hook.c @@ -1050,7 +1050,7 @@ hook_family_free(hook_family_int_t *hfi, hook_stack_t *hks) /* Free container */ kmem_free(hfi, sizeof (*hfi)); - if (hks->hks_shutdown == 2) + if (hks != NULL && hks->hks_shutdown == 2) hook_stack_remove(hks); mutex_exit(&hook_stack_lock); diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c index d34057d64f..ccf814be0b 100644 --- a/usr/src/uts/common/io/i40e/i40e_gld.c +++ b/usr/src/uts/common/io/i40e/i40e_gld.c @@ -39,7 +39,8 @@ char *i40e_priv_props[] = { static int i40e_group_remove_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; struct i40e_aqc_remove_macvlan_element_data filt; struct i40e_hw *hw = &i40e->i40e_hw_space; int ret, i, last; @@ -107,10 +108,11 @@ done: static int i40e_group_add_mac(void *arg, const uint8_t *mac_addr) { - i40e_t *i40e = arg; - struct i40e_hw *hw = &i40e->i40e_hw_space; - int i, ret; - i40e_uaddr_t *iua; + i40e_rx_group_t *rxg = arg; + i40e_t *i40e = rxg->irg_i40e; + struct i40e_hw *hw = &i40e->i40e_hw_space; + int i, ret; + i40e_uaddr_t *iua; struct i40e_aqc_add_macvlan_element_data filt; if (I40E_IS_MULTICAST(mac_addr)) @@ -136,16 +138,12 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) } } - /* - * Note, the general use of the i40e_vsi_id will have to be refactored - * when we have proper group support. 
- */ bzero(&filt, sizeof (filt)); bcopy(mac_addr, filt.mac_addr, ETHERADDRL); filt.flags = I40E_AQC_MACVLAN_ADD_PERFECT_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, rxg->irg_vsi_seid, &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to unicast filter: %d", @@ -157,7 +155,7 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr) iua = &i40e->i40e_uaddrs[i40e->i40e_resources.ifr_nmacfilt_used]; bcopy(mac_addr, iua->iua_mac, ETHERADDRL); - iua->iua_vsi = i40e->i40e_vsi_id; + iua->iua_vsi = rxg->irg_vsi_seid; i40e->i40e_resources.ifr_nmacfilt_used++; ASSERT(i40e->i40e_resources.ifr_nmacfilt_used <= i40e->i40e_resources.ifr_nmacfilt); @@ -227,7 +225,7 @@ i40e_m_promisc(void *arg, boolean_t on) } - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " @@ -246,7 +244,7 @@ i40e_m_promisc(void *arg, boolean_t on) goto done; } - ret = i40e_aq_set_vsi_multicast_promiscuous(hw, i40e->i40e_vsi_id, + ret = i40e_aq_set_vsi_multicast_promiscuous(hw, I40E_DEF_VSI_SEID(i40e), on, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s multicast promiscuity on " @@ -257,8 +255,8 @@ i40e_m_promisc(void *arg, boolean_t on) * Try our best to put us back into a state that MAC expects us * to be in. */ - ret = i40e_aq_set_vsi_unicast_promiscuous(hw, i40e->i40e_vsi_id, - !on, NULL, B_FALSE); + ret = i40e_aq_set_vsi_unicast_promiscuous(hw, + I40E_DEF_VSI_SEID(i40e), !on, NULL, B_FALSE); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to %s unicast promiscuity on " "the default VSI after toggling multicast failed: " @@ -294,11 +292,11 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 0 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_TRUE, NULL); + I40E_DEF_VSI_SEID(i40e), B_TRUE, NULL); if (ret != I40E_SUCCESS) { i40e_error(i40e, "failed to enable multicast " "promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -312,7 +310,7 @@ i40e_multicast_add(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_ADD_HASH_MATCH | I40E_AQC_MACVLAN_ADD_IGNORE_VLAN; - if ((ret = i40e_aq_add_macvlan(hw, i40e->i40e_vsi_id, &filt, 1, + if ((ret = i40e_aq_add_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, NULL)) != I40E_SUCCESS) { i40e_error(i40e, "failed to add mac address " "%2x:%2x:%2x:%2x:%2x:%2x to multicast filter: %d", @@ -353,8 +351,8 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) filt.flags = I40E_AQC_MACVLAN_DEL_HASH_MATCH | I40E_AQC_MACVLAN_DEL_IGNORE_VLAN; - if (i40e_aq_remove_macvlan(hw, i40e->i40e_vsi_id, - &filt, 1, NULL) != I40E_SUCCESS) { + if (i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, + 1, NULL) != I40E_SUCCESS) { i40e_error(i40e, "failed to remove mac address " "%2x:%2x:%2x:%2x:%2x:%2x from multicast " "filter: %d", @@ -381,11 +379,11 @@ i40e_multicast_remove(i40e_t *i40e, const uint8_t *multicast_address) if (i40e->i40e_mcast_promisc_count == 1 && i40e->i40e_promisc_on == B_FALSE) { ret = i40e_aq_set_vsi_multicast_promiscuous(hw, - i40e->i40e_vsi_id, B_FALSE, NULL); + I40E_DEF_VSI_SEID(i40e), B_FALSE, NULL); if (ret != 
I40E_SUCCESS) { i40e_error(i40e, "failed to disable " "multicast promiscuous mode on VSI %d: %d", - i40e->i40e_vsi_id, ret); + I40E_DEF_VSI_SEID(i40e), ret); return (EIO); } } @@ -490,7 +488,7 @@ i40e_fill_tx_ring(void *arg, mac_ring_type_t rtype, const int group_index, * we're not actually grouping things tx-wise at this time. */ ASSERT(group_index == -1); - ASSERT(ring_index < i40e->i40e_num_trqpairs); + ASSERT(ring_index < i40e->i40e_num_trqpairs_per_vsi); itrq->itrq_mactxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -516,15 +514,16 @@ i40e_fill_rx_ring(void *arg, mac_ring_type_t rtype, const int group_index, { i40e_t *i40e = arg; mac_intr_t *mintr = &infop->mri_intr; - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[ring_index]; + uint_t trqpair_index; + i40e_trqpair_t *itrq; - /* - * We assert the group number and ring index to help sanity check - * ourselves and mark that we'll need to rework this when we have - * multiple groups. - */ - ASSERT3S(group_index, ==, 0); - ASSERT3S(ring_index, <, i40e->i40e_num_trqpairs); + /* This assumes static groups. */ + ASSERT3S(group_index, >=, 0); + ASSERT3S(ring_index, >=, 0); + trqpair_index = (group_index * i40e->i40e_num_trqpairs_per_vsi) + + ring_index; + ASSERT3U(trqpair_index, <, i40e->i40e_num_trqpairs); + itrq = &i40e->i40e_trqpairs[trqpair_index]; itrq->itrq_macrxring = rh; infop->mri_driver = (mac_ring_driver_t)itrq; @@ -552,24 +551,22 @@ i40e_fill_rx_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { i40e_t *i40e = arg; + i40e_rx_group_t *rxg; if (rtype != MAC_RING_TYPE_RX) return; - /* - * Note, this is a simplified view of a group, given that we only have a - * single group and a single ring at the moment. We'll want to expand - * upon this as we leverage more hardware functionality. - */ - i40e->i40e_rx_group_handle = gh; - infop->mgi_driver = (mac_group_driver_t)i40e; + rxg = &i40e->i40e_rx_groups[index]; + rxg->irg_grp_hdl = gh; + + infop->mgi_driver = (mac_group_driver_t)rxg; infop->mgi_start = NULL; infop->mgi_stop = NULL; infop->mgi_addmac = i40e_group_add_mac; infop->mgi_remmac = i40e_group_remove_mac; - ASSERT(i40e->i40e_num_rx_groups == I40E_GROUP_MAX); - infop->mgi_count = i40e->i40e_num_trqpairs; + ASSERT(i40e->i40e_num_rx_groups <= I40E_GROUP_MAX); + infop->mgi_count = i40e->i40e_num_trqpairs_per_vsi; } static int @@ -732,20 +729,32 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) break; } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (i40e->i40e_tx_lso_enable == B_TRUE) { + cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; + cap_lso->lso_basic_tcp_ipv4.lso_max = I40E_LSO_MAXLEN; + } else { + return (B_FALSE); + } + break; + } + case MAC_CAPAB_RINGS: cap_rings = cap_data; cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; switch (cap_rings->mr_type) { case MAC_RING_TYPE_TX: /* - * Note, saying we have no rings, but some number of - * groups indicates to MAC that it should create - * psuedo-groups with one for each TX ring. This may not - * be the long term behavior we want, but it'll work for - * now. + * Note, saying we have no groups, but some + * number of rings indicates to MAC that it + * should create psuedo-groups with one for + * each TX ring. This may not be the long term + * behavior we want, but it'll work for now. 
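A small sketch of the arithmetic used by i40e_fill_rx_ring() above to translate a (group, ring) pair handed down by MAC into a transmit/receive queue pair index; the helper name is invented and the sample numbers are only illustrative.

#include <sys/types.h>

/*
 * Mirror of the mapping in i40e_fill_rx_ring(): groups are laid out
 * back to back, each owning i40e_num_trqpairs_per_vsi consecutive
 * queue pairs.
 */
static uint_t
example_rx_trqpair_index(uint_t group_index, uint_t ring_index,
    uint_t trqpairs_per_vsi)
{
	return ((group_index * trqpairs_per_vsi) + ring_index);
}

/*
 * With 8 queue pairs per VSI (8 rings advertised per Rx group):
 *   group 0, ring 0 -> trqpair 0
 *   group 0, ring 7 -> trqpair 7
 *   group 2, ring 3 -> trqpair 19
 */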
*/ cap_rings->mr_gnum = 0; - cap_rings->mr_rnum = i40e->i40e_num_trqpairs; + cap_rings->mr_rnum = i40e->i40e_num_trqpairs_per_vsi; cap_rings->mr_rget = i40e_fill_tx_ring; cap_rings->mr_gget = NULL; cap_rings->mr_gaddring = NULL; @@ -754,7 +763,7 @@ i40e_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) case MAC_RING_TYPE_RX: cap_rings->mr_rnum = i40e->i40e_num_trqpairs; cap_rings->mr_rget = i40e_fill_rx_ring; - cap_rings->mr_gnum = I40E_GROUP_MAX; + cap_rings->mr_gnum = i40e->i40e_num_rx_groups; cap_rings->mr_gget = i40e_fill_rx_group; cap_rings->mr_gaddring = NULL; cap_rings->mr_gremring = NULL; diff --git a/usr/src/uts/common/io/i40e/i40e_intr.c b/usr/src/uts/common/io/i40e/i40e_intr.c index 51d1bbac92..170bef7ec6 100644 --- a/usr/src/uts/common/io/i40e/i40e_intr.c +++ b/usr/src/uts/common/io/i40e/i40e_intr.c @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -229,12 +229,20 @@ i40e_intr_adminq_disable(i40e_t *i40e) I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTL0, reg); } +/* + * The next two functions enable/disable the reception of interrupts + * on the given vector. Only vectors 1..N are programmed by these + * functions; vector 0 is special and handled by a different register. + * We must subtract one from the vector because i40e implicitly adds + * one to the vector value. See section 10.2.2.10.13 for more details. + */ static void i40e_intr_io_enable(i40e_t *i40e, int vector) { uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_PFINT_DYN_CTLN_INTENA_MASK | I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | (I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); @@ -247,6 +255,7 @@ i40e_intr_io_disable(i40e_t *i40e, int vector) uint32_t reg; i40e_hw_t *hw = &i40e->i40e_hw_space; + ASSERT3S(vector, >, 0); reg = I40E_ITR_INDEX_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT; I40E_WRITE_REG(hw, I40E_PFINT_DYN_CTLN(vector - 1), reg); } @@ -375,49 +384,109 @@ i40e_intr_chip_fini(i40e_t *i40e) } /* - * Enable all of the queues and set the corresponding LNKLSTN registers. Note - * that we always enable queues as interrupt sources, even though we don't - * enable the MSI-X interrupt vectors. + * Set the head of the interrupt linked list. The PFINT_LNKLSTN[N] + * register actually refers to the 'N + 1' interrupt vector. E.g., + * PFINT_LNKLSTN[0] refers to interrupt vector 1. + */ +static void +i40e_set_lnklstn(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + reg = (queue << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); + + I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(vector), reg); + DEBUGOUT2("PFINT_LNKLSTN[%u] = 0x%x", vector, reg); +} + +/* + * Set the QINT_RQCTL[queue] register. The next queue is always the Tx + * queue associated with this Rx queue. Unlike PFINT_LNKLSTN, the + * vector should be the actual vector this queue is on -- i.e., it + * should be equal to itrq_rx_intrvec. 
+ */ +static void +i40e_set_rqctl(i40e_t *i40e, uint_t vector, uint_t queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_rx_intrvec); + + reg = (vector << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | + (queue << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_RQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_RQCTL(queue), reg); + DEBUGOUT2("QINT_RQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Like i40e_set_rqctl(), but for QINT_TQCTL[queue]. The next queue is + * either the Rx queue of another TRQP, or EOL. + */ +static void +i40e_set_tqctl(i40e_t *i40e, uint_t vector, uint_t queue, uint_t next_queue) +{ + uint32_t reg; + i40e_hw_t *hw = &i40e->i40e_hw_space; + + ASSERT3U(vector, ==, i40e->i40e_trqpairs[queue].itrq_tx_intrvec); + + reg = (vector << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | + (I40E_ITR_INDEX_TX << I40E_QINT_TQCTL_ITR_INDX_SHIFT) | + (next_queue << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | + (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | + I40E_QINT_TQCTL_CAUSE_ENA_MASK; + + I40E_WRITE_REG(hw, I40E_QINT_TQCTL(queue), reg); + DEBUGOUT2("QINT_TQCTL[%u] = 0x%x", queue, reg); +} + +/* + * Program the interrupt linked list. Each vector has a linked list of + * queues which act as event sources for that vector. When one of + * those sources has an event the associated interrupt vector is + * fired. This mapping must match the mapping found in + * i40e_map_intrs_to_vectors(). + * + * See section 7.5.3 for more information about the configuration of + * the interrupt linked list. */ static void i40e_intr_init_queue_msix(i40e_t *i40e) { - i40e_hw_t *hw = &i40e->i40e_hw_space; - uint32_t reg; - int i; + uint_t intr_count; /* - * Map queues to MSI-X interrupts. Queue i is mapped to vector i + 1. - * Note that we skip the ITR logic for the moment, just to make our - * lives as explicit and simple as possible. + * The 0th vector is for 'Other Interrupts' only (subject to + * change in the future). */ - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + intr_count = i40e->i40e_intr_count - 1; - reg = (i << I40E_PFINT_LNKLSTN_FIRSTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << - I40E_PFINT_LNKLSTN_FIRSTQ_TYPE_SHIFT); - I40E_WRITE_REG(hw, I40E_PFINT_LNKLSTN(i), reg); + for (uint_t vec = 0; vec < intr_count; vec++) { + boolean_t head = B_TRUE; - reg = - (itrq->itrq_rx_intrvec << I40E_QINT_RQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_RX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (i << I40E_QINT_RQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_TX << I40E_QINT_RQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_RQCTL_CAUSE_ENA_MASK; + for (uint_t qidx = vec; qidx < i40e->i40e_num_trqpairs; + qidx += intr_count) { + uint_t next_qidx = qidx + intr_count; - I40E_WRITE_REG(hw, I40E_QINT_RQCTL(i), reg); + next_qidx = (next_qidx > i40e->i40e_num_trqpairs) ? 
+ I40E_QUEUE_TYPE_EOL : next_qidx; - reg = - (itrq->itrq_tx_intrvec << I40E_QINT_TQCTL_MSIX_INDX_SHIFT) | - (I40E_ITR_INDEX_TX << I40E_QINT_RQCTL_ITR_INDX_SHIFT) | - (I40E_QUEUE_TYPE_EOL << I40E_QINT_TQCTL_NEXTQ_INDX_SHIFT) | - (I40E_QUEUE_TYPE_RX << I40E_QINT_TQCTL_NEXTQ_TYPE_SHIFT) | - I40E_QINT_TQCTL_CAUSE_ENA_MASK; + if (head) { + i40e_set_lnklstn(i40e, vec, qidx); + head = B_FALSE; + } - I40E_WRITE_REG(hw, I40E_QINT_TQCTL(i), reg); + i40e_set_rqctl(i40e, vec + 1, qidx); + i40e_set_tqctl(i40e, vec + 1, qidx, next_qidx); + } } - } /* @@ -604,31 +673,26 @@ i40e_intr_adminq_work(i40e_t *i40e) } static void -i40e_intr_rx_work(i40e_t *i40e, int queue) +i40e_intr_rx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { mblk_t *mp = NULL; - i40e_trqpair_t *itrq; - - ASSERT(queue < i40e->i40e_num_trqpairs); - itrq = &i40e->i40e_trqpairs[queue]; mutex_enter(&itrq->itrq_rx_lock); if (!itrq->itrq_intr_poll) mp = i40e_ring_rx(itrq, I40E_POLL_NULL); mutex_exit(&itrq->itrq_rx_lock); - if (mp != NULL) { - mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, - itrq->itrq_rxgen); - } + if (mp == NULL) + return; + + mac_rx_ring(i40e->i40e_mac_hdl, itrq->itrq_macrxring, mp, + itrq->itrq_rxgen); } +/* ARGSUSED */ static void -i40e_intr_tx_work(i40e_t *i40e, int queue) +i40e_intr_tx_work(i40e_t *i40e, i40e_trqpair_t *itrq) { - i40e_trqpair_t *itrq; - - itrq = &i40e->i40e_trqpairs[queue]; i40e_tx_recycle_ring(itrq); } @@ -665,11 +729,17 @@ i40e_intr_other_work(i40e_t *i40e) i40e_intr_adminq_enable(i40e); } +/* + * Handle an MSI-X interrupt. See section 7.5.1.3 for an overview of + * the MSI-X interrupt sequence. + */ uint_t i40e_intr_msix(void *arg1, void *arg2) { i40e_t *i40e = (i40e_t *)arg1; - int vector_idx = (int)(uintptr_t)arg2; + uint_t vector_idx = (uint_t)(uintptr_t)arg2; + + ASSERT3U(vector_idx, <, i40e->i40e_intr_count); /* * When using MSI-X interrupts, vector 0 is always reserved for the @@ -681,10 +751,29 @@ i40e_intr_msix(void *arg1, void *arg2) return (DDI_INTR_CLAIMED); } - i40e_intr_rx_work(i40e, vector_idx - 1); - i40e_intr_tx_work(i40e, vector_idx - 1); - i40e_intr_io_enable(i40e, vector_idx); + ASSERT3U(vector_idx, >, 0); + /* + * We determine the queue indexes via simple arithmetic (as + * opposed to keeping explicit state like a bitmap). While + * conveinent, it does mean that i40e_map_intrs_to_vectors(), + * i40e_intr_init_queue_msix(), and this function must be + * modified as a unit. + * + * We subtract 1 from the vector to offset the addition we + * performed during i40e_map_intrs_to_vectors(). 
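The queue-to-vector striping described above can be seen in isolation with a short sketch: vector 0 is reserved for the admin queue, and each remaining vector services every (intr_count - 1)th queue pair, matching i40e_map_intrs_to_vectors() and the loop in i40e_intr_msix(). The function name and sample numbers are illustrative only.

#include <sys/types.h>

/*
 * Return the MSI-X vector servicing a given queue pair, mirroring
 * i40e_map_intrs_to_vectors(): queue i maps to vector
 * (i % (intr_count - 1)) + 1, since vector 0 handles only the admin
 * queue and other causes.
 */
static uint_t
example_queue_vector(uint_t qidx, uint_t intr_count)
{
	return ((qidx % (intr_count - 1)) + 1);
}

/*
 * Example with 16 queue pairs and 9 interrupts (8 I/O vectors):
 *   vector 1 walks queues 0 and 8
 *   vector 2 walks queues 1 and 9
 *   ...
 *   vector 8 walks queues 7 and 15
 * which is exactly the stride used by the loop in i40e_intr_msix().
 */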
+ */ + for (uint_t i = vector_idx - 1; i < i40e->i40e_num_trqpairs; + i += (i40e->i40e_intr_count - 1)) { + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; + + ASSERT3U(i, <, i40e->i40e_num_trqpairs); + ASSERT3P(itrq, !=, NULL); + i40e_intr_rx_work(i40e, itrq); + i40e_intr_tx_work(i40e, itrq); + } + + i40e_intr_io_enable(i40e, vector_idx); return (DDI_INTR_CLAIMED); } @@ -693,6 +782,7 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) { i40e_hw_t *hw = &i40e->i40e_hw_space; uint32_t reg; + i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[0]; int ret = DDI_INTR_CLAIMED; if (shared == B_TRUE) { @@ -722,10 +812,10 @@ i40e_intr_notx(i40e_t *i40e, boolean_t shared) i40e_intr_adminq_work(i40e); if (reg & I40E_INTR_NOTX_RX_MASK) - i40e_intr_rx_work(i40e, 0); + i40e_intr_rx_work(i40e, itrq); if (reg & I40E_INTR_NOTX_TX_MASK) - i40e_intr_tx_work(i40e, 0); + i40e_intr_tx_work(i40e, itrq); done: i40e_intr_adminq_enable(i40e); diff --git a/usr/src/uts/common/io/i40e/i40e_main.c b/usr/src/uts/common/io/i40e/i40e_main.c index 54aef43424..20f74d4e95 100644 --- a/usr/src/uts/common/io/i40e/i40e_main.c +++ b/usr/src/uts/common/io/i40e/i40e_main.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -188,14 +188,15 @@ * VSI Management * -------------- * - * At this time, we currently only support a single MAC group, and thus a single - * VSI. This VSI is considered the default VSI and should be the only one that - * exists after a reset. Currently it is stored as the member - * i40e_t`i40e_vsi_id. While this works for the moment and for an initial - * driver, it's not sufficient for the longer-term path of the driver. Instead, - * we'll want to actually have a unique i40e_vsi_t structure which is used - * everywhere. Note that this means that every place that uses the - * i40e_t`i40e_vsi_id will need to be refactored. + * The PFs share 384 VSIs. The firmware creates one VSI per PF by default. + * During chip start we retrieve the SEID of this VSI and assign it as the + * default VSI for our VEB (one VEB per PF). We then add additional VSIs to + * the VEB up to the determined number of rx groups: i40e_t`i40e_num_rx_groups. + * We currently cap this number to I40E_GROUP_MAX to a) make sure all PFs can + * allocate the same number of VSIs, and b) to keep the interrupt multiplexing + * under control. In the future, when we improve the interrupt allocation, we + * may want to revisit this cap to make better use of the available VSIs. The + * VSI allocation and configuration can be found in i40e_chip_start(). 
* * ---------------- * Structure Layout @@ -240,7 +241,7 @@ * | i40e_hw_t --+---> Intel common code structure * | mac_handle_t --+---> GLDv3 handle to MAC * | ddi_periodic_t --+---> Link activity timer - * | int (vsi_id) --+---> VSI ID, main identifier + * | i40e_vsi_t * --+---> Array of VSIs * | i40e_func_rsrc_t --+---> Available hardware resources * | i40e_switch_rsrc_t * --+---> Switch resource snapshot * | i40e_sdu --+---> Current MTU @@ -249,11 +250,10 @@ * | i40e_maddr_t * --+---> Array of assigned multicast MACs * | i40e_mcast_promisccount --+---> Active multicast state * | i40e_promisc_on --+---> Current promiscuous mode state - * | int --+---> Number of transmit/receive pairs + * | uint_t --+---> Number of transmit/receive pairs + * | i40e_rx_group_t * --+---> Array of Rx groups * | kstat_t * --+---> PF kstats - * | kstat_t * --+---> VSI kstats * | i40e_pf_stats_t --+---> PF kstat backing data - * | i40e_vsi_stats_t --+---> VSI kstat backing data * | i40e_trqpair_t * --+---------+ * +---------------------------+ | * | @@ -359,8 +359,6 @@ * While bugs have been filed to cover this future work, the following gives an * overview of expected work: * - * o TSO support - * o Multiple group support * o DMA binding and breaking up the locking in ring recycling. * o Enhanced detection of device errors * o Participation in IRM @@ -371,7 +369,7 @@ #include "i40e_sw.h" -static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.1"; +static char i40e_ident[] = "Intel 10/40Gb Ethernet v1.0.2"; /* * The i40e_glock primarily protects the lists below and the i40e_device_t @@ -761,15 +759,16 @@ i40e_fm_ereport(i40e_t *i40e, char *detail) } /* - * Here we're trying to get the ID of the default VSI. In general, when we come - * through and look at this shortly after attach, we expect there to only be a - * single element present, which is the default VSI. Importantly, each PF seems - * to not see any other devices, in part because of the simple switch mode that - * we're using. If for some reason, we see more artifact, we'll need to revisit - * what we're doing here. + * Here we're trying to set the SEID of the default VSI. In general, + * when we come through and look at this shortly after attach, we + * expect there to only be a single element present, which is the + * default VSI. Importantly, each PF seems to not see any other + * devices, in part because of the simple switch mode that we're + * using. If for some reason, we see more artifacts, we'll need to + * revisit what we're doing here. */ -static int -i40e_get_vsi_id(i40e_t *i40e) +static boolean_t +i40e_set_def_vsi_seid(i40e_t *i40e) { i40e_hw_t *hw = &i40e->i40e_hw_space; struct i40e_aqc_get_switch_config_resp *sw_config; @@ -784,17 +783,43 @@ i40e_get_vsi_id(i40e_t *i40e) if (rc != I40E_SUCCESS) { i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", rc, hw->aq.asq_last_status); - return (-1); + return (B_FALSE); } if (LE_16(sw_config->header.num_reported) != 1) { i40e_error(i40e, "encountered multiple (%d) switching units " "during attach, not proceeding", LE_16(sw_config->header.num_reported)); + return (B_FALSE); + } + + I40E_DEF_VSI_SEID(i40e) = sw_config->element[0].seid; + return (B_TRUE); +} + +/* + * Get the SEID of the uplink MAC. 
+ */ +static int +i40e_get_mac_seid(i40e_t *i40e) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + struct i40e_aqc_get_switch_config_resp *sw_config; + uint8_t aq_buf[I40E_AQ_LARGE_BUF]; + uint16_t next = 0; + int rc; + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + sw_config = (struct i40e_aqc_get_switch_config_resp *)aq_buf; + rc = i40e_aq_get_switch_config(hw, sw_config, sizeof (aq_buf), &next, + NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_get_switch_config() failed %d: %d", + rc, hw->aq.asq_last_status); return (-1); } - return (sw_config->element[0].seid); + return (LE_16(sw_config->element[0].uplink_seid)); } /* @@ -1098,11 +1123,16 @@ i40e_disable_interrupts(i40e_t *i40e) static void i40e_free_trqpairs(i40e_t *i40e) { - int i; i40e_trqpair_t *itrq; + if (i40e->i40e_rx_groups != NULL) { + kmem_free(i40e->i40e_rx_groups, + sizeof (i40e_rx_group_t) * i40e->i40e_num_rx_groups); + i40e->i40e_rx_groups = NULL; + } + if (i40e->i40e_trqpairs != NULL) { - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { itrq = &i40e->i40e_trqpairs[i]; mutex_destroy(&itrq->itrq_rx_lock); mutex_destroy(&itrq->itrq_tx_lock); @@ -1133,7 +1163,6 @@ i40e_free_trqpairs(i40e_t *i40e) static boolean_t i40e_alloc_trqpairs(i40e_t *i40e) { - int i; void *mutexpri = DDI_INTR_PRI(i40e->i40e_intr_pri); /* @@ -1146,7 +1175,7 @@ i40e_alloc_trqpairs(i40e_t *i40e) i40e->i40e_trqpairs = kmem_zalloc(sizeof (i40e_trqpair_t) * i40e->i40e_num_trqpairs, KM_SLEEP); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { i40e_trqpair_t *itrq = &i40e->i40e_trqpairs[i]; itrq->itrq_i40e = i40e; @@ -1156,6 +1185,16 @@ i40e_alloc_trqpairs(i40e_t *i40e) itrq->itrq_index = i; } + i40e->i40e_rx_groups = kmem_zalloc(sizeof (i40e_rx_group_t) * + i40e->i40e_num_rx_groups, KM_SLEEP); + + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_rx_group_t *rxg = &i40e->i40e_rx_groups[i]; + + rxg->irg_index = i; + rxg->irg_i40e = i40e; + } + return (B_TRUE); } @@ -1164,16 +1203,19 @@ i40e_alloc_trqpairs(i40e_t *i40e) /* * Unless a .conf file already overrode i40e_t structure values, they will * be 0, and need to be set in conjunction with the now-available HW report. - * - * However, at the moment, we cap all of these resources as we only support a - * single receive ring and a single group. */ /* ARGSUSED */ static void i40e_hw_to_instance(i40e_t *i40e, i40e_hw_t *hw) { - if (i40e->i40e_num_trqpairs == 0) { - i40e->i40e_num_trqpairs = I40E_TRQPAIR_MAX; + if (i40e->i40e_num_trqpairs_per_vsi == 0) { + if (i40e_is_x722(i40e)) { + i40e->i40e_num_trqpairs_per_vsi = + I40E_722_MAX_TC_QUEUES; + } else { + i40e->i40e_num_trqpairs_per_vsi = + I40E_710_MAX_TC_QUEUES; + } } if (i40e->i40e_num_rx_groups == 0) { @@ -1309,12 +1351,11 @@ i40e_common_code_init(i40e_t *i40e, i40e_hw_t *hw) } /* - * We need to obtain the Virtual Station ID (VSI) before we can - * perform other operations on the device. + * We need to obtain the Default Virtual Station SEID (VSI) + * before we can perform other operations on the device. 
*/ - i40e->i40e_vsi_id = i40e_get_vsi_id(i40e); - if (i40e->i40e_vsi_id == -1) { - i40e_error(i40e, "failed to obtain VSI ID"); + if (!i40e_set_def_vsi_seid(i40e)) { + i40e_error(i40e, "failed to obtain Default VSI SEID"); return (B_FALSE); } @@ -1559,6 +1600,9 @@ i40e_init_properties(i40e_t *i40e) i40e->i40e_tx_hcksum_enable = i40e_get_prop(i40e, "tx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_tx_lso_enable = i40e_get_prop(i40e, "tx_lso_enable", + B_FALSE, B_TRUE, B_TRUE); + i40e->i40e_rx_hcksum_enable = i40e_get_prop(i40e, "rx_hcksum_enable", B_FALSE, B_TRUE, B_TRUE); @@ -1728,15 +1772,56 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) } i40e->i40e_intr_type = 0; + i40e->i40e_num_rx_groups = I40E_GROUP_MAX; + /* + * We need to determine the number of queue pairs per traffic + * class. We only have one traffic class (TC0), so we'll base + * this off the number of interrupts provided. Furthermore, + * since we only use one traffic class, the number of queues + * per traffic class and per VSI are the same. + */ if ((intr_types & DDI_INTR_TYPE_MSIX) && - i40e->i40e_intr_force <= I40E_INTR_MSIX) { - if (i40e_alloc_intr_handles(i40e, devinfo, - DDI_INTR_TYPE_MSIX)) { - i40e->i40e_num_trqpairs = - MIN(i40e->i40e_intr_count - 1, max_trqpairs); - return (B_TRUE); - } + (i40e->i40e_intr_force <= I40E_INTR_MSIX) && + (i40e_alloc_intr_handles(i40e, devinfo, DDI_INTR_TYPE_MSIX))) { + uint32_t n; + + /* + * While we want the number of queue pairs to match + * the number of interrupts, we must keep stay in + * bounds of the maximum number of queues per traffic + * class. We subtract one from i40e_intr_count to + * account for interrupt zero; which is currently + * restricted to admin queue commands and other + * interrupt causes. + */ + n = MIN(i40e->i40e_intr_count - 1, max_trqpairs); + ASSERT3U(n, >, 0); + + /* + * Round up to the nearest power of two to ensure that + * the QBASE aligns with the TC size which must be + * programmed as a power of two. See the queue mapping + * description in section 7.4.9.5.5.1. + * + * If i40e_intr_count - 1 is not a power of two then + * some queue pairs on the same VSI will have to share + * an interrupt. + * + * We may want to revisit this logic in a future where + * we have more interrupts and more VSIs. Otherwise, + * each VSI will use as many interrupts as possible. + * Using more QPs per VSI means better RSS for each + * group, but at the same time may require more + * sharing of interrupts across VSIs. This may be a + * good candidate for a .conf tunable. + */ + n = 0x1 << ddi_fls(n); + i40e->i40e_num_trqpairs_per_vsi = n; + ASSERT3U(i40e->i40e_num_rx_groups, >, 0); + i40e->i40e_num_trqpairs = i40e->i40e_num_trqpairs_per_vsi * + i40e->i40e_num_rx_groups; + return (B_TRUE); } /* @@ -1745,6 +1830,7 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) * single MSI interrupt. */ i40e->i40e_num_trqpairs = I40E_TRQPAIR_NOMSIX; + i40e->i40e_num_trqpairs_per_vsi = i40e->i40e_num_trqpairs; i40e->i40e_num_rx_groups = I40E_GROUP_NOMSIX; if ((intr_types & DDI_INTR_TYPE_MSI) && @@ -1767,24 +1853,20 @@ i40e_alloc_intrs(i40e_t *i40e, dev_info_t *devinfo) static boolean_t i40e_map_intrs_to_vectors(i40e_t *i40e) { - int i; - if (i40e->i40e_intr_type != DDI_INTR_TYPE_MSIX) { return (B_TRUE); } /* - * Each queue pair is mapped to a single interrupt, so transmit - * and receive interrupts for a given queue share the same vector. 
- * The number of queue pairs is one less than the number of interrupt - * vectors and is assigned the vector one higher than its index. - * Vector zero is reserved for the admin queue. + * Each queue pair is mapped to a single interrupt, so + * transmit and receive interrupts for a given queue share the + * same vector. Vector zero is reserved for the admin queue. */ - ASSERT(i40e->i40e_intr_count == i40e->i40e_num_trqpairs + 1); + for (uint_t i = 0; i < i40e->i40e_num_trqpairs; i++) { + uint_t vector = i % (i40e->i40e_intr_count - 1); - for (i = 0; i < i40e->i40e_num_trqpairs; i++) { - i40e->i40e_trqpairs[i].itrq_rx_intrvec = i + 1; - i40e->i40e_trqpairs[i].itrq_tx_intrvec = i + 1; + i40e->i40e_trqpairs[i].itrq_rx_intrvec = vector + 1; + i40e->i40e_trqpairs[i].itrq_tx_intrvec = vector + 1; } return (B_TRUE); @@ -1923,89 +2005,282 @@ i40e_init_macaddrs(i40e_t *i40e, i40e_hw_t *hw) } /* - * Configure the hardware for the Virtual Station Interface (VSI). Currently - * we only support one, but in the future we could instantiate more than one - * per attach-point. + * Set the properties which have common values across all the VSIs. + * Consult the "Add VSI" command section (7.4.9.5.5.1) for a + * complete description of these properties. */ -static boolean_t -i40e_config_vsi(i40e_t *i40e, i40e_hw_t *hw) +static void +i40e_set_shared_vsi_props(i40e_t *i40e, + struct i40e_aqc_vsi_properties_data *info, uint_t vsi_idx) { - struct i40e_vsi_context context; - int err, tc_queues; + uint_t tc_queues; + uint16_t vsi_qp_base; - bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; - context.pf_num = hw->pf_id; - err = i40e_aq_get_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "get VSI params failed with %d", err); - return (B_FALSE); - } - - i40e->i40e_vsi_num = context.vsi_number; + /* + * It's important that we use bitwise-OR here; callers to this + * function might enable other sections before calling this + * function. + */ + info->valid_sections |= LE_16(I40E_AQ_VSI_PROP_QUEUE_MAP_VALID | + I40E_AQ_VSI_PROP_VLAN_VALID); /* - * Set the queue and traffic class bits. Keep it simple for now. + * Calculate the starting QP index for this VSI. This base is + * relative to the PF queue space; so a value of 0 for PF#1 + * represents the absolute index PFLAN_QALLOC_FIRSTQ for PF#1. */ - context.info.valid_sections = I40E_AQ_VSI_PROP_QUEUE_MAP_VALID; - context.info.mapping_flags = I40E_AQ_VSI_QUE_MAP_CONTIG; - context.info.queue_mapping[0] = I40E_ASSIGN_ALL_QUEUES; + vsi_qp_base = vsi_idx * i40e->i40e_num_trqpairs_per_vsi; + info->mapping_flags = LE_16(I40E_AQ_VSI_QUE_MAP_CONTIG); + info->queue_mapping[0] = + LE_16((vsi_qp_base << I40E_AQ_VSI_QUEUE_SHIFT) & + I40E_AQ_VSI_QUEUE_MASK); /* - * tc_queues determines the size of the traffic class, where the size is - * 2^^tc_queues to a maximum of 64 for the X710 and 128 for the X722. + * tc_queues determines the size of the traffic class, where + * the size is 2^^tc_queues to a maximum of 64 for the X710 + * and 128 for the X722. * * Some examples: - * i40e_num_trqpairs == 1 => tc_queues = 0, 2^^0 = 1. - * i40e_num_trqpairs == 7 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 8 => tc_queues = 3, 2^^3 = 8. - * i40e_num_trqpairs == 9 => tc_queues = 4, 2^^4 = 16. - * i40e_num_trqpairs == 17 => tc_queues = 5, 2^^5 = 32. - * i40e_num_trqpairs == 64 => tc_queues = 6, 2^^6 = 64. + * i40e_num_trqpairs_per_vsi == 1 => tc_queues = 0, 2^^0 = 1. 
+ * i40e_num_trqpairs_per_vsi == 7 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 8 => tc_queues = 3, 2^^3 = 8. + * i40e_num_trqpairs_per_vsi == 9 => tc_queues = 4, 2^^4 = 16. + * i40e_num_trqpairs_per_vsi == 17 => tc_queues = 5, 2^^5 = 32. + * i40e_num_trqpairs_per_vsi == 64 => tc_queues = 6, 2^^6 = 64. */ - tc_queues = ddi_fls(i40e->i40e_num_trqpairs - 1); + tc_queues = ddi_fls(i40e->i40e_num_trqpairs_per_vsi - 1); - context.info.tc_mapping[0] = ((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & - I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | - ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & - I40E_AQ_VSI_TC_QUE_NUMBER_MASK); + /* + * The TC queue mapping is in relation to the VSI queue space. + * Since we are only using one traffic class (TC0) we always + * start at queue offset 0. + */ + info->tc_mapping[0] = + LE_16(((0 << I40E_AQ_VSI_TC_QUE_OFFSET_SHIFT) & + I40E_AQ_VSI_TC_QUE_OFFSET_MASK) | + ((tc_queues << I40E_AQ_VSI_TC_QUE_NUMBER_SHIFT) & + I40E_AQ_VSI_TC_QUE_NUMBER_MASK)); - context.info.valid_sections |= I40E_AQ_VSI_PROP_VLAN_VALID; - context.info.port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | + /* + * I40E_AQ_VSI_PVLAN_MODE_ALL ("VLAN driver insertion mode") + * + * Allow tagged and untagged packets to be sent to this + * VSI from the host. + * + * I40E_AQ_VSI_PVLAN_EMOD_NOTHING ("VLAN and UP expose mode") + * + * Leave the tag on the frame and place no VLAN + * information in the descriptor. We want this mode + * because our MAC layer will take care of the VLAN tag, + * if there is one. + */ + info->port_vlan_flags = I40E_AQ_VSI_PVLAN_MODE_ALL | I40E_AQ_VSI_PVLAN_EMOD_NOTHING; +} + +/* + * Delete the VSI at this index, if one exists. We assume there is no + * action we can take if this command fails but to log the failure. + */ +static void +i40e_delete_vsi(i40e_t *i40e, uint_t idx) +{ + i40e_hw_t *hw = &i40e->i40e_hw_space; + uint16_t seid = i40e->i40e_vsis[idx].iv_seid; - context.flags = LE16_TO_CPU(I40E_AQ_VSI_TYPE_PF); + if (seid != 0) { + int rc; - i40e->i40e_vsi_stat_id = LE16_TO_CPU(context.info.stat_counter_idx); - if (i40e_stat_vsi_init(i40e) == B_FALSE) - return (B_FALSE); + rc = i40e_aq_delete_element(hw, seid, NULL); - err = i40e_aq_update_vsi_params(hw, &context, NULL); - if (err != I40E_SUCCESS) { - i40e_error(i40e, "Update VSI params failed with %d", err); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VSI %d: %d", + rc, hw->aq.asq_last_status); + } + + i40e->i40e_vsis[idx].iv_seid = 0; + } +} + +/* + * Add a new VSI. + */ +static boolean_t +i40e_add_vsi(i40e_t *i40e, i40e_hw_t *hw, uint_t idx) +{ + struct i40e_vsi_context ctx; + i40e_rx_group_t *rxg; + int rc; + + /* + * The default VSI is created by the controller. This function + * creates new, non-defualt VSIs only. 
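The tc_queues examples listed in i40e_set_shared_vsi_props() above can be reproduced with a few lines of user-space C; ex_fls() is a portable stand-in for ddi_fls() (the 1-based index of the highest bit set, or 0 for a zero argument).

#include <stdio.h>

static unsigned
ex_fls(unsigned v)
{
	unsigned pos = 0;

	while (v != 0) {
		pos++;
		v >>= 1;
	}
	return (pos);
}

int
main(void)
{
	unsigned qps[] = { 1, 7, 8, 9, 17, 64 };

	for (unsigned i = 0; i < sizeof (qps) / sizeof (qps[0]); i++) {
		unsigned tc_queues = ex_fls(qps[i] - 1);

		printf("%2u queue pairs per VSI -> tc_queues = %u "
		    "(TC size = %u)\n", qps[i], tc_queues, 1U << tc_queues);
	}
	return (0);
}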
+ */ + ASSERT3U(idx, !=, 0); + + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.uplink_seid = i40e->i40e_veb_seid; + ctx.pf_num = hw->pf_id; + ctx.flags = I40E_AQ_VSI_TYPE_PF; + ctx.connection_type = I40E_AQ_VSI_CONN_TYPE_NORMAL; + i40e_set_shared_vsi_props(i40e, &ctx.info, idx); + + rc = i40e_aq_add_vsi(hw, &ctx, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_vsi() failed %d: %d", rc, + hw->aq.asq_last_status); return (B_FALSE); } + rxg = &i40e->i40e_rx_groups[idx]; + rxg->irg_vsi_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_number = ctx.vsi_number; + i40e->i40e_vsis[idx].iv_seid = ctx.seid; + i40e->i40e_vsis[idx].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + + if (i40e_stat_vsi_init(i40e, idx) == B_FALSE) + return (B_FALSE); return (B_TRUE); } /* - * Configure the RSS key. For the X710 controller family, this is set on a - * per-PF basis via registers. For the X722, this is done on a per-VSI basis - * through the admin queue. + * Configure the hardware for the Default Virtual Station Interface (VSI). */ static boolean_t -i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +i40e_config_def_vsi(i40e_t *i40e, i40e_hw_t *hw) { - uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + struct i40e_vsi_context ctx; + i40e_rx_group_t *def_rxg; + int err; + struct i40e_aqc_remove_macvlan_element_data filt; - (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + bzero(&ctx, sizeof (struct i40e_vsi_context)); + ctx.seid = I40E_DEF_VSI_SEID(i40e); + ctx.pf_num = hw->pf_id; + err = i40e_aq_get_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "get VSI params failed with %d", err); + return (B_FALSE); + } - if (i40e_is_x722(i40e)) { + ctx.info.valid_sections = 0; + i40e->i40e_vsis[0].iv_number = ctx.vsi_number; + i40e->i40e_vsis[0].iv_stats_id = LE_16(ctx.info.stat_counter_idx); + if (i40e_stat_vsi_init(i40e, 0) == B_FALSE) + return (B_FALSE); + + i40e_set_shared_vsi_props(i40e, &ctx.info, I40E_DEF_VSI_IDX); + + err = i40e_aq_update_vsi_params(hw, &ctx, NULL); + if (err != I40E_SUCCESS) { + i40e_error(i40e, "Update VSI params failed with %d", err); + return (B_FALSE); + } + + def_rxg = &i40e->i40e_rx_groups[0]; + def_rxg->irg_vsi_seid = I40E_DEF_VSI_SEID(i40e); + + /* + * We have seen three different behaviors in regards to the + * Default VSI and its implicit L2 MAC+VLAN filter. + * + * 1. It has an implicit filter for the factory MAC address + * and this filter counts against 'ifr_nmacfilt_used'. + * + * 2. It has an implicit filter for the factory MAC address + * and this filter DOES NOT count against 'ifr_nmacfilt_used'. + * + * 3. It DOES NOT have an implicit filter. + * + * All three of these cases are accounted for below. If we + * fail to remove the L2 filter (ENOENT) then we assume there + * wasn't one. Otherwise, if we successfully remove the + * filter, we make sure to update the 'ifr_nmacfilt_used' + * count accordingly. + * + * We remove this filter to prevent duplicate delivery of + * packets destined for the primary MAC address as DLS will + * create the same filter on a non-default VSI for the primary + * MAC client. + * + * If you change the following code please test it across as + * many X700 series controllers and firmware revisions as you + * can. 
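The rule described in that comment reduces to a small check on ifr_nmacfilt_used after the explicit remove, implemented just below. The following user-space distillation only makes the three observed controller behaviors easy to see; it is not driver code, and ex_fixup_filter_count() is a hypothetical helper.

#include <stdio.h>

/*
 * After the explicit remove: a count of 1 means the firmware counted
 * the implicit filter, 0 means it did not, and anything else is an
 * unknown controller behavior.
 */
static int
ex_fixup_filter_count(unsigned *used)
{
	if (*used == 1) {
		(*used)--;	/* the implicit filter was counted */
		return (0);
	}
	if (*used == 0)
		return (0);	/* the implicit filter was never counted */
	return (-1);		/* unknown behavior: fail attach */
}

int
main(void)
{
	unsigned used = 1;
	int rc = ex_fixup_filter_count(&used);

	printf("rc = %d, filters in use afterwards = %u\n", rc, used);
	return (0);
}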
+ */ + bzero(&filt, sizeof (filt)); + bcopy(hw->mac.port_addr, filt.mac_addr, ETHERADDRL); + filt.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH; + filt.vlan_tag = 0; + + ASSERT3U(i40e->i40e_resources.ifr_nmacfilt_used, <=, 1); + i40e_log(i40e, "Num L2 filters: %u", + i40e->i40e_resources.ifr_nmacfilt_used); + + err = i40e_aq_remove_macvlan(hw, I40E_DEF_VSI_SEID(i40e), &filt, 1, + NULL); + if (err == I40E_SUCCESS) { + i40e_log(i40e, + "Removed L2 filter from Default VSI with SEID %u", + I40E_DEF_VSI_SEID(i40e)); + } else if (hw->aq.asq_last_status == ENOENT) { + i40e_log(i40e, + "No L2 filter for Default VSI with SEID %u", + I40E_DEF_VSI_SEID(i40e)); + } else { + i40e_error(i40e, "Failed to remove L2 filter from" + " Default VSI with SEID %u: %d (%d)", + I40E_DEF_VSI_SEID(i40e), err, hw->aq.asq_last_status); + + return (B_FALSE); + } + + /* + * As mentioned above, the controller created an implicit L2 + * filter for the primary MAC. We want to remove both the + * filter and decrement the filter count. However, not all + * controllers count this implicit filter against the total + * MAC filter count. So here we are making sure it is either + * one or zero. If it is one, then we know it is for the + * implicit filter and we should decrement since we just + * removed the filter above. If it is zero then we know the + * controller that does not count the implicit filter, and it + * was enough to just remove it; we leave the count alone. + * But if it is neither, then we have never seen a controller + * like this before and we should fail to attach. + * + * It is unfortunate that this code must exist but the + * behavior of this implicit L2 filter and its corresponding + * count were dicovered through empirical testing. The + * programming manuals hint at this filter but do not + * explicitly call out the exact behavior. + */ + if (i40e->i40e_resources.ifr_nmacfilt_used == 1) { + i40e->i40e_resources.ifr_nmacfilt_used--; + } else { + if (i40e->i40e_resources.ifr_nmacfilt_used != 0) { + i40e_error(i40e, "Unexpected L2 filter count: %u" + " (expected 0)", + i40e->i40e_resources.ifr_nmacfilt_used); + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static boolean_t +i40e_config_rss_key_x722(i40e_t *i40e, i40e_hw_t *hw) +{ + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; struct i40e_aqc_get_set_rss_key_data key; - const char *u8seed = (char *)seed; + const char *u8seed; enum i40e_status_code status; + uint16_t vsi_number = i40e->i40e_vsis[i].iv_number; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + u8seed = (char *)seed; CTASSERT(sizeof (key) >= (sizeof (key.standard_rss_key) + sizeof (key.extended_hash_key))); @@ -2015,14 +2290,35 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) bcopy(&u8seed[sizeof (key.standard_rss_key)], key.extended_hash_key, sizeof (key.extended_hash_key)); - status = i40e_aq_set_rss_key(hw, i40e->i40e_vsi_num, &key); + ASSERT3U(vsi_number, !=, 0); + status = i40e_aq_set_rss_key(hw, vsi_number, &key); + if (status != I40E_SUCCESS) { - i40e_error(i40e, "failed to set rss key: %d", status); + i40e_error(i40e, "failed to set RSS key for VSI %u: %d", + vsi_number, status); return (B_FALSE); } + } + + return (B_TRUE); +} + +/* + * Configure the RSS key. For the X710 controller family, this is set on a + * per-PF basis via registers. For the X722, this is done on a per-VSI basis + * through the admin queue. 
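For the X722 path above, the random seed is simply split across the standard and extended portions of the admin-queue key structure. Below is a user-space sketch of that split; the 13-word seed, 40-byte standard key, and 12-byte extended key are assumed sizes (the driver's CTASSERT guards the real relationship), and only the split itself is shown, not the admin-queue call.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define	EX_HKEY_WORDS	13
#define	EX_STD_KEYLEN	40
#define	EX_EXT_KEYLEN	12

int
main(void)
{
	uint32_t seed[EX_HKEY_WORDS];
	uint8_t standard[EX_STD_KEYLEN], extended[EX_EXT_KEYLEN];
	const uint8_t *u8seed = (const uint8_t *)seed;

	/* random_get_pseudo_bytes() fills the seed in the driver. */
	for (int i = 0; i < EX_HKEY_WORDS; i++)
		seed[i] = (uint32_t)rand();

	memcpy(standard, u8seed, sizeof (standard));
	memcpy(extended, u8seed + sizeof (standard), sizeof (extended));

	printf("seed %zu bytes = standard %zu + extended %zu\n",
	    sizeof (seed), sizeof (standard), sizeof (extended));
	return (0);
}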
+ */ +static boolean_t +i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) +{ + if (i40e_is_x722(i40e)) { + if (!i40e_config_rss_key_x722(i40e, hw)) + return (B_FALSE); } else { - uint_t i; - for (i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) + uint32_t seed[I40E_PFQF_HKEY_MAX_INDEX + 1]; + + (void) random_get_pseudo_bytes((uint8_t *)seed, sizeof (seed)); + for (uint_t i = 0; i <= I40E_PFQF_HKEY_MAX_INDEX; i++) i40e_write_rx_ctl(hw, I40E_PFQF_HKEY(i), seed[i]); } @@ -2034,11 +2330,12 @@ i40e_config_rss_key(i40e_t *i40e, i40e_hw_t *hw) * family, with the X722 using a known 7-bit width. On the X710 controller, this * is programmed through its control registers where as on the X722 this is * configured through the admin queue. Also of note, the X722 allows the LUT to - * be set on a per-PF or VSI basis. At this time, as we only have a single VSI, - * we use the PF setting as it is the primary VSI. + * be set on a per-PF or VSI basis. At this time we use the PF setting. If we + * decide to use the per-VSI LUT in the future, then we will need to modify the + * i40e_add_vsi() function to set the RSS LUT bits in the queueing section. * * We populate the LUT in a round robin fashion with the rx queue indices from 0 - * to i40e_num_trqpairs - 1. + * to i40e_num_trqpairs_per_vsi - 1. */ static boolean_t i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) @@ -2068,15 +2365,20 @@ i40e_config_rss_hlut(i40e_t *i40e, i40e_hw_t *hw) lut_mask = (1 << hw->func_caps.rss_table_entry_width) - 1; } - for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) - ((uint8_t *)hlut)[i] = (i % i40e->i40e_num_trqpairs) & lut_mask; + for (i = 0; i < I40E_HLUT_TABLE_SIZE; i++) { + ((uint8_t *)hlut)[i] = + (i % i40e->i40e_num_trqpairs_per_vsi) & lut_mask; + } if (i40e_is_x722(i40e)) { enum i40e_status_code status; - status = i40e_aq_set_rss_lut(hw, i40e->i40e_vsi_num, B_TRUE, - (uint8_t *)hlut, I40E_HLUT_TABLE_SIZE); + + status = i40e_aq_set_rss_lut(hw, 0, B_TRUE, (uint8_t *)hlut, + I40E_HLUT_TABLE_SIZE); + if (status != I40E_SUCCESS) { - i40e_error(i40e, "failed to set RSS LUT: %d", status); + i40e_error(i40e, "failed to set RSS LUT %d: %d", + status, hw->aq.asq_last_status); goto out; } } else { @@ -2188,8 +2490,34 @@ i40e_chip_start(i40e_t *i40e) i40e_intr_chip_init(i40e); - if (!i40e_config_vsi(i40e, hw)) + rc = i40e_get_mac_seid(i40e); + if (rc == -1) { + i40e_error(i40e, "failed to obtain MAC Uplink SEID"); return (B_FALSE); + } + i40e->i40e_mac_seid = (uint16_t)rc; + + /* + * Create a VEB in order to support multiple VSIs. Each VSI + * functions as a MAC group. This call sets the PF's MAC as + * the uplink port and the PF's default VSI as the default + * downlink port. + */ + rc = i40e_aq_add_veb(hw, i40e->i40e_mac_seid, I40E_DEF_VSI_SEID(i40e), + 0x1, B_TRUE, &i40e->i40e_veb_seid, B_FALSE, NULL); + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "i40e_aq_add_veb() failed %d: %d", rc, + hw->aq.asq_last_status); + return (B_FALSE); + } + + if (!i40e_config_def_vsi(i40e, hw)) + return (B_FALSE); + + for (uint_t i = 1; i < i40e->i40e_num_rx_groups; i++) { + if (!i40e_add_vsi(i40e, hw, i)) + return (B_FALSE); + } if (!i40e_config_rss(i40e, hw)) return (B_FALSE); @@ -2549,7 +2877,7 @@ i40e_setup_tx_hmc(i40e_trqpair_t *itrq) * assigned to traffic class zero, because we don't actually use them. 
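The round-robin LUT fill in i40e_config_rss_hlut() is easy to see in isolation. In the sketch below, EX_HLUT_SIZE and the 7-bit mask are illustrative stand-ins for I40E_HLUT_TABLE_SIZE and the mask derived from rss_table_entry_width.

#include <stdio.h>
#include <stdint.h>

#define	EX_HLUT_SIZE	512
#define	EX_LUT_MASK	((1 << 7) - 1)

int
main(void)
{
	uint8_t hlut[EX_HLUT_SIZE];
	unsigned qpairs_per_vsi = 8;

	/* Spread the RX queue indices evenly across the table. */
	for (unsigned i = 0; i < EX_HLUT_SIZE; i++)
		hlut[i] = (i % qpairs_per_vsi) & EX_LUT_MASK;

	printf("hlut[0..9]:");
	for (unsigned i = 0; i < 10; i++)
		printf(" %u", hlut[i]);
	printf("\n");
	return (0);
}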
*/ bzero(&context, sizeof (struct i40e_vsi_context)); - context.seid = i40e->i40e_vsi_id; + context.seid = I40E_DEF_VSI_SEID(i40e); context.pf_num = hw->pf_id; err = i40e_aq_get_vsi_params(hw, &context, NULL); if (err != I40E_SUCCESS) { @@ -2653,7 +2981,8 @@ i40e_setup_tx_rings(i40e_t *i40e) void i40e_stop(i40e_t *i40e, boolean_t free_allocations) { - int i; + uint_t i; + i40e_hw_t *hw = &i40e->i40e_hw_space; ASSERT(MUTEX_HELD(&i40e->i40e_general_lock)); @@ -2689,6 +3018,27 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) delay(50 * drv_usectohz(1000)); + /* + * We don't delete the default VSI because it replaces the VEB + * after VEB deletion (see the "Delete Element" section). + * Furthermore, since the default VSI is provided by the + * firmware, we never attempt to delete it. + */ + for (i = 1; i < i40e->i40e_num_rx_groups; i++) { + i40e_delete_vsi(i40e, i); + } + + if (i40e->i40e_veb_seid != 0) { + int rc = i40e_aq_delete_element(hw, i40e->i40e_veb_seid, NULL); + + if (rc != I40E_SUCCESS) { + i40e_error(i40e, "Failed to delete VEB %d: %d", rc, + hw->aq.asq_last_status); + } + + i40e->i40e_veb_seid = 0; + } + i40e_intr_chip_fini(i40e); for (i = 0; i < i40e->i40e_num_trqpairs; i++) { @@ -2718,7 +3068,9 @@ i40e_stop(i40e_t *i40e, boolean_t free_allocations) mutex_exit(&i40e->i40e_trqpairs[i].itrq_tx_lock); } - i40e_stat_vsi_fini(i40e); + for (i = 0; i < i40e->i40e_num_rx_groups; i++) { + i40e_stat_vsi_fini(i40e, i); + } i40e->i40e_link_speed = 0; i40e->i40e_link_duplex = 0; @@ -2783,7 +3135,8 @@ i40e_start(i40e_t *i40e, boolean_t alloc) * Enable broadcast traffic; however, do not enable multicast traffic. * That's handle exclusively through MAC's mc_multicst routines. */ - err = i40e_aq_set_vsi_broadcast(hw, i40e->i40e_vsi_id, B_TRUE, NULL); + err = i40e_aq_set_vsi_broadcast(hw, I40E_DEF_VSI_SEID(i40e), B_TRUE, + NULL); if (err != I40E_SUCCESS) { i40e_error(i40e, "failed to set default VSI: %d", err); rc = B_FALSE; diff --git a/usr/src/uts/common/io/i40e/i40e_stats.c b/usr/src/uts/common/io/i40e/i40e_stats.c index 7a4f0faedd..97e1a08628 100644 --- a/usr/src/uts/common/io/i40e/i40e_stats.c +++ b/usr/src/uts/common/io/i40e/i40e_stats.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include "i40e_sw.h" @@ -69,12 +69,7 @@ * --------------------- * * The hardware keeps statistics at each physical function/MAC (PF) and it keeps - * statistics on each virtual station interface (VSI). Currently we only use one - * VSI per PF (see the i40e_main.c theory statement). The hardware has a limited - * number of statistics units available. While every PF is guaranteed to have a - * statistics unit, it is possible that we will run out for a given VSI. We'll - * have to figure out an appropriate strategy here when we end up supporting - * multiple VSIs. + * statistics on each virtual station interface (VSI). * * The hardware keeps these statistics as 32-bit and 48-bit counters. We are * required to read them and then compute the differences between them. The @@ -100,10 +95,10 @@ * data. * * The pf kstats data is stored in the i40e_t`i40e_pf_kstat. It is backed by the - * i40e_t`i40e_pf_stat structure. Similarly the VSI related kstat is in - * i40e_t`i40e_vsi_kstat and the data is backed in the i40e_t`i40e_vsi_stat. All - * of this data is protected by the i40e_stat_lock, which should be taken last, - * when acquiring locks. + * i40e_t`i40e_pf_stat structure. 
Similarly the VSI related kstats are in + * i40e_t`i40e_vsis[idx].iv_kstats and the data is backed in the + * i40e_t`i40e_vsis[idx].iv_stats. All of this data is protected by the + * i40e_stat_lock, which should be taken last, when acquiring locks. */ static void @@ -169,15 +164,15 @@ i40e_stat_get_uint32(i40e_t *i40e, uintptr_t reg, kstat_named_t *kstat, } static void -i40e_stat_vsi_update(i40e_t *i40e, boolean_t init) +i40e_stat_vsi_update(i40e_t *i40e, uint_t idx, boolean_t init) { i40e_vsi_stats_t *ivs; i40e_vsi_kstats_t *ivk; - int id = i40e->i40e_vsi_stat_id; + uint16_t id = i40e->i40e_vsis[idx].iv_stats_id; - ASSERT(i40e->i40e_vsi_kstat != NULL); - ivs = &i40e->i40e_vsi_stat; - ivk = i40e->i40e_vsi_kstat->ks_data; + ASSERT3P(i40e->i40e_vsis[idx].iv_kstats, !=, NULL); + ivs = &i40e->i40e_vsis[idx].iv_stats; + ivk = i40e->i40e_vsis[idx].iv_kstats->ks_data; mutex_enter(&i40e->i40e_stat_lock); @@ -231,39 +226,41 @@ i40e_stat_vsi_kstat_update(kstat_t *ksp, int rw) return (EACCES); i40e = ksp->ks_private; - i40e_stat_vsi_update(i40e, B_FALSE); + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) + i40e_stat_vsi_update(i40e, i, B_FALSE); + return (0); } void -i40e_stat_vsi_fini(i40e_t *i40e) +i40e_stat_vsi_fini(i40e_t *i40e, uint_t idx) { - if (i40e->i40e_vsi_kstat != NULL) { - kstat_delete(i40e->i40e_vsi_kstat); - i40e->i40e_vsi_kstat = NULL; + if (i40e->i40e_vsis[idx].iv_kstats != NULL) { + kstat_delete(i40e->i40e_vsis[idx].iv_kstats); + i40e->i40e_vsis[idx].iv_kstats = NULL; } } boolean_t -i40e_stat_vsi_init(i40e_t *i40e) +i40e_stat_vsi_init(i40e_t *i40e, uint_t idx) { kstat_t *ksp; i40e_vsi_kstats_t *ivk; char buf[64]; + uint16_t vsi_id = i40e->i40e_vsis[idx].iv_seid; - (void) snprintf(buf, sizeof (buf), "vsi_%d", i40e->i40e_vsi_id); + (void) snprintf(buf, sizeof (buf), "vsi_%u", vsi_id); ksp = kstat_create(I40E_MODULE_NAME, ddi_get_instance(i40e->i40e_dip), buf, "net", KSTAT_TYPE_NAMED, sizeof (i40e_vsi_kstats_t) / sizeof (kstat_named_t), 0); if (ksp == NULL) { - i40e_error(i40e, "Failed to create kstats for VSI %d", - i40e->i40e_vsi_id); + i40e_error(i40e, "Failed to create kstats for VSI %u", vsi_id); return (B_FALSE); } - i40e->i40e_vsi_kstat = ksp; + i40e->i40e_vsis[idx].iv_kstats = ksp; ivk = ksp->ks_data; ksp->ks_update = i40e_stat_vsi_kstat_update; ksp->ks_private = i40e; @@ -291,9 +288,9 @@ i40e_stat_vsi_init(i40e_t *i40e) kstat_named_init(&ivk->ivk_tx_errors, "tx_errors", KSTAT_DATA_UINT64); - bzero(&i40e->i40e_vsi_stat, sizeof (i40e_vsi_stats_t)); - i40e_stat_vsi_update(i40e, B_TRUE); - kstat_install(i40e->i40e_vsi_kstat); + bzero(&i40e->i40e_vsis[idx].iv_stats, sizeof (i40e_vsi_stats_t)); + i40e_stat_vsi_update(i40e, idx, B_TRUE); + kstat_install(i40e->i40e_vsis[idx].iv_kstats); return (B_TRUE); } @@ -670,7 +667,12 @@ i40e_stat_pf_init(i40e_t *i40e) void i40e_stats_fini(i40e_t *i40e) { - ASSERT(i40e->i40e_vsi_kstat == NULL); +#ifdef DEBUG + for (uint_t i = 0; i < i40e->i40e_num_rx_groups; i++) { + ASSERT3P(i40e->i40e_vsis[i].iv_kstats, ==, NULL); + } +#endif + if (i40e->i40e_pf_kstat != NULL) { kstat_delete(i40e->i40e_pf_kstat); i40e->i40e_pf_kstat = NULL; @@ -1249,6 +1251,12 @@ i40e_stats_trqpair_init(i40e_trqpair_t *itrq) kstat_named_init(&tsp->itxs_hck_badl4, "tx_hck_badl4", KSTAT_DATA_UINT64); tsp->itxs_hck_badl4.value.ui64 = 0; + kstat_named_init(&tsp->itxs_lso_nohck, "tx_lso_nohck", + KSTAT_DATA_UINT64); + tsp->itxs_lso_nohck.value.ui64 = 0; + kstat_named_init(&tsp->itxs_bind_fails, "tx_bind_fails", + KSTAT_DATA_UINT64); + tsp->itxs_bind_fails.value.ui64 = 0; 
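The per-VSI kstats created in i40e_stat_vsi_init() above are named after the VSI's SEID, one per RX group. A trivial sketch of the naming scheme follows; the SEID values are made up for illustration.

#include <stdio.h>

int
main(void)
{
	unsigned seids[] = { 390, 391, 392 };	/* made-up SEIDs */
	char buf[64];

	for (unsigned i = 0; i < sizeof (seids) / sizeof (seids[0]); i++) {
		(void) snprintf(buf, sizeof (buf), "vsi_%u", seids[i]);
		printf("RX group %u -> kstat \"%s\"\n", i, buf);
	}
	return (0);
}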
kstat_named_init(&tsp->itxs_err_notcb, "tx_err_notcb", KSTAT_DATA_UINT64); tsp->itxs_err_notcb.value.ui64 = 0; diff --git a/usr/src/uts/common/io/i40e/i40e_sw.h b/usr/src/uts/common/io/i40e/i40e_sw.h index 78aced0144..fbb6d11b41 100644 --- a/usr/src/uts/common/io/i40e/i40e_sw.h +++ b/usr/src/uts/common/io/i40e/i40e_sw.h @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2017 Tegile Systems, Inc. All rights reserved. */ @@ -152,9 +152,10 @@ typedef enum i40e_itr_index { } i40e_itr_index_t; /* - * Table 1-5 of the PRM notes that LSO supports up to 256 KB. + * The hardware claims to support LSO up to 256 KB, but due to the limitations + * imposed by the IP header for non-jumbo frames, we cap it at 64 KB. */ -#define I40E_LSO_MAXLEN (256 * 1024) +#define I40E_LSO_MAXLEN (64 * 1024) #define I40E_CYCLIC_PERIOD NANOSEC /* 1 second */ #define I40E_DRAIN_RX_WAIT (500 * MILLISEC) /* In us */ @@ -173,13 +174,22 @@ typedef enum i40e_itr_index { #define I40E_BUF_IPHDR_ALIGNMENT 2 /* - * The XL710 controller has a limit of eight buffers being allowed to be used - * for the transmission of a single frame. This is defined in 8.4.1 - Transmit + * The XL710 controller has a total of eight buffers available for the + * transmission of any single frame. This is defined in 8.4.1 - Transmit * Packet in System Memory. */ #define I40E_TX_MAX_COOKIE 8 /* + * An LSO frame can be as large as 64KB, so we allow a DMA bind to span more + * cookies than a non-LSO frame. The key here to is to select a value such + * that once the HW has chunked up the LSO frame into MSS-sized segments that no + * single segment spans more than 8 cookies (see comments for + * I40E_TX_MAX_COOKIE) + */ +#define I40E_TX_LSO_MAX_COOKIE 32 + +/* * Sizing to determine the amount of available descriptors at which we'll * consider ourselves blocked. Also, when we have these available, we'll then * consider ourselves available to transmit to MAC again. Strictly speaking, the @@ -203,6 +213,12 @@ typedef enum i40e_itr_index { #define I40E_MAX_TX_DMA_THRESH INT32_MAX /* + * The max size of each individual tx buffer is 16KB - 1. + * See table 8-17 + */ +#define I40E_MAX_TX_BUFSZ 0x0000000000003FFFull + +/* * Resource sizing counts. There are various aspects of hardware where we may * have some variable number of elements that we need to handle. Such as the * hardware capabilities and switch capacities. We cannot know a priori how many @@ -240,21 +256,6 @@ typedef enum i40e_itr_index { #define I40E_HMC_TX_TPH_DISABLE 0 /* - * Whenever we establish and create a VSI, we need to assign some number of - * queues that it's allowed to access from the PF. Because we only have a single - * VSI per PF at this time, we assign it all the queues. - * - * Many of the devices support what's called Data-center Bridging. Which is a - * feature that we don't have much use of at this time. However, we still need - * to fill in this information. We follow the guidance of the note in Table 7-80 - * which talks about bytes 62-77. It says that if we don't want to assign - * anything to traffic classes, we should set the field to zero. Effectively - * this means that everything in the system is assigned to traffic class zero. - */ -#define I40E_ASSIGN_ALL_QUEUES 0 -#define I40E_TRAFFIC_CLASS_NO_QUEUES 0 - -/* * This defines the error mask that we care about from rx descriptors. Currently * we're only concerned with the general errors and oversize errors. 
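A quick sanity check of the TX limits defined above: even a maximally sized 64KB LSO payload bound with cookies capped at I40E_MAX_TX_BUFSZ needs several cookies before mblk fragmentation is even considered, which motivates the larger LSO SGL length of 32. The constants here are local copies used only for the arithmetic.

#include <stdio.h>

#define	EX_LSO_MAXLEN	(64 * 1024)
#define	EX_MAX_TX_BUFSZ	0x3FFFu		/* 16KB - 1, per descriptor buffer */

int
main(void)
{
	unsigned min_cookies =
	    (EX_LSO_MAXLEN + EX_MAX_TX_BUFSZ - 1) / EX_MAX_TX_BUFSZ;

	printf("a %u-byte LSO payload needs at least %u cookies of at "
	    "most %u bytes each\n", (unsigned)EX_LSO_MAXLEN, min_cookies,
	    (unsigned)EX_MAX_TX_BUFSZ);
	return (0);
}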
*/ @@ -268,12 +269,12 @@ typedef enum i40e_itr_index { #define I40E_DDI_PROP_LEN 64 /* - * We currently consolidate some overrides that we use in the code here. These - * will be gone in the fullness of time, but as we're bringing up the device, - * this is what we use. + * Place an artificial limit on the max number of groups. The X710 + * series supports up to 384 VSIs to be partitioned across PFs as the + * driver sees fit. But until we support more interrupts this seems + * like a good place to start. */ -#define I40E_GROUP_MAX 1 -#define I40E_TRQPAIR_MAX 1 +#define I40E_GROUP_MAX 32 #define I40E_GROUP_NOMSIX 1 #define I40E_TRQPAIR_NOMSIX 1 @@ -405,18 +406,29 @@ typedef struct i40e_rx_control_block { typedef enum { I40E_TX_NONE, I40E_TX_COPY, - I40E_TX_DMA + I40E_TX_DMA, + I40E_TX_DESC, } i40e_tx_type_t; typedef struct i40e_tx_desc i40e_tx_desc_t; +typedef struct i40e_tx_context_desc i40e_tx_context_desc_t; typedef union i40e_32byte_rx_desc i40e_rx_desc_t; +struct i40e_dma_bind_info { + caddr_t dbi_paddr; + size_t dbi_len; +}; + typedef struct i40e_tx_control_block { struct i40e_tx_control_block *tcb_next; mblk_t *tcb_mp; i40e_tx_type_t tcb_type; ddi_dma_handle_t tcb_dma_handle; + ddi_dma_handle_t tcb_lso_dma_handle; i40e_dma_buffer_t tcb_dma; + struct i40e_dma_bind_info *tcb_bind_info; + uint_t tcb_bind_ncookies; + boolean_t tcb_used_lso; } i40e_tx_control_block_t; /* @@ -526,6 +538,8 @@ typedef struct i40e_txq_stat { kstat_named_t itxs_hck_nol4info; /* Missing l4 info */ kstat_named_t itxs_hck_badl3; /* Not IPv4/IPv6 */ kstat_named_t itxs_hck_badl4; /* Bad L4 Paylaod */ + kstat_named_t itxs_lso_nohck; /* Missing offloads for LSO */ + kstat_named_t itxs_bind_fails; /* DMA bind failures */ kstat_named_t itxs_err_notcb; /* No tcb's available */ kstat_named_t itxs_err_nodescs; /* No tcb's available */ @@ -761,6 +775,25 @@ typedef struct i40e_func_rsrc { uint_t ifr_nmcastfilt_used; } i40e_func_rsrc_t; +typedef struct i40e_vsi { + uint16_t iv_seid; + uint16_t iv_number; + kstat_t *iv_kstats; + i40e_vsi_stats_t iv_stats; + uint16_t iv_stats_id; +} i40e_vsi_t; + +/* + * While irg_index and irg_grp_hdl aren't used anywhere, they are + * still useful for debugging. + */ +typedef struct i40e_rx_group { + uint32_t irg_index; /* index in i40e_rx_groups[] */ + uint16_t irg_vsi_seid; /* SEID of VSI for this group */ + mac_group_handle_t irg_grp_hdl; /* handle to mac_group_t */ + struct i40e *irg_i40e; /* ref to i40e_t */ +} i40e_rx_group_t; + /* * Main i40e per-instance state. */ @@ -789,11 +822,18 @@ typedef struct i40e { struct i40e_aq_get_phy_abilities_resp i40e_phy; void *i40e_aqbuf; +#define I40E_DEF_VSI_IDX 0 +#define I40E_DEF_VSI(i40e) ((i40e)->i40e_vsis[I40E_DEF_VSI_IDX]) +#define I40E_DEF_VSI_SEID(i40e) (I40E_DEF_VSI(i40e).iv_seid) + /* * Device state, switch information, and resources. 
*/ - int i40e_vsi_id; - uint16_t i40e_vsi_num; + i40e_vsi_t i40e_vsis[I40E_GROUP_MAX]; + uint16_t i40e_mac_seid; /* SEID of physical MAC */ + uint16_t i40e_veb_seid; /* switch atop MAC (SEID) */ + uint16_t i40e_vsi_avail; /* VSIs avail to this PF */ + uint16_t i40e_vsi_used; /* VSIs used by this PF */ struct i40e_device *i40e_device; i40e_func_rsrc_t i40e_resources; uint16_t i40e_switch_rsrc_alloc; @@ -814,12 +854,13 @@ typedef struct i40e { */ i40e_trqpair_t *i40e_trqpairs; boolean_t i40e_mr_enable; - int i40e_num_trqpairs; + uint_t i40e_num_trqpairs; /* total TRQPs (per PF) */ + uint_t i40e_num_trqpairs_per_vsi; /* TRQPs per VSI */ uint_t i40e_other_itr; - int i40e_num_rx_groups; + i40e_rx_group_t *i40e_rx_groups; + uint_t i40e_num_rx_groups; int i40e_num_rx_descs; - mac_group_handle_t i40e_rx_group_handle; uint32_t i40e_rx_ring_size; uint32_t i40e_rx_buf_size; boolean_t i40e_rx_hcksum_enable; @@ -832,6 +873,7 @@ typedef struct i40e { uint32_t i40e_tx_buf_size; uint32_t i40e_tx_block_thresh; boolean_t i40e_tx_hcksum_enable; + boolean_t i40e_tx_lso_enable; uint32_t i40e_tx_dma_min; uint_t i40e_tx_itr; @@ -855,6 +897,7 @@ typedef struct i40e { */ ddi_dma_attr_t i40e_static_dma_attr; ddi_dma_attr_t i40e_txbind_dma_attr; + ddi_dma_attr_t i40e_txbind_lso_dma_attr; ddi_device_acc_attr_t i40e_desc_acc_attr; ddi_device_acc_attr_t i40e_buf_acc_attr; @@ -872,10 +915,7 @@ typedef struct i40e { */ kmutex_t i40e_stat_lock; kstat_t *i40e_pf_kstat; - kstat_t *i40e_vsi_kstat; i40e_pf_stats_t i40e_pf_stat; - i40e_vsi_stats_t i40e_vsi_stat; - uint16_t i40e_vsi_stat_id; /* * Misc. stats and counters that should maybe one day be kstats. @@ -975,8 +1015,8 @@ extern void i40e_tx_cleanup_ring(i40e_trqpair_t *); */ extern boolean_t i40e_stats_init(i40e_t *); extern void i40e_stats_fini(i40e_t *); -extern boolean_t i40e_stat_vsi_init(i40e_t *); -extern void i40e_stat_vsi_fini(i40e_t *); +extern boolean_t i40e_stat_vsi_init(i40e_t *, uint_t); +extern void i40e_stat_vsi_fini(i40e_t *, uint_t); extern boolean_t i40e_stats_trqpair_init(i40e_trqpair_t *); extern void i40e_stats_trqpair_fini(i40e_trqpair_t *); extern int i40e_m_stat(void *, uint_t, uint64_t *); diff --git a/usr/src/uts/common/io/i40e/i40e_transceiver.c b/usr/src/uts/common/io/i40e/i40e_transceiver.c index 75132e27f0..28f3b88dee 100644 --- a/usr/src/uts/common/io/i40e/i40e_transceiver.c +++ b/usr/src/uts/common/io/i40e/i40e_transceiver.c @@ -11,7 +11,7 @@ /* * Copyright 2015 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include "i40e_sw.h" @@ -60,19 +60,19 @@ * This size is then rounded up to the nearest 1k chunk, which represents the * actual amount of memory that we'll allocate for a single frame. * - * Note, that for rx, we do something that might be unexpected. We always add + * Note, that for RX, we do something that might be unexpected. We always add * an extra two bytes to the frame size that we allocate. We then offset the DMA * address that we receive a packet into by two bytes. This ensures that the IP * header will always be 4 byte aligned because the MAC header is either 14 or * 18 bytes in length, depending on the use of 802.1Q tagging, which makes IP's * and MAC's lives easier. 
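The 2-byte receive offset described above can be verified with simple arithmetic: for both untagged and 802.1Q-tagged frames the IP header lands on a 4-byte boundary.

#include <stdio.h>

int
main(void)
{
	unsigned mac_hdr_lens[] = { 14, 18 };	/* untagged and 802.1Q */

	for (unsigned i = 0; i < 2; i++) {
		unsigned ip_off = 2 + mac_hdr_lens[i];

		printf("MAC header %2u bytes -> IP header at buffer "
		    "offset %u (offset %% 4 = %u)\n",
		    mac_hdr_lens[i], ip_off, ip_off % 4);
	}
	return (0);
}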
* - * Both the rx and tx descriptor rings (which are what we use to communicate + * Both the RX and TX descriptor rings (which are what we use to communicate * with hardware) are allocated as a single region of DMA memory which is the * size of the descriptor (4 bytes and 2 bytes respectively) times the total - * number of descriptors for an rx and tx ring. + * number of descriptors for an RX and TX ring. * - * While the rx and tx descriptors are allocated using DMA-based memory, the + * While the RX and TX descriptors are allocated using DMA-based memory, the * control blocks for each of them are allocated using normal kernel memory. * They aren't special from a DMA perspective. We'll go over the design of both * receiving and transmitting separately, as they have slightly different @@ -113,16 +113,16 @@ * * To try and ensure that the device always has blocks that it can receive data * into, we maintain two lists of control blocks, a working list and a free - * list. Each list is sized equal to the number of descriptors in the rx ring. - * During the GLDv3 mc_start routine, we allocate a number of rx control blocks + * list. Each list is sized equal to the number of descriptors in the RX ring. + * During the GLDv3 mc_start routine, we allocate a number of RX control blocks * equal to twice the number of descriptors in the ring and we assign them * equally to the free list and to the working list. Each control block also has * DMA memory allocated and associated with which it will be used to receive the * actual packet data. All of a received frame's data will end up in a single * DMA buffer. * - * During operation, we always maintain the invariant that each rx descriptor - * has an associated rx control block which lives in the working list. If we + * During operation, we always maintain the invariant that each RX descriptor + * has an associated RX control block which lives in the working list. If we * feel that we should loan up DMA memory to MAC in the form of a message block, * we can only do so if we can maintain this invariant. To do that, we swap in * one of the buffers from the free list. If none are available, then we resort @@ -130,14 +130,14 @@ * size. * * Loaned message blocks come back to use when freemsg(9F) or freeb(9F) is - * called on the block, at which point we restore the rx control block to the + * called on the block, at which point we restore the RX control block to the * free list and are able to reuse the DMA memory again. While the scheme may * seem odd, it importantly keeps us out of trying to do any DMA allocations in * the normal path of operation, even though we may still have to allocate * message blocks and copy. * - * The following state machine describes the life time of a rx control block. In - * the diagram we abbrviate the rx ring descriptor entry as rxd and the rx + * The following state machine describes the life time of a RX control block. In + * the diagram we abbrviate the RX ring descriptor entry as rxd and the rx * control block entry as rcb. * * | | @@ -160,11 +160,11 @@ * +--------------------<-----| rcb loaned to MAC | * +-------------------+ * - * Finally, note that every rx control block has a reference count on it. One + * Finally, note that every RX control block has a reference count on it. One * reference is added as long as the driver has had the GLDv3 mc_start endpoint * called. If the GLDv3 mc_stop entry point is called, IP has been unplumbed and * no other DLPI consumers remain, then we'll decrement the reference count by - * one. 
Whenever we loan up the rx control block and associated buffer to MAC, + * one. Whenever we loan up the RX control block and associated buffer to MAC, * then we bump the reference count again. Even though the device is stopped, * there may still be loaned frames in upper levels that we'll want to account * for. Our callback from freemsg(9F)/freeb(9F) will take care of making sure @@ -192,10 +192,10 @@ * state tracking. Effectively, we cache the HEAD register and then update it * ourselves based on our work. * - * When we iterate over the rx descriptors and thus the received frames, we are + * When we iterate over the RX descriptors and thus the received frames, we are * either in an interrupt context or we've been asked by MAC to poll on the * ring. If we've been asked to poll on the ring, we have a maximum number of - * bytes of mblk_t's to return. If processing an rx descriptor would cause us to + * bytes of mblk_t's to return. If processing an RX descriptor would cause us to * exceed that count, then we do not process it. When in interrupt context, we * don't have a strict byte count. However, to ensure liveness, we limit the * amount of data based on a configuration value @@ -249,31 +249,44 @@ * differently due to the fact that all data is originated by the operating * system and not by the device. * - * Like rx, there is both a descriptor ring that we use to communicate to the - * driver and which points to the memory used to transmit a frame. Similarly, - * there is a corresponding transmit control block. Each transmit control block - * has a region of DMA memory allocated to it; however, the way we use it - * varies. + * Like RX, there is both a descriptor ring that we use to communicate to the + * driver and which points to the memory used to transmit a frame. Similarly, + * there is a corresponding transmit control block, however, the correspondence + * between descriptors and control blocks is more complex and not necessarily + * 1-to-1. * * The driver is asked to process a single frame at a time. That message block * may be made up of multiple fragments linked together by the mblk_t`b_cont * member. The device has a hard limit of up to 8 buffers being allowed for use * for a single logical frame. For each fragment, we'll try and use an entry - * from the tx descriptor ring and then we'll allocate a corresponding tx - * control block. Depending on the size of the fragment, we may copy it around - * or we might instead try to do DMA binding of the fragment. - * - * If we exceed the number of blocks that fit, we'll try to pull up the block - * and then we'll do a DMA bind and send it out. - * - * If we don't have enough space in the ring or tx control blocks available, + * from the TX descriptor ring and then we'll allocate a corresponding TX + * control block. + * + * We alter our DMA strategy based on a threshold tied to the frame size. + * This threshold is configurable via the tx_dma_threshold property. If the + * frame size is above the threshold, we do DMA binding of the fragments, + * building a control block and data descriptor for each piece. If it's below + * or at the threshold then we just use a single control block and data + * descriptor and simply bcopy all of the fragments into the pre-allocated DMA + * buffer in the control block. For the LSO TX case we always do DMA binding of + * the fragments, with one control block and one TX data descriptor allocated + * per fragment. 
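The copy-versus-bind decision described above reduces to a small predicate. The sketch below is not the driver's TX path; ex_pick_strategy() is a hypothetical helper and the threshold value is a made-up example of what the tx_dma_threshold property supplies.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

typedef enum { EX_TX_COPY, EX_TX_BIND } ex_tx_strategy_t;

static ex_tx_strategy_t
ex_pick_strategy(size_t frame_size, size_t dma_threshold, bool is_lso)
{
	if (is_lso)
		return (EX_TX_BIND);	/* LSO frames are always DMA bound */
	if (frame_size > dma_threshold)
		return (EX_TX_BIND);
	return (EX_TX_COPY);		/* small frames are bcopy'd */
}

int
main(void)
{
	size_t threshold = 256;		/* made-up tx_dma_threshold value */

	printf("128 bytes:  %s\n", ex_pick_strategy(128, threshold,
	    false) == EX_TX_COPY ? "copy" : "bind");
	printf("1500 bytes: %s\n", ex_pick_strategy(1500, threshold,
	    false) == EX_TX_COPY ? "copy" : "bind");
	printf("LSO:        %s\n", ex_pick_strategy(60000, threshold,
	    true) == EX_TX_COPY ? "copy" : "bind");
	return (0);
}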
+ * + * Furthermore, if the frame requires HW offloads such as LSO, tunneling or + * filtering, then the TX data descriptors must be preceeded by a single TX + * context descriptor. Because there is no DMA transfer associated with the + * context descriptor, we allocate a control block with a special type which + * indicates to the TX ring recycle code that there are no associated DMA + * resources to unbind when the control block is free'd. + * + * If we don't have enough space in the ring or TX control blocks available, * then we'll return the unprocessed message block to MAC. This will induce flow * control and once we recycle enough entries, we'll once again enable sending * on the ring. * * We size the working list as equal to the number of descriptors in the ring. * We size the free list as equal to 1.5 times the number of descriptors in the - * ring. We'll allocate a number of tx control block entries equal to the number + * ring. We'll allocate a number of TX control block entries equal to the number * of entries in the free list. By default, all entries are placed in the free * list. As we come along and try to send something, we'll allocate entries from * the free list and add them to the working list, where they'll stay until the @@ -325,7 +338,7 @@ * +------------------+ +------------------+ * | tcb on free list |---*------------------>| tcb on work list | * +------------------+ . +------------------+ - * ^ . tcb allocated | + * ^ . N tcbs allocated[1] | * | to send frame v * | or fragment on | * | wire, mblk from | @@ -335,20 +348,27 @@ * . * . Hardware indicates * entry transmitted. - * tcb recycled, mblk + * tcbs recycled, mblk * from MAC freed. * + * [1] We allocate N tcbs to transmit a single frame where N can be 1 context + * descriptor plus 1 data descriptor, in the non-DMA-bind case. In the DMA + * bind case, N can be 1 context descriptor plus 1 data descriptor per + * b_cont in the mblk. In this case, the mblk is associated with the first + * data descriptor and freed as part of freeing that data descriptor. + * * ------------ * Blocking MAC * ------------ * - * Wen performing transmit, we can run out of descriptors and ring entries. When - * such a case happens, we return the mblk_t to MAC to indicate that we've been - * blocked. At that point in time, MAC becomes blocked and will not transmit - * anything out that specific ring until we notify MAC. To indicate that we're - * in such a situation we set i40e_trqpair_t`itrq_tx_blocked member to B_TRUE. + * When performing transmit, we can run out of descriptors and ring entries. + * When such a case happens, we return the mblk_t to MAC to indicate that we've + * been blocked. At that point in time, MAC becomes blocked and will not + * transmit anything out that specific ring until we notify MAC. To indicate + * that we're in such a situation we set i40e_trqpair_t`itrq_tx_blocked member + * to B_TRUE. * - * When we recycle tx descriptors then we'll end up signaling MAC by calling + * When we recycle TX descriptors then we'll end up signaling MAC by calling * mac_tx_ring_update() if we were blocked, letting it know that it's safe to * start sending frames out to us again. */ @@ -367,13 +387,15 @@ /* * This structure is used to maintain information and flags related to - * transmitting a frame. The first member is the set of flags we need to or into - * the command word (generally checksumming related). The second member controls - * the word offsets which is required for IP and L4 checksumming. + * transmitting a frame. 
These fields are ultimately used to construct the + * TX data descriptor(s) and, if necessary, the TX context descriptor. */ typedef struct i40e_tx_context { - enum i40e_tx_desc_cmd_bits itc_cmdflags; - uint32_t itc_offsets; + enum i40e_tx_desc_cmd_bits itc_data_cmdflags; + uint32_t itc_data_offsets; + enum i40e_tx_ctx_desc_cmd_bits itc_ctx_cmdflags; + uint32_t itc_ctx_tsolen; + uint32_t itc_ctx_mss; } i40e_tx_context_t; /* @@ -395,14 +417,18 @@ i40e_debug_rx_t i40e_debug_rx_mode = I40E_DEBUG_RX_DEFAULT; * i40e_static_dma_attr, is designed to be used for both the descriptor rings * and the static buffers that we associate with control blocks. For this * reason, we force an SGL length of one. While technically the driver supports - * a larger SGL (5 on rx and 8 on tx), we opt to only use one to simplify our + * a larger SGL (5 on RX and 8 on TX), we opt to only use one to simplify our * management here. In addition, when the Intel common code wants to allocate * memory via the i40e_allocate_virt_mem osdep function, we have it leverage * the static dma attr. * - * The second set of attributes, i40e_txbind_dma_attr, is what we use when we're - * binding a bunch of mblk_t fragments to go out the door. Note that the main - * difference here is that we're allowed a larger SGL length -- eight. + * The latter two sets of attributes, are what we use when we're binding a + * bunch of mblk_t fragments to go out the door. Note that the main difference + * here is that we're allowed a larger SGL length. For non-LSO TX, we + * restrict the SGL length to match the number of TX buffers available to the + * PF (8). For the LSO case we can go much larger, with the caveat that each + * MSS-sized chunk (segment) must not span more than 8 data descriptors and + * hence must not span more than 8 cookies. * * Note, we default to setting ourselves to be DMA capable here. However, * because we could have multiple instances which have different FMA error @@ -429,7 +455,7 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DMA_ATTR_V0, /* version number */ 0x0000000000000000ull, /* low address */ 0xFFFFFFFFFFFFFFFFull, /* high address */ - 0x00000000FFFFFFFFull, /* dma counter max */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ I40E_DMA_ALIGNMENT, /* alignment */ 0x00000FFF, /* burst sizes */ 0x00000001, /* minimum transfer size */ @@ -440,6 +466,21 @@ static const ddi_dma_attr_t i40e_g_txbind_dma_attr = { DDI_DMA_FLAGERR /* DMA flags */ }; +static const ddi_dma_attr_t i40e_g_txbind_lso_dma_attr = { + DMA_ATTR_V0, /* version number */ + 0x0000000000000000ull, /* low address */ + 0xFFFFFFFFFFFFFFFFull, /* high address */ + I40E_MAX_TX_BUFSZ - 1, /* dma counter max */ + I40E_DMA_ALIGNMENT, /* alignment */ + 0x00000FFF, /* burst sizes */ + 0x00000001, /* minimum transfer size */ + 0x00000000FFFFFFFFull, /* maximum transfer size */ + 0xFFFFFFFFFFFFFFFFull, /* maximum segment size */ + I40E_TX_LSO_MAX_COOKIE, /* scatter/gather list length */ + 0x00000001, /* granularity */ + DDI_DMA_FLAGERR /* DMA flags */ +}; + /* * Next, we have the attributes for these structures. 
The descriptor rings are * all strictly little endian, while the data buffers are just arrays of bytes @@ -668,7 +709,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_work_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_ring_size, KM_NOSLEEP); if (rxd->rxd_work_list == NULL) { - i40e_error(i40e, "failed to allocate rx work list for a ring " + i40e_error(i40e, "failed to allocate RX work list for a ring " "of %d entries for ring %d", rxd->rxd_ring_size, itrq->itrq_index); goto cleanup; @@ -677,7 +718,7 @@ i40e_alloc_rx_data(i40e_t *i40e, i40e_trqpair_t *itrq) rxd->rxd_free_list = kmem_zalloc(sizeof (i40e_rx_control_block_t *) * rxd->rxd_free_list_size, KM_NOSLEEP); if (rxd->rxd_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry rx free list " + i40e_error(i40e, "failed to allocate a %d entry RX free list " "for ring %d", rxd->rxd_free_list_size, itrq->itrq_index); goto cleanup; } @@ -765,7 +806,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) i40e_t *i40e = rxd->rxd_i40e; /* - * First allocate the rx descriptor ring. + * First allocate the RX descriptor ring. */ dmasz = sizeof (i40e_rx_desc_t) * rxd->rxd_ring_size; VERIFY(dmasz > 0); @@ -773,7 +814,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate DMA resources " - "for rx descriptor ring"); + "for RX descriptor ring"); return (B_FALSE); } rxd->rxd_desc_ring = @@ -799,7 +840,7 @@ i40e_alloc_rx_dma(i40e_rx_data_t *rxd) if (i40e_alloc_dma_buffer(i40e, dmap, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate rx dma buffer"); + i40e_error(i40e, "failed to allocate RX dma buffer"); return (B_FALSE); } @@ -841,6 +882,10 @@ i40e_free_tx_dma(i40e_trqpair_t *itrq) ddi_dma_free_handle(&tcb->tcb_dma_handle); tcb->tcb_dma_handle = NULL; } + if (tcb->tcb_lso_dma_handle != NULL) { + ddi_dma_free_handle(&tcb->tcb_lso_dma_handle); + tcb->tcb_lso_dma_handle = NULL; + } } fsz = sizeof (i40e_tx_control_block_t) * @@ -881,7 +926,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) (i40e->i40e_tx_ring_size >> 1); /* - * Allocate an additional tx descriptor for the writeback head. + * Allocate an additional TX descriptor for the writeback head. 
*/ dmasz = sizeof (i40e_tx_desc_t) * itrq->itrq_tx_ring_size; dmasz += sizeof (i40e_tx_desc_t); @@ -890,7 +935,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) if (i40e_alloc_dma_buffer(i40e, &itrq->itrq_desc_area, &i40e->i40e_static_dma_attr, &i40e->i40e_desc_acc_attr, B_FALSE, B_TRUE, dmasz) == B_FALSE) { - i40e_error(i40e, "failed to allocate DMA resources for tx " + i40e_error(i40e, "failed to allocate DMA resources for TX " "descriptor ring"); return (B_FALSE); } @@ -905,7 +950,7 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_work_list = kmem_zalloc(itrq->itrq_tx_ring_size * sizeof (i40e_tx_control_block_t *), KM_NOSLEEP); if (itrq->itrq_tcb_work_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx work list " + i40e_error(i40e, "failed to allocate a %d entry TX work list " "for ring %d", itrq->itrq_tx_ring_size, itrq->itrq_index); goto cleanup; } @@ -913,14 +958,14 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) itrq->itrq_tcb_free_list = kmem_zalloc(itrq->itrq_tx_free_list_size * sizeof (i40e_tx_control_block_t *), KM_SLEEP); if (itrq->itrq_tcb_free_list == NULL) { - i40e_error(i40e, "failed to allocate a %d entry tx free list " + i40e_error(i40e, "failed to allocate a %d entry TX free list " "for ring %d", itrq->itrq_tx_free_list_size, itrq->itrq_index); goto cleanup; } /* - * We allocate enough tx control blocks to cover the free list. + * We allocate enough TX control blocks to cover the free list. */ itrq->itrq_tcb_area = kmem_zalloc(sizeof (i40e_tx_control_block_t) * itrq->itrq_tx_free_list_size, KM_NOSLEEP); @@ -948,18 +993,29 @@ i40e_alloc_tx_dma(i40e_trqpair_t *itrq) &i40e->i40e_txbind_dma_attr, DDI_DMA_DONTWAIT, NULL, &tcb->tcb_dma_handle); if (ret != DDI_SUCCESS) { - i40e_error(i40e, "failed to allocate DMA handle for tx " + i40e_error(i40e, "failed to allocate DMA handle for TX " "data binding on ring %d: %d", itrq->itrq_index, ret); tcb->tcb_dma_handle = NULL; goto cleanup; } + ret = ddi_dma_alloc_handle(i40e->i40e_dip, + &i40e->i40e_txbind_lso_dma_attr, DDI_DMA_DONTWAIT, NULL, + &tcb->tcb_lso_dma_handle); + if (ret != DDI_SUCCESS) { + i40e_error(i40e, "failed to allocate DMA handle for TX " + "LSO data binding on ring %d: %d", itrq->itrq_index, + ret); + tcb->tcb_lso_dma_handle = NULL; + goto cleanup; + } + if (i40e_alloc_dma_buffer(i40e, &tcb->tcb_dma, &i40e->i40e_static_dma_attr, &i40e->i40e_buf_acc_attr, B_TRUE, B_FALSE, dmasz) == B_FALSE) { i40e_error(i40e, "failed to allocate %ld bytes of " - "DMA for tx data binding on ring %d", dmasz, + "DMA for TX data binding on ring %d", dmasz, itrq->itrq_index); goto cleanup; } @@ -989,10 +1045,17 @@ i40e_free_ring_mem(i40e_t *i40e, boolean_t failed_init) i40e_rx_data_t *rxd = i40e->i40e_trqpairs[i].itrq_rxdata; /* - * Clean up our rx data. We have to free DMA resources first and + * In some cases i40e_alloc_rx_data() may have failed + * and in that case there is no rxd to free. + */ + if (rxd == NULL) + continue; + + /* + * Clean up our RX data. We have to free DMA resources first and * then if we have no more pending RCB's, then we'll go ahead * and clean things up. Note, we can't set the stopped flag on - * the rx data until after we've done the first pass of the + * the RX data until after we've done the first pass of the * pending resources. Otherwise we might race with * i40e_rx_recycle on determining who should free the * i40e_rx_data_t above. 
@@ -1055,6 +1118,8 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_txbind_dma_attr, &i40e->i40e_txbind_dma_attr, sizeof (ddi_dma_attr_t)); + bcopy(&i40e_g_txbind_lso_dma_attr, &i40e->i40e_txbind_lso_dma_attr, + sizeof (ddi_dma_attr_t)); bcopy(&i40e_g_desc_acc_attr, &i40e->i40e_desc_acc_attr, sizeof (ddi_device_acc_attr_t)); bcopy(&i40e_g_buf_acc_attr, &i40e->i40e_buf_acc_attr, @@ -1063,9 +1128,13 @@ i40e_init_dma_attrs(i40e_t *i40e, boolean_t fma) if (fma == B_TRUE) { i40e->i40e_static_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags |= + DDI_DMA_FLAGERR; } else { i40e->i40e_static_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; i40e->i40e_txbind_dma_attr.dma_attr_flags &= ~DDI_DMA_FLAGERR; + i40e->i40e_txbind_lso_dma_attr.dma_attr_flags &= + ~DDI_DMA_FLAGERR; } } @@ -1102,7 +1171,7 @@ i40e_rcb_alloc(i40e_rx_data_t *rxd) /* * This is the callback that we get from the OS when freemsg(9F) has been called * on a loaned descriptor. In addition, if we take the last reference count - * here, then we have to tear down all of the rx data. + * here, then we have to tear down all of the RX data. */ void i40e_rx_recycle(caddr_t arg) @@ -1768,16 +1837,19 @@ mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi) * to properly program the hardware for checksum offload as well as the * generally required flags. * - * The i40e_tx_context_t`itc_cmdflags contains the set of flags we need to or - * into the descriptor based on the checksum flags for this mblk_t and the + * The i40e_tx_context_t`itc_data_cmdflags contains the set of flags we need to + * 'or' into the descriptor based on the checksum flags for this mblk_t and the * actual information we care about. + * + * If the mblk requires LSO then we'll also gather the information that will be + * used to construct the Transmit Context Descriptor. */ static int i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, i40e_tx_context_t *tctx) { int ret; - uint32_t flags, start; + uint32_t chkflags, start, mss, lsoflags; mac_ether_offload_info_t meo; i40e_txq_stat_t *txs = &itrq->itrq_txstat; @@ -1786,8 +1858,10 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, if (i40e->i40e_tx_hcksum_enable != B_TRUE) return (0); - mac_hcksum_get(mp, &start, NULL, NULL, NULL, &flags); - if (flags == 0) + mac_hcksum_get(mp, &start, NULL, NULL, NULL, &chkflags); + mac_lso_get(mp, &mss, &lsoflags); + + if (chkflags == 0 && lsoflags == 0) return (0); if ((ret = mac_ether_offload_info(mp, &meo)) != 0) { @@ -1800,7 +1874,7 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, * have sufficient information and then set the proper fields in the * command structure. 
*/ - if (flags & HCK_IPV4_HDRCKSUM) { + if (chkflags & HCK_IPV4_HDRCKSUM) { if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); @@ -1813,10 +1887,10 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4_CSUM; + tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo.meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } @@ -1826,13 +1900,13 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, * onto seeing if we have enough information for the L4 checksum * offload. */ - if (flags & HCK_PARTIALCKSUM) { + if (chkflags & HCK_PARTIALCKSUM) { if ((meo.meoi_flags & MEOI_L4INFO_SET) == 0) { txs->itxs_hck_nol4info.value.ui64++; return (-1); } - if (!(flags & HCK_IPV4_HDRCKSUM)) { + if (!(chkflags & HCK_IPV4_HDRCKSUM)) { if ((meo.meoi_flags & MEOI_L2INFO_SET) == 0) { txs->itxs_hck_nol2info.value.ui64++; return (-1); @@ -1843,40 +1917,60 @@ i40e_tx_context(i40e_t *i40e, i40e_trqpair_t *itrq, mblk_t *mp, } if (meo.meoi_l3proto == ETHERTYPE_IP) { - tctx->itc_cmdflags |= + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV4; } else if (meo.meoi_l3proto == ETHERTYPE_IPV6) { - tctx->itc_cmdflags |= + tctx->itc_data_cmdflags |= I40E_TX_DESC_CMD_IIPT_IPV6; } else { txs->itxs_hck_badl3.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l2hlen >> 1) << + tctx->itc_data_offsets |= (meo.meoi_l2hlen >> 1) << I40E_TX_DESC_LENGTH_MACLEN_SHIFT; - tctx->itc_offsets |= (meo.meoi_l3hlen >> 2) << + tctx->itc_data_offsets |= (meo.meoi_l3hlen >> 2) << I40E_TX_DESC_LENGTH_IPLEN_SHIFT; } switch (meo.meoi_l4proto) { case IPPROTO_TCP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_TCP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_TCP; break; case IPPROTO_UDP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_UDP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_UDP; break; case IPPROTO_SCTP: - tctx->itc_cmdflags |= I40E_TX_DESC_CMD_L4T_EOFT_SCTP; + tctx->itc_data_cmdflags |= + I40E_TX_DESC_CMD_L4T_EOFT_SCTP; break; default: txs->itxs_hck_badl4.value.ui64++; return (-1); } - tctx->itc_offsets |= (meo.meoi_l4hlen >> 2) << + tctx->itc_data_offsets |= (meo.meoi_l4hlen >> 2) << I40E_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; } + if (lsoflags & HW_LSO) { + /* + * LSO requires that checksum offloads are enabled. If for + * some reason they're not we bail out with an error. 
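The LSO context values gathered in this block are pure arithmetic over the header lengths and the MSS. A user-space example with made-up sizes shows the TSO payload length the hardware will segment and how many wire frames result.

#include <stdio.h>

int
main(void)
{
	unsigned msgsize = 45014;	/* total length of the mblk chain */
	unsigned l2 = 14, l3 = 20, l4 = 20;
	unsigned mss = 1460;
	unsigned tsolen = msgsize - (l2 + l3 + l4);
	unsigned nsegs = (tsolen + mss - 1) / mss;

	printf("tsolen = %u, mss = %u -> %u segments on the wire\n",
	    tsolen, mss, nsegs);
	return (0);
}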
+ */ + if ((chkflags & HCK_IPV4_HDRCKSUM) == 0 || + (chkflags & HCK_PARTIALCKSUM) == 0) { + txs->itxs_lso_nohck.value.ui64++; + return (-1); + } + + tctx->itc_ctx_cmdflags |= I40E_TX_CTX_DESC_TSO; + tctx->itc_ctx_mss = mss; + tctx->itc_ctx_tsolen = msgsize(mp) - + (meo.meoi_l2hlen + meo.meoi_l3hlen + meo.meoi_l4hlen); + } + return (0); } @@ -1925,7 +2019,20 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) tcb->tcb_dma.dmab_len = 0; break; case I40E_TX_DMA: - (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_used_lso == B_TRUE && tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_lso_dma_handle); + else if (tcb->tcb_bind_ncookies > 0) + (void) ddi_dma_unbind_handle(tcb->tcb_dma_handle); + if (tcb->tcb_bind_info != NULL) { + kmem_free(tcb->tcb_bind_info, + tcb->tcb_bind_ncookies * + sizeof (struct i40e_dma_bind_info)); + } + tcb->tcb_bind_info = NULL; + tcb->tcb_bind_ncookies = 0; + tcb->tcb_used_lso = B_FALSE; + break; + case I40E_TX_DESC: break; case I40E_TX_NONE: /* Cast to pacify lint */ @@ -1935,8 +2042,10 @@ i40e_tcb_reset(i40e_tx_control_block_t *tcb) } tcb->tcb_type = I40E_TX_NONE; - freemsg(tcb->tcb_mp); - tcb->tcb_mp = NULL; + if (tcb->tcb_mp != NULL) { + freemsg(tcb->tcb_mp); + tcb->tcb_mp = NULL; + } tcb->tcb_next = NULL; } @@ -1969,10 +2078,11 @@ i40e_tx_cleanup_ring(i40e_trqpair_t *itrq) i40e_tx_control_block_t *tcb; tcb = itrq->itrq_tcb_work_list[index]; - VERIFY(tcb != NULL); - itrq->itrq_tcb_work_list[index] = NULL; - i40e_tcb_reset(tcb); - i40e_tcb_free(itrq, tcb); + if (tcb != NULL) { + itrq->itrq_tcb_work_list[index] = NULL; + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + } bzero(&itrq->itrq_desc_ring[index], sizeof (i40e_tx_desc_t)); index = i40e_next_desc(index, 1, itrq->itrq_tx_ring_size); @@ -1995,6 +2105,7 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) uint32_t wbhead, toclean, count; i40e_tx_control_block_t *tcbhead; i40e_t *i40e = itrq->itrq_i40e; + uint_t desc_per_tcb, i; mutex_enter(&itrq->itrq_tx_lock); @@ -2042,11 +2153,27 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) tcbhead = tcb; /* - * We zero this out for sanity purposes. + * In the DMA bind case, there may not necessarily be a 1:1 + * mapping between tcb's and descriptors. If the tcb type + * indicates a DMA binding then check the number of DMA + * cookies to determine how many entries to clean in the + * descriptor ring. */ - bzero(&itrq->itrq_desc_ring[toclean], sizeof (i40e_tx_desc_t)); - toclean = i40e_next_desc(toclean, 1, itrq->itrq_tx_ring_size); - count++; + if (tcb->tcb_type == I40E_TX_DMA) + desc_per_tcb = tcb->tcb_bind_ncookies; + else + desc_per_tcb = 1; + + for (i = 0; i < desc_per_tcb; i++) { + /* + * We zero this out for sanity purposes. 
+ */ + bzero(&itrq->itrq_desc_ring[toclean], + sizeof (i40e_tx_desc_t)); + toclean = i40e_next_desc(toclean, 1, + itrq->itrq_tx_ring_size); + count++; + } } itrq->itrq_desc_head = wbhead; @@ -2078,6 +2205,102 @@ i40e_tx_recycle_ring(i40e_trqpair_t *itrq) DTRACE_PROBE2(i40e__recycle, i40e_trqpair_t *, itrq, uint32_t, count); } +static i40e_tx_control_block_t * +i40e_tx_bind_fragment(i40e_trqpair_t *itrq, const mblk_t *mp, + boolean_t use_lso) +{ + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie; + uint_t i = 0, ncookies = 0, dmaflags; + i40e_tx_control_block_t *tcb; + i40e_txq_stat_t *txs = &itrq->itrq_txstat; + + if ((tcb = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + return (NULL); + } + tcb->tcb_type = I40E_TX_DMA; + + if (use_lso == B_TRUE) + dma_handle = tcb->tcb_lso_dma_handle; + else + dma_handle = tcb->tcb_dma_handle; + + dmaflags = DDI_DMA_WRITE | DDI_DMA_STREAMING; + if (ddi_dma_addr_bind_handle(dma_handle, NULL, + (caddr_t)mp->b_rptr, MBLKL(mp), dmaflags, DDI_DMA_DONTWAIT, NULL, + &dma_cookie, &ncookies) != DDI_DMA_MAPPED) { + txs->itxs_bind_fails.value.ui64++; + goto bffail; + } + tcb->tcb_bind_ncookies = ncookies; + tcb->tcb_used_lso = use_lso; + + tcb->tcb_bind_info = + kmem_zalloc(ncookies * sizeof (struct i40e_dma_bind_info), + KM_NOSLEEP); + if (tcb->tcb_bind_info == NULL) + goto bffail; + + while (i < ncookies) { + if (i > 0) + ddi_dma_nextcookie(dma_handle, &dma_cookie); + + tcb->tcb_bind_info[i].dbi_paddr = + (caddr_t)dma_cookie.dmac_laddress; + tcb->tcb_bind_info[i++].dbi_len = dma_cookie.dmac_size; + } + + return (tcb); + +bffail: + i40e_tcb_reset(tcb); + i40e_tcb_free(itrq, tcb); + return (NULL); +} + +static void +i40e_tx_set_data_desc(i40e_trqpair_t *itrq, i40e_tx_context_t *tctx, + struct i40e_dma_bind_info *dbi, boolean_t last_desc) +{ + i40e_tx_desc_t *txdesc; + int cmd; + + ASSERT(MUTEX_HELD(&itrq->itrq_tx_lock)); + itrq->itrq_desc_free--; + txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; + itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, + itrq->itrq_tx_ring_size); + + cmd = I40E_TX_DESC_CMD_ICRC | tctx->itc_data_cmdflags; + + /* + * The last data descriptor needs the EOP bit set, so that the HW knows + * that we're ready to send. Additionally, we set the RS (Report + * Status) bit, so that we are notified when the transmit engine has + * completed DMA'ing all of the data descriptors and data buffers + * associated with this frame. + */ + if (last_desc == B_TRUE) { + cmd |= I40E_TX_DESC_CMD_EOP; + cmd |= I40E_TX_DESC_CMD_RS; + } + + /* + * Per the X710 manual, section 8.4.2.1.1, the buffer size + * must be a value from 1 to 16K minus 1, inclusive. + */ + ASSERT3U(dbi->dbi_len, >=, 1); + ASSERT3U(dbi->dbi_len, <=, I40E_MAX_TX_BUFSZ - 1); + + txdesc->buffer_addr = CPU_TO_LE64((uintptr_t)dbi->dbi_paddr); + txdesc->cmd_type_offset_bsz = + LE_64(((uint64_t)I40E_TX_DESC_DTYPE_DATA | + ((uint64_t)tctx->itc_data_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | + ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((uint64_t)dbi->dbi_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); +} + /* * We've been asked to send a message block on the wire. We'll only have a * single chain. 
There will not be any b_next pointers; however, there may be @@ -2098,10 +2321,15 @@ i40e_ring_tx(void *arg, mblk_t *mp) { const mblk_t *nmp; size_t mpsize; - i40e_tx_control_block_t *tcb; + i40e_tx_control_block_t *tcb_ctx = NULL, *tcb_data = NULL, + **tcb_dma = NULL; i40e_tx_desc_t *txdesc; + i40e_tx_context_desc_t *ctxdesc; i40e_tx_context_t tctx; int cmd, type; + uint_t i, needed_desc = 0, nbufs = 0; + boolean_t do_ctx_desc = B_FALSE, do_dma_bind = B_FALSE, + use_lso = B_FALSE; i40e_trqpair_t *itrq = arg; i40e_t *i40e = itrq->itrq_i40e; @@ -2121,7 +2349,7 @@ i40e_ring_tx(void *arg, mblk_t *mp) /* * Figure out the relevant context about this frame that we might need - * for enabling checksum, lso, etc. This also fills in information that + * for enabling checksum, LSO, etc. This also fills in information that * we might set around the packet type, etc. */ if (i40e_tx_context(i40e, itrq, mp, &tctx) < 0) { @@ -2129,97 +2357,212 @@ i40e_ring_tx(void *arg, mblk_t *mp) itrq->itrq_txstat.itxs_err_context.value.ui64++; return (NULL); } + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + use_lso = B_TRUE; + do_ctx_desc = B_TRUE; + } /* * For the primordial driver we can punt on doing any recycling right * now; however, longer term we need to probably do some more pro-active - * recycling to cut back on stalls in the tx path. + * recycling to cut back on stalls in the TX path. */ /* - * Do a quick size check to make sure it fits into what we think it - * should for this device. Note that longer term this will be false, - * particularly when we have the world of TSO. + * Iterate through the mblks to calculate both the total size of the + * frame and the number of fragments. This is used to determine + * whether we're doing DMA binding and, if so, how many TX control + * blocks we'll need. */ mpsize = 0; for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - mpsize += MBLKL(nmp); + size_t blksz = MBLKL(nmp); + if (blksz > 0) { + mpsize += blksz; + nbufs++; + } } - /* - * First we allocate our tx control block and prepare the packet for - * transmit before we do a final check for descriptors. We do it this - * way to minimize the time under the tx lock. - */ - tcb = i40e_tcb_alloc(itrq); - if (tcb == NULL) { - txs->itxs_err_notcb.value.ui64++; - goto txfail; + if (do_ctx_desc) { + /* + * If we're doing tunneling or LSO, then we'll need a TX + * context descriptor in addition to one or more TX data + * descriptors. Since there's no data DMA block or handle + * associated with the context descriptor, we create a special + * control block that behaves effectively like a NOP. + */ + if ((tcb_ctx = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto txfail; + } + tcb_ctx->tcb_type = I40E_TX_DESC; + needed_desc++; } /* - * For transmitting a block, we're currently going to use just a - * single control block and bcopy all of the fragments into it. We - * should be more intelligent about doing DMA binding or otherwise, but - * for getting off the ground this will have to do. + * For the non-LSO TX case, we alter our DMA strategy based on a + * threshold tied to the frame size. This threshold is configurable + * via the tx_dma_threshold property. + * + * If the frame size is above the threshold, we do DMA binding of the + * fragments, building a control block and data descriptor for each + * piece. 
+ * + * If it's below or at the threshold then we just use a single control + * block and data descriptor and simply bcopy all of the fragments into + * the pre-allocated DMA buffer in the control block. + * + * For the LSO TX case we always do DMA binding. */ - ASSERT(tcb->tcb_dma.dmab_len == 0); - ASSERT(tcb->tcb_dma.dmab_size >= mpsize); - for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { - size_t clen = MBLKL(nmp); - void *coff = tcb->tcb_dma.dmab_address + tcb->tcb_dma.dmab_len; + if (use_lso == B_TRUE || mpsize > i40e->i40e_tx_dma_min) { + do_dma_bind = B_TRUE; + tcb_dma = + kmem_zalloc(nbufs * sizeof (i40e_tx_control_block_t *), + KM_NOSLEEP); + if (tcb_dma == NULL) { + i40e_error(i40e, "failed to allocate tcb_dma list"); + goto txfail; + } + /* + * For each b_cont: bind the control block's DMA handle to the + * b_rptr, and record the cookies so that we can later iterate + * through them and build TX data descriptors. + */ + for (nmp = mp, i = 0; nmp != NULL; nmp = nmp->b_cont) { + if (MBLKL(nmp) == 0) + continue; + tcb_dma[i] = i40e_tx_bind_fragment(itrq, nmp, use_lso); + if (tcb_dma[i] == NULL) { + i40e_error(i40e, "dma bind failed!"); + goto txfail; + } + if (i == 0) + tcb_dma[i]->tcb_mp = mp; + needed_desc += tcb_dma[i++]->tcb_bind_ncookies; + } + } else { + /* + * Just use a single control block and bcopy all of the + * fragments into its pre-allocated DMA buffer. + */ + if ((tcb_data = i40e_tcb_alloc(itrq)) == NULL) { + txs->itxs_err_notcb.value.ui64++; + goto txfail; + } + tcb_data->tcb_type = I40E_TX_COPY; + + ASSERT(tcb_data->tcb_dma.dmab_len == 0); + ASSERT(tcb_data->tcb_dma.dmab_size >= mpsize); + + for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { + size_t clen = MBLKL(nmp); + void *coff = tcb_data->tcb_dma.dmab_address + + tcb_data->tcb_dma.dmab_len; - bcopy(nmp->b_rptr, coff, clen); - tcb->tcb_dma.dmab_len += clen; + bcopy(nmp->b_rptr, coff, clen); + tcb_data->tcb_dma.dmab_len += clen; + } + ASSERT(tcb_data->tcb_dma.dmab_len == mpsize); + I40E_DMA_SYNC(&tcb_data->tcb_dma, DDI_DMA_SYNC_FORDEV); + + tcb_data->tcb_mp = mp; + needed_desc++; } - ASSERT(tcb->tcb_dma.dmab_len == mpsize); /* - * While there's really no need to keep the mp here, but let's just do - * it to help with our own debugging for now. + * The second condition ensures that 'itrq_desc_tail' never + * equals 'itrq_desc_head'. This enforces the rule found in + * the second bullet point of section 8.4.3.1.5 of the XL710 + * PG, which declares the TAIL pointer in I40E_QTX_TAIL should + * never overlap with the head. This means that we only ever + * have 'itrq_tx_ring_size - 1' total available descriptors. */ - tcb->tcb_mp = mp; - tcb->tcb_type = I40E_TX_COPY; - I40E_DMA_SYNC(&tcb->tcb_dma, DDI_DMA_SYNC_FORDEV); - mutex_enter(&itrq->itrq_tx_lock); - if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh) { + if (itrq->itrq_desc_free < i40e->i40e_tx_block_thresh || + (itrq->itrq_desc_free - 1) < needed_desc) { txs->itxs_err_nodescs.value.ui64++; mutex_exit(&itrq->itrq_tx_lock); goto txfail; } - /* - * Build up the descriptor and send it out. Thankfully at the moment - * we only need a single desc, because we're not doing anything fancy - * yet. 
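To restate the descriptor accounting above with concrete (purely illustrative) numbers: LSO adds one context descriptor, the bind path consumes one data descriptor per DMA cookie, and the bcopy path always consumes exactly one.

/*
 * Hypothetical accounting:
 *
 *   LSO frame of three fragments binding to 1, 2 and 1 cookies:
 *     needed_desc = 1 (context) + (1 + 2 + 1) (data) = 5
 *
 *   small non-LSO frame at or under i40e_tx_dma_min (the
 *   tx_dma_threshold property):
 *     needed_desc = 1 (single bcopy data descriptor)
 *
 * The send only proceeds when
 *
 *   itrq_desc_free >= i40e_tx_block_thresh   and
 *   itrq_desc_free - 1 >= needed_desc
 *
 * the extra "- 1" being the slot that keeps TAIL from ever catching up
 * with HEAD (XL710 PG section 8.4.3.1.5), so at most
 * itrq_tx_ring_size - 1 descriptors are ever in use.
 */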
- */ - ASSERT(itrq->itrq_desc_free > 0); - itrq->itrq_desc_free--; - txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; - itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb; - itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, - itrq->itrq_tx_ring_size); + if (do_ctx_desc) { + /* + * If we're enabling any offloads for this frame, then we'll + * need to build up a transmit context descriptor, first. The + * context descriptor needs to be placed in the TX ring before + * the data descriptor(s). See section 8.4.2, table 8-16 + */ + uint_t tail = itrq->itrq_desc_tail; + itrq->itrq_desc_free--; + ctxdesc = (i40e_tx_context_desc_t *)&itrq->itrq_desc_ring[tail]; + itrq->itrq_tcb_work_list[tail] = tcb_ctx; + itrq->itrq_desc_tail = i40e_next_desc(tail, 1, + itrq->itrq_tx_ring_size); + + /* QW0 */ + type = I40E_TX_DESC_DTYPE_CONTEXT; + ctxdesc->tunneling_params = 0; + ctxdesc->l2tag2 = 0; + + /* QW1 */ + ctxdesc->type_cmd_tso_mss = CPU_TO_LE64((uint64_t)type); + if (tctx.itc_ctx_cmdflags & I40E_TX_CTX_DESC_TSO) { + ctxdesc->type_cmd_tso_mss |= CPU_TO_LE64((uint64_t) + ((uint64_t)tctx.itc_ctx_cmdflags << + I40E_TXD_CTX_QW1_CMD_SHIFT) | + ((uint64_t)tctx.itc_ctx_tsolen << + I40E_TXD_CTX_QW1_TSO_LEN_SHIFT) | + ((uint64_t)tctx.itc_ctx_mss << + I40E_TXD_CTX_QW1_MSS_SHIFT)); + } + } - /* - * Note, we always set EOP and RS which indicates that this is the last - * data frame and that we should ask for it to be transmitted. We also - * must always set ICRC, because that is an internal bit that must be - * set to one for data descriptors. The remaining bits in the command - * descriptor depend on checksumming and are determined based on the - * information set up in i40e_tx_context(). - */ - type = I40E_TX_DESC_DTYPE_DATA; - cmd = I40E_TX_DESC_CMD_EOP | - I40E_TX_DESC_CMD_RS | - I40E_TX_DESC_CMD_ICRC | - tctx.itc_cmdflags; - txdesc->buffer_addr = - CPU_TO_LE64((uintptr_t)tcb->tcb_dma.dmab_dma_address); - txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type | - ((uint64_t)tctx.itc_offsets << I40E_TXD_QW1_OFFSET_SHIFT) | - ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | - ((uint64_t)tcb->tcb_dma.dmab_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); + if (do_dma_bind == B_TRUE) { + /* + * Next build up a transmit data descriptor for each buffer. + */ + boolean_t last_desc = B_FALSE; + for (i = 0; i < nbufs; i++) { + itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = + tcb_dma[i]; + + for (uint_t c = 0; c < tcb_dma[i]->tcb_bind_ncookies; + c++) { + if (i == (nbufs - 1) && + c == (tcb_dma[i]->tcb_bind_ncookies - 1)) { + last_desc = B_TRUE; + } + i40e_tx_set_data_desc(itrq, &tctx, + &tcb_dma[i]->tcb_bind_info[c], last_desc); + } + } + kmem_free(tcb_dma, nbufs * sizeof (i40e_tx_control_block_t *)); + tcb_dma = NULL; + } else { + /* + * Build up the single transmit data descriptor needed for the + * non-DMA-bind case. 
+ */ + itrq->itrq_desc_free--; + txdesc = &itrq->itrq_desc_ring[itrq->itrq_desc_tail]; + itrq->itrq_tcb_work_list[itrq->itrq_desc_tail] = tcb_data; + itrq->itrq_desc_tail = i40e_next_desc(itrq->itrq_desc_tail, 1, + itrq->itrq_tx_ring_size); + + type = I40E_TX_DESC_DTYPE_DATA; + cmd = I40E_TX_DESC_CMD_EOP | + I40E_TX_DESC_CMD_RS | + I40E_TX_DESC_CMD_ICRC | + tctx.itc_data_cmdflags; + txdesc->buffer_addr = + CPU_TO_LE64((uintptr_t)tcb_data->tcb_dma.dmab_dma_address); + txdesc->cmd_type_offset_bsz = CPU_TO_LE64(((uint64_t)type | + ((uint64_t)tctx.itc_data_offsets << + I40E_TXD_QW1_OFFSET_SHIFT) | + ((uint64_t)cmd << I40E_TXD_QW1_CMD_SHIFT) | + ((uint64_t)tcb_data->tcb_dma.dmab_len << + I40E_TXD_QW1_TX_BUF_SZ_SHIFT))); + } /* * Now, finally, sync the DMA data and alert hardware. @@ -2228,6 +2571,7 @@ i40e_ring_tx(void *arg, mblk_t *mp) I40E_WRITE_REG(hw, I40E_QTX_TAIL(itrq->itrq_index), itrq->itrq_desc_tail); + if (i40e_check_acc_handle(i40e->i40e_osdep_space.ios_reg_handle) != DDI_FM_OK) { /* @@ -2241,7 +2585,7 @@ i40e_ring_tx(void *arg, mblk_t *mp) txs->itxs_bytes.value.ui64 += mpsize; txs->itxs_packets.value.ui64++; - txs->itxs_descriptors.value.ui64++; + txs->itxs_descriptors.value.ui64 += needed_desc; mutex_exit(&itrq->itrq_tx_lock); @@ -2254,10 +2598,25 @@ txfail: * Make sure to reset their message block's, since we'll return them * back to MAC. */ - if (tcb != NULL) { - tcb->tcb_mp = NULL; - i40e_tcb_reset(tcb); - i40e_tcb_free(itrq, tcb); + if (tcb_ctx != NULL) { + tcb_ctx->tcb_mp = NULL; + i40e_tcb_reset(tcb_ctx); + i40e_tcb_free(itrq, tcb_ctx); + } + if (tcb_data != NULL) { + tcb_data->tcb_mp = NULL; + i40e_tcb_reset(tcb_data); + i40e_tcb_free(itrq, tcb_data); + } + if (tcb_dma != NULL) { + for (i = 0; i < nbufs; i++) { + if (tcb_dma[i] == NULL) + break; + tcb_dma[i]->tcb_mp = NULL; + i40e_tcb_reset(tcb_dma[i]); + i40e_tcb_free(itrq, tcb_dma[i]); + } + kmem_free(tcb_dma, nbufs * sizeof (i40e_tx_control_block_t *)); } mutex_enter(&itrq->itrq_tx_lock); diff --git a/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c b/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c index 1c8318b191..55c4159bc4 100644 --- a/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c +++ b/usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -272,8 +273,7 @@ ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req) icmph->icmph_checksum = IP_CSUM(pmtu_mp, (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0); - (void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0, - HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); + mac_hcksum_set(pmtu_mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK); DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, " "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d", @@ -1560,8 +1560,7 @@ ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc) /* * Can RC mode in IB guarantee its checksum correctness? * - * (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, - * HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); + * mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | HCK_FULLCKSUM_OK); */ /* diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c new file mode 100644 index 0000000000..eaa0c33f0f --- /dev/null +++ b/usr/src/uts/common/io/inotify.c @@ -0,0 +1,1555 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2015 The MathWorks, Inc. All rights reserved. + */ + +/* + * Support for the inotify facility, a Linux-borne facility for asynchronous + * notification of certain events on specified files or directories. Our + * implementation broadly leverages the file event monitoring facility, and + * would actually be quite straightforward were it not for a very serious + * blunder in the inotify interface: in addition to allowing for one to be + * notified on events on a particular file or directory, inotify also allows + * for one to be notified on certain events on files _within_ a watched + * directory -- even though those events have absolutely nothing to do with + * the directory itself. This leads to all sorts of madness because file + * operations are (of course) not undertaken on paths but rather on open + * files -- and the relationships between open files and the paths that resolve + * to those files are neither static nor isomorphic. We implement this + * concept by having _child watches_ when directories are watched with events + * in IN_CHILD_EVENTS. We add child watches when a watch on a directory is + * first added, and we modify those child watches dynamically as files are + * created, deleted, moved into or moved out of the specified directory. This + * mechanism works well, absent hard links. Hard links, unfortunately, break + * this rather badly, and the user is warned that watches on directories that + * have multiple directory entries referring to the same file may behave + * unexpectedly. 
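A concrete illustration of the hard-link caveat above; this scenario follows directly from the fact that child watches are keyed by vnode and record a single name.

/*
 * Example: a watched directory contains entries "a" and "b" that are
 * hard links to the same file.  Only one child watch -- and therefore
 * only one recorded name -- exists for that file, so an event generated
 * through either link is reported under whichever name the child watch
 * happens to hold.
 */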
+ */ + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/inotify.h> +#include <sys/fem.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/vfs_opreg.h> +#include <sys/vmem.h> +#include <sys/avl.h> +#include <sys/sysmacros.h> +#include <sys/cyclic.h> +#include <sys/filio.h> + +struct inotify_state; +struct inotify_kevent; + +typedef struct inotify_watch inotify_watch_t; +typedef struct inotify_state inotify_state_t; +typedef struct inotify_kevent inotify_kevent_t; + +struct inotify_watch { + kmutex_t inw_lock; /* lock protecting ref count */ + int inw_refcnt; /* reference count */ + uint8_t inw_zombie:1; /* boolean: is zombie */ + uint8_t inw_fired:1; /* boolean: fired one-shot */ + uint8_t inw_active:1; /* boolean: watch is active */ + uint8_t inw_orphaned:1; /* boolean: orphaned */ + kcondvar_t inw_cv; /* condvar for zombifier */ + uint32_t inw_mask; /* mask of watch */ + int32_t inw_wd; /* watch descriptor */ + vnode_t *inw_vp; /* underlying vnode */ + inotify_watch_t *inw_parent; /* parent, if a child */ + avl_node_t inw_byvp; /* watches by vnode */ + avl_node_t inw_bywd; /* watches by descriptor */ + avl_tree_t inw_children; /* children, if a parent */ + char *inw_name; /* name, if a child */ + list_node_t inw_orphan; /* orphan list */ + cred_t *inw_cred; /* cred, if orphaned */ + inotify_state_t *inw_state; /* corresponding state */ +}; + +struct inotify_kevent { + inotify_kevent_t *ine_next; /* next event in queue */ + struct inotify_event ine_event; /* event (variable size) */ +}; + +#define INOTIFY_EVENT_LENGTH(ev) \ + (sizeof (inotify_kevent_t) + (ev)->ine_event.len) + +struct inotify_state { + kmutex_t ins_lock; /* lock protecting state */ + avl_tree_t ins_byvp; /* watches by vnode */ + avl_tree_t ins_bywd; /* watches by descriptor */ + vmem_t *ins_wds; /* watch identifier arena */ + int ins_maxwatches; /* maximum number of watches */ + int ins_maxevents; /* maximum number of events */ + int ins_nevents; /* current # of events */ + int32_t ins_size; /* total size of events */ + inotify_kevent_t *ins_head; /* head of event queue */ + inotify_kevent_t *ins_tail; /* tail of event queue */ + pollhead_t ins_pollhd; /* poll head */ + kcondvar_t ins_cv; /* condvar for reading */ + list_t ins_orphans; /* orphan list */ + ddi_periodic_t ins_cleaner; /* cyclic for cleaning */ + inotify_watch_t *ins_zombies; /* zombie watch list */ + cred_t *ins_cred; /* creator's credentials */ + inotify_state_t *ins_next; /* next state on global list */ +}; + +/* + * Tunables (exported read-only in lx-branded zones via /proc). + */ +int inotify_maxwatches = 8192; /* max watches per instance */ +int inotify_maxevents = 16384; /* max events */ +int inotify_maxinstances = 128; /* max instances per user */ + +/* + * Internal global variables. 
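Since the limits above are plain module globals (and are copied into each instance at open time), they can presumably be raised persistently with /etc/system set directives; a hypothetical example, values arbitrary:

/*
 *	set inotify:inotify_maxwatches = 32768
 *	set inotify:inotify_maxevents = 65536
 *	set inotify:inotify_maxinstances = 256
 */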
+ */ +static kmutex_t inotify_lock; /* lock protecting state */ +static dev_info_t *inotify_devi; /* device info */ +static fem_t *inotify_femp; /* FEM pointer */ +static vmem_t *inotify_minor; /* minor number arena */ +static void *inotify_softstate; /* softstate pointer */ +static inotify_state_t *inotify_state; /* global list if state */ + +static void inotify_watch_event(inotify_watch_t *, uint64_t, char *); +static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *); +static void inotify_watch_delete(inotify_watch_t *, uint32_t); +static void inotify_watch_remove(inotify_state_t *state, + inotify_watch_t *watch); + +static int +inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset, + cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) { + inotify_watch_event(watch, flag & FWRITE ? + IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL); + } + + return (rval); +} + +static int +inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_create(vf, name, vap, excl, mode, + vpp, cr, flag, ct, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE, name); + } + + return (rval); +} + +static int +inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) { + inotify_watch_insert(watch, svp, tnm); + inotify_watch_event(watch, IN_CREATE, tnm); + } + + return (rval); +} + +static int +inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_mkdir(vf, name, vap, vpp, cr, + ct, flags, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name); + } + + return (rval); +} + +static int +inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_open(vf, mode, cr, ct)) == 0) + inotify_watch_event(watch, IN_OPEN, NULL); + + return (rval); +} + +static int +inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_read(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_ACCESS, NULL); + + return (rval); +} + +static int +inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags); + inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL); + + return (rval); +} + +int +inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE, nm); + + return (rval); +} + +int +inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t 
*ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm); + + return (rval); +} + +static int +inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_setattr(vf, vap, flags, cr, ct)) == 0) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + return (rval); +} + +static int +inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_write(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_MODIFY, NULL); + + return (rval); +} + +static int +inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + + switch (vnevent) { + case VE_RENAME_SRC: + inotify_watch_event(watch, IN_MOVE_SELF, NULL); + inotify_watch_delete(watch, IN_MOVE_SELF); + break; + case VE_REMOVE: + /* + * Linux will apparently fire an IN_ATTRIB event when the link + * count changes (including when it drops to 0 on a remove). + * This is merely somewhat odd; what is amazing is that this + * IN_ATTRIB event is not visible on an inotify watch on the + * parent directory. (IN_ATTRIB events are normally sent to + * watches on the parent directory). While it's hard to + * believe that this constitutes desired semantics, ltp + * unfortunately tests this case (if implicitly); in the name + * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are + * explicitly watching the file that has been removed. + */ + if (watch->inw_parent == NULL) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + /*FALLTHROUGH*/ + case VE_RENAME_DEST: + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_RMDIR: + /* + * It seems that IN_ISDIR should really be OR'd in here, but + * Linux doesn't seem to do that in this case; for the sake of + * bug-for-bug compatibility, we don't do it either. 
+ */ + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_CREATE: + case VE_TRUNCATE: + case VE_RESIZE: + inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL); + break; + case VE_LINK: + inotify_watch_event(watch, IN_ATTRIB, NULL); + break; + case VE_RENAME_SRC_DIR: + inotify_watch_event(watch, IN_MOVED_FROM, name); + break; + case VE_RENAME_DEST_DIR: + if (name == NULL) + name = dvp->v_path; + + inotify_watch_insert(watch, dvp, name); + inotify_watch_event(watch, IN_MOVED_TO, name); + break; + case VE_SUPPORT: + case VE_MOUNTEDOVER: + case VE_PRE_RENAME_SRC: + case VE_PRE_RENAME_DEST: + case VE_PRE_RENAME_DEST_DIR: + break; + } + + return (vnext_vnevent(vf, vnevent, dvp, name, ct)); +} + +const fs_operation_def_t inotify_vnodesrc_template[] = { + VOPNAME_CLOSE, { .femop_close = inotify_fop_close }, + VOPNAME_CREATE, { .femop_create = inotify_fop_create }, + VOPNAME_LINK, { .femop_link = inotify_fop_link }, + VOPNAME_MKDIR, { .femop_mkdir = inotify_fop_mkdir }, + VOPNAME_OPEN, { .femop_open = inotify_fop_open }, + VOPNAME_READ, { .femop_read = inotify_fop_read }, + VOPNAME_READDIR, { .femop_readdir = inotify_fop_readdir }, + VOPNAME_REMOVE, { .femop_remove = inotify_fop_remove }, + VOPNAME_RMDIR, { .femop_rmdir = inotify_fop_rmdir }, + VOPNAME_SETATTR, { .femop_setattr = inotify_fop_setattr }, + VOPNAME_WRITE, { .femop_write = inotify_fop_write }, + VOPNAME_VNEVENT, { .femop_vnevent = inotify_fop_vnevent }, + NULL, NULL +}; + +static int +inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + if (lhs->inw_wd < rhs->inw_wd) + return (-1); + + if (lhs->inw_wd > rhs->inw_wd) + return (1); + + return (0); +} + +static int +inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp; + + if (lvp < rvp) + return (-1); + + if (lvp > rvp) + return (1); + + return (0); +} + +static void +inotify_watch_hold(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 0); + watch->inw_refcnt++; + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_release(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 1); + + if (--watch->inw_refcnt == 1 && watch->inw_zombie) { + /* + * We're down to our last reference; kick anyone that might be + * waiting. + */ + cv_signal(&watch->inw_cv); + } + + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name) +{ + inotify_kevent_t *event, *tail; + inotify_state_t *state = watch->inw_state; + uint32_t wd = watch->inw_wd, cookie = 0, len; + boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE; + inotify_watch_t *source = watch; + + if (!(mask &= watch->inw_mask) || mask == IN_ISDIR) + return; + + if (watch->inw_parent != NULL) { + /* + * This is an event on the child; if this isn't a valid child + * event, return. Otherwise, we move our watch to be our + * parent (which we know is around because we have a hold on + * it) and continue. 
+ */ + if (!(mask & IN_CHILD_EVENTS)) + return; + + name = watch->inw_name; + watch = watch->inw_parent; + wd = watch->inw_wd; + } + + if (!removal) { + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || + watch->inw_fired || !watch->inw_active) { + mutex_exit(&state->ins_lock); + return; + } + } else { + if (!watch->inw_active) + return; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + } + + /* + * If this is an operation on a directory and it's a child event + * (event if it's not on a child), we specify IN_ISDIR. + */ + if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS)) + mask |= IN_ISDIR; + + if (mask & (IN_MOVED_FROM | IN_MOVED_TO)) + cookie = (uint32_t)curthread->t_did; + + if (state->ins_nevents >= state->ins_maxevents) { + /* + * We're at our maximum number of events -- turn our event + * into an IN_Q_OVERFLOW event, which will be coalesced if + * it's already the tail event. + */ + mask = IN_Q_OVERFLOW; + wd = (uint32_t)-1; + cookie = 0; + len = 0; + } + + if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd && + tail->ine_event.mask == mask && tail->ine_event.cookie == cookie && + ((tail->ine_event.len == 0 && len == 0) || + (name != NULL && tail->ine_event.len != 0 && + strcmp(tail->ine_event.name, name) == 0))) { + /* + * This is an implicitly coalesced event; we're done. + */ + if (!removal) + mutex_exit(&state->ins_lock); + return; + } + + if (name != NULL) { + /* + * We are in the context of a file event monitoring operation, + * so the name length is bounded by the kernel. + */ + len = strlen(name) + 1; + len = roundup(len, sizeof (struct inotify_event)); + } else { + len = 0; + } + + event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP); + event->ine_event.wd = wd; + event->ine_event.mask = (uint32_t)mask; + event->ine_event.cookie = cookie; + event->ine_event.len = len; + + if (name != NULL) + (void) strcpy(event->ine_event.name, name); + + if (tail != NULL) { + tail->ine_next = event; + } else { + VERIFY(state->ins_head == NULL); + state->ins_head = event; + cv_broadcast(&state->ins_cv); + } + + state->ins_tail = event; + state->ins_nevents++; + state->ins_size += sizeof (event->ine_event) + len; + + if (removal) + return; + + if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) { + /* + * If this is a one-shot, we need to remove the watch. (Note + * that this will recurse back into inotify_watch_event() to + * fire the IN_IGNORED event -- but with "removal" set.) + */ + watch->inw_fired = 1; + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN); +} + +/* + * Destroy a watch. By the time we're in here, the watch must have exactly + * one reference. + */ +static void +inotify_watch_destroy(inotify_watch_t *watch) +{ + VERIFY(MUTEX_HELD(&watch->inw_lock)); + + if (watch->inw_name != NULL) + kmem_free(watch->inw_name, strlen(watch->inw_name) + 1); + + kmem_free(watch, sizeof (inotify_watch_t)); +} + +static int +inotify_fem_install(vnode_t *vp, inotify_watch_t *watch) +{ + /* + * For vnodes that are devices (of type VCHR or VBLK), we silently + * refuse to actually install any event monitor. This is to avoid + * single-thread deadlock when both a special device vnode and its + * underlying real vnode are being watched: releasing the device + * vnode upon watch removal can induce an attribute update on the + * underlying vnode, which will bring us into inotify_watch_event() + * with our lock already held. 
While we could fail earlier and more + * explicitly in this case, we choose to keep with the Linux behavior + * on unwatchable entities and allow the watch but not generate any + * events for it. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (0); + + return (fem_install(vp, inotify_femp, watch, OPARGUNIQ, + (void (*)(void *))inotify_watch_hold, + (void (*)(void *))inotify_watch_release)); +} + +static int +inotify_fem_uninstall(vnode_t *vp, inotify_watch_t *watch) +{ + /* + * See inotify_fem_install(), above, for our rationale here. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (0); + + return (fem_uninstall(vp, inotify_femp, watch)); +} + +/* + * Zombify a watch. By the time we come in here, it must be true that the + * watch has already been fem_uninstall()'d -- the only reference should be + * in the state's data structure. If we can get away with freeing it, we'll + * do that -- but if the reference count is greater than one due to an active + * vnode operation, we'll put this watch on the zombie list on the state + * structure. + */ +static void +inotify_watch_zombify(inotify_watch_t *watch) +{ + inotify_state_t *state = watch->inw_state; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(!watch->inw_zombie); + + watch->inw_zombie = 1; + + if (watch->inw_parent != NULL) { + inotify_watch_release(watch->inw_parent); + } else { + avl_remove(&state->ins_byvp, watch); + avl_remove(&state->ins_bywd, watch); + vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1); + watch->inw_wd = -1; + } + + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + /* + * There are no operations in flight and there is no way + * for anyone to discover this watch -- we can destroy it. + */ + inotify_watch_destroy(watch); + } else { + /* + * There are operations in flight; we will need to enqueue + * this for later destruction. + */ + watch->inw_parent = state->ins_zombies; + state->ins_zombies = watch; + mutex_exit(&watch->inw_lock); + } +} + +static inotify_watch_t * +inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent, + const char *name, vnode_t *vp, uint32_t mask) +{ + inotify_watch_t *watch; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + + watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP); + + watch->inw_vp = vp; + watch->inw_mask = mask; + watch->inw_state = state; + watch->inw_refcnt = 1; + + if (parent == NULL) { + watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds, + 1, VM_BESTFIT | VM_SLEEP); + avl_add(&state->ins_byvp, watch); + avl_add(&state->ins_bywd, watch); + + avl_create(&watch->inw_children, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + } else { + VERIFY(name != NULL); + inotify_watch_hold(parent); + watch->inw_mask &= IN_CHILD_EVENTS; + watch->inw_parent = parent; + + /* + * Copy the name. Note that when the name is user-specified, + * its length is bounded by the copyinstr() to be MAXPATHLEN + * (and regardless, we know by this point that it exists in + * our parent). + */ + watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); + (void) strcpy(watch->inw_name, name); + + avl_add(&parent->inw_children, watch); + } + + /* + * Add our monitor to the vnode. We must not have the watch lock held + * when we do this, as it will immediately hold our watch. + */ + err = inotify_fem_install(vp, watch); + + VERIFY(err == 0); + + return (watch); +} + +/* + * Remove a (non-child) watch. 
This is called from either synchronous context + * via inotify_rm_watch() or monitor context via either a vnevent or a + * one-shot. + */ +static void +inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch) +{ + inotify_watch_t *child; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(watch->inw_parent == NULL); + + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + /* + * If we have children, we're going to remove them all and set them + * all to be zombies. + */ + while ((child = avl_first(&watch->inw_children)) != NULL) { + VERIFY(child->inw_parent == watch); + avl_remove(&watch->inw_children, child); + + err = inotify_fem_uninstall(child->inw_vp, child); + VERIFY(err == 0); + + /* + * If this child watch has been orphaned, remove it from the + * state's list of orphans. + */ + if (child->inw_orphaned) { + list_remove(&state->ins_orphans, child); + crfree(child->inw_cred); + } + + VN_RELE(child->inw_vp); + + /* + * We're down (or should be down) to a single reference to + * this child watch; it's safe to zombify it. + */ + inotify_watch_zombify(child); + } + + inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL); + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- we know that the only reference + * can come from operations in flight. + */ + inotify_watch_zombify(watch); +} + +/* + * Delete a watch. Should only be called from VOP context. + */ +static void +inotify_watch_delete(inotify_watch_t *watch, uint32_t event) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent; + int err; + + if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie) { + mutex_exit(&state->ins_lock); + return; + } + + if ((parent = watch->inw_parent) == NULL) { + if (event == IN_DELETE_SELF) { + /* + * If we're here because we're being deleted and we + * are not a child watch, we need to delete the entire + * watch, children and all. + */ + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + return; + } else { + if (event == IN_DELETE_SELF && + !(parent->inw_mask & IN_EXCL_UNLINK)) { + /* + * This is a child watch for a file that is being + * removed and IN_EXCL_UNLINK has not been specified; + * indicate that it is orphaned and add it to the list + * of orphans. (This list will be checked by the + * cleaning cyclic to determine when the watch has + * become the only hold on the vnode, at which point + * the watch can be zombified.) Note that we check + * if the watch is orphaned before we orphan it: hard + * links make it possible for VE_REMOVE to be called + * multiple times on the same vnode. (!) + */ + if (!watch->inw_orphaned) { + watch->inw_orphaned = 1; + watch->inw_cred = CRED(); + crhold(watch->inw_cred); + list_insert_head(&state->ins_orphans, watch); + } + + mutex_exit(&state->ins_lock); + return; + } + + if (watch->inw_orphaned) { + /* + * If we're here, a file was orphaned and then later + * moved -- which almost certainly means that hard + * links are on the scene. We choose the orphan over + * the move because we don't want to spuriously + * drop events if we can avoid it. + */ + crfree(watch->inw_cred); + list_remove(&state->ins_orphans, watch); + } + } + + if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) { + /* + * This watch has already been deleted from the parent. 
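A summary of the watch lifecycle implemented by the routines above, for orientation; no new behavior is described, every step refers to code in this file.

/*
 * Lifecycle of a watch:
 *
 *   create    inotify_watch_add() allocates the watch with inw_refcnt = 1
 *             and installs the FEM monitor, which immediately takes a
 *             hold for as long as it remains installed (device vnodes
 *             are the exception: no monitor is installed on VCHR/VBLK).
 *
 *   active    the VOP wrappers call inotify_watch_event(); every
 *             in-flight operation holds the watch for its duration.
 *
 *   remove    inotify_watch_remove()/inotify_watch_delete() uninstall
 *             the monitor, release the vnode and call
 *             inotify_watch_zombify().  A top-level watch is unhooked
 *             from ins_byvp/ins_bywd and its watch descriptor freed; a
 *             child watch instead drops its hold on its parent.
 *
 *   destroy   if the data structure's own reference is the last one,
 *             the watch is freed on the spot; otherwise it is parked on
 *             ins_zombies (chained through inw_parent) and reaped once
 *             in-flight operations drain, either by the periodic
 *             inotify_clean() or at close time.
 */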
+ */ + mutex_exit(&state->ins_lock); + return; + } + + avl_remove(&parent->inw_children, watch); + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- which won't actually delete + * it as we know that the reference count is greater than 1. + */ + inotify_watch_zombify(watch); + mutex_exit(&state->ins_lock); +} + +/* + * Insert a new child watch. Should only be called from VOP context when + * a child is created in a watched directory. + */ +static void +inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = vp }; + + if (!(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) { + mutex_exit(&state->ins_lock); + return; + } + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + return; + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask); + VERIFY(watch != NULL); + + mutex_exit(&state->ins_lock); +} + + +static int +inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask, + int32_t *wdp) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + uint32_t set; + + set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE; + + /* + * Lookup our vnode to determine if we already have a watch on it. + */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * We don't have this watch; allocate a new one, provided that + * we have fewer than our limit. + */ + if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) { + mutex_exit(&state->ins_lock); + return (ENOSPC); + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, NULL, NULL, vp, set); + *wdp = watch->inw_wd; + mutex_exit(&state->ins_lock); + + return (0); + } + + VERIFY(!watch->inw_zombie); + + if (!(mask & IN_MASK_ADD)) { + /* + * Note that if we're resetting our event mask and we're + * transitioning from an event mask that includes child events + * to one that doesn't, there will be potentially some stale + * child watches. This is basically fine: they won't fire, + * and they will correctly be removed when the watch is + * removed. + */ + watch->inw_mask = 0; + } + + watch->inw_mask |= set; + + *wdp = watch->inw_wd; + + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + vnode_t *cvp; + int err; + + /* + * Verify that the specified child doesn't have a directory component + * within it. + */ + if (strchr(name, '/') != NULL) + return (EINVAL); + + /* + * Lookup the underlying file. Note that this will succeed even if + * we don't have permissions to actually read the file. + */ + if ((err = lookupnameat(name, + UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) { + return (err); + } + + /* + * Use our vnode to find our watch, and then add our child watch to it. + */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * This is unexpected -- it means that we don't have the + * watch that we thought we had. + */ + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (ENXIO); + } + + /* + * Now lookup the child vnode in the watch; we'll only add it if it + * isn't already there. 
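The ioctl handled here suggests how pre-existing directory entries get their child watches: the driver only adds them dynamically as entries are created, linked or renamed in, so presumably the user-level consumer walks the directory once and issues INOTIFYIOC_ADD_CHILD per entry. The sketch below is that assumed pattern, not something this file dictates; the structure and ioctl names are the ones defined for this driver.

#include <dirent.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/inotify.h>

/* Hypothetical front-end helper -- illustrative only. */
static void
prime_children(int nfd, int dfd, const char *dir)
{
	DIR *dp = opendir(dir);
	struct dirent *de;
	inotify_addchild_t ac;

	while (dp != NULL && (de = readdir(dp)) != NULL) {
		if (strcmp(de->d_name, ".") == 0 ||
		    strcmp(de->d_name, "..") == 0)
			continue;

		ac.inac_fd = dfd;	/* fd of the watched directory */
		ac.inac_name = de->d_name;
		(void) ioctl(nfd, INOTIFYIOC_ADD_CHILD, &ac);
	}

	if (dp != NULL)
		(void) closedir(dp);
}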
+ */ + cmp.inw_vp = cvp; + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (0); + } + + watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask); + VERIFY(watch != NULL); + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_rm_watch(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + inotify_watch_remove(state, watch); + mutex_exit(&state->ins_lock); + + /* + * Because removing a watch will generate an IN_IGNORED event (and + * because inotify_watch_remove() won't alone induce a pollwakeup()), + * we need to explicitly issue a pollwakeup(). + */ + pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN); + + return (0); +} + +static int +inotify_activate(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + watch->inw_active = 1; + + mutex_exit(&state->ins_lock); + + return (0); +} + +/* + * Called periodically as a cyclic to process the orphans and zombies. + */ +static void +inotify_clean(void *arg) +{ + inotify_state_t *state = arg; + inotify_watch_t *watch, *parent, *next, **prev; + cred_t *savecred; + int err; + + mutex_enter(&state->ins_lock); + + for (watch = list_head(&state->ins_orphans); + watch != NULL; watch = next) { + next = list_next(&state->ins_orphans, watch); + + VERIFY(!watch->inw_zombie); + VERIFY((parent = watch->inw_parent) != NULL); + + if (watch->inw_vp->v_count > 1) + continue; + + avl_remove(&parent->inw_children, watch); + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + list_remove(&state->ins_orphans, watch); + + /* + * For purposes of releasing the vnode, we need to switch our + * cred to be the cred of the orphaning thread (which we held + * at the time this watch was orphaned). 
+ */ + savecred = curthread->t_cred; + curthread->t_cred = watch->inw_cred; + VN_RELE(watch->inw_vp); + crfree(watch->inw_cred); + curthread->t_cred = savecred; + + inotify_watch_zombify(watch); + } + + prev = &state->ins_zombies; + + while ((watch = *prev) != NULL) { + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + *prev = watch->inw_parent; + inotify_watch_destroy(watch); + continue; + } + + prev = &watch->inw_parent; + mutex_exit(&watch->inw_lock); + } + + mutex_exit(&state->ins_lock); +} + +/*ARGSUSED*/ +static int +inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state; + major_t major = getemajor(*devp); + minor_t minor = getminor(*devp); + int instances = 0; + char c[64]; + + if (minor != INOTIFYMNRN_INOTIFY) + return (ENXIO); + + mutex_enter(&inotify_lock); + + for (state = inotify_state; state != NULL; state = state->ins_next) { + if (state->ins_cred == cred_p) + instances++; + } + + if (instances >= inotify_maxinstances) { + mutex_exit(&inotify_lock); + return (EMFILE); + } + + minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1, + VM_BESTFIT | VM_SLEEP); + + if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) { + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + mutex_exit(&inotify_lock); + return (NULL); + } + + state = ddi_get_soft_state(inotify_softstate, minor); + *devp = makedevice(major, minor); + + crhold(cred_p); + state->ins_cred = cred_p; + state->ins_next = inotify_state; + inotify_state = state; + + (void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor); + state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1, + NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); + + avl_create(&state->ins_bywd, + (int(*)(const void *, const void *))inotify_watch_cmpwd, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_bywd)); + + avl_create(&state->ins_byvp, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + + list_create(&state->ins_orphans, sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_orphan)); + + state->ins_maxwatches = inotify_maxwatches; + state->ins_maxevents = inotify_maxevents; + + mutex_exit(&inotify_lock); + + state->ins_cleaner = ddi_periodic_add(inotify_clean, + state, NANOSEC, DDI_IPL_0); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + inotify_state_t *state; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + int err = 0, nevents = 0; + size_t len; + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + while (state->ins_head == NULL) { + if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + mutex_exit(&state->ins_lock); + return (EAGAIN); + } + + if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) { + mutex_exit(&state->ins_lock); + return (EINTR); + } + } + + /* + * We have events and we have our lock; return as many as we can. 
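Putting the device interface together from the consumer side: the sketch below is hedged and illustrative only. It assumes the device node is /dev/inotify and that, as the ioctl handler later in this file suggests, INOTIFYIOC_ADD_WATCH hands the watch descriptor back as the ioctl return value. The structure, ioctl and flag names are the ones defined for this driver, and the read loop walks the variable-length records exactly as they are copied out here: a fixed struct inotify_event header followed by len bytes of name.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/inotify.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical consumer -- illustrative only, error handling omitted. */
static void
watch_and_drain(const char *dir)
{
	int nfd = open("/dev/inotify", O_RDONLY);
	int dfd = open(dir, O_RDONLY);
	inotify_addwatch_t aw = { 0 };
	char buf[8192];
	ssize_t n;
	int wd;

	aw.inaw_fd = dfd;
	aw.inaw_mask = IN_CREATE | IN_DELETE | IN_MODIFY;

	/* The watch descriptor comes back as the ioctl return value. */
	wd = ioctl(nfd, INOTIFYIOC_ADD_WATCH, &aw);

	/* No events are queued for the watch until it is activated. */
	(void) ioctl(nfd, INOTIFYIOC_ACTIVATE, wd);

	while ((n = read(nfd, buf, sizeof (buf))) > 0) {
		char *p = buf;

		while (p < buf + n) {
			struct inotify_event *ev =
			    (struct inotify_event *)(void *)p;

			(void) printf("wd %d mask 0x%x name %s\n",
			    ev->wd, ev->mask,
			    ev->len != 0 ? ev->name : "");

			/* Each record is a fixed header plus len bytes. */
			p += sizeof (struct inotify_event) + ev->len;
		}
	}

	(void) close(dfd);
	(void) close(nfd);
}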
+ */ + while ((event = state->ins_head) != NULL) { + len = sizeof (event->ine_event) + event->ine_event.len; + + if (uio->uio_resid < len) { + if (nevents == 0) + err = EINVAL; + break; + } + + nevents++; + + if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0) + break; + + VERIFY(state->ins_nevents > 0); + state->ins_nevents--; + + VERIFY(state->ins_size > 0); + state->ins_size -= len; + + if ((state->ins_head = event->ine_next) == NULL) { + VERIFY(event == state->ins_tail); + VERIFY(state->ins_nevents == 0); + state->ins_tail = NULL; + } + + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + mutex_exit(&state->ins_lock); + + return (err); +} + +static int +inotify_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + if (state->ins_head != NULL) { + *reventsp = events & (POLLRDNORM | POLLIN); + } else { + *reventsp = 0; + } + + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = &state->ins_pollhd; + } + + mutex_exit(&state->ins_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + file_t *fp; + int rval; + + state = ddi_get_soft_state(inotify_softstate, minor); + + switch (cmd) { + case INOTIFYIOC_ADD_WATCH: { + inotify_addwatch_t addwatch; + file_t *fp; + + if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0) + return (EFAULT); + + if ((fp = getf(addwatch.inaw_fd)) == NULL) + return (EBADF); + + rval = inotify_add_watch(state, fp->f_vnode, + addwatch.inaw_mask, rv); + + releasef(addwatch.inaw_fd); + return (rval); + } + + case INOTIFYIOC_ADD_CHILD: { + inotify_addchild_t addchild; + char name[MAXPATHLEN]; + + if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0) + return (EFAULT); + + if (copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0) + return (EFAULT); + + if ((fp = getf(addchild.inac_fd)) == NULL) + return (EBADF); + + rval = inotify_add_child(state, fp->f_vnode, name); + + releasef(addchild.inac_fd); + return (rval); + } + + case INOTIFYIOC_RM_WATCH: + return (inotify_rm_watch(state, arg)); + + case INOTIFYIOC_ACTIVATE: + return (inotify_activate(state, arg)); + + case FIONREAD: { + int32_t size; + + mutex_enter(&state->ins_lock); + size = state->ins_size; + mutex_exit(&state->ins_lock); + + if (copyout(&size, (void *)arg, sizeof (size)) != 0) + return (EFAULT); + + return (0); + } + + default: + break; + } + + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state, **sp; + inotify_watch_t *watch, *zombies; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + if (state->ins_pollhd.ph_list != NULL) { + pollwakeup(&state->ins_pollhd, POLLERR); + pollhead_clean(&state->ins_pollhd); + } + + mutex_enter(&state->ins_lock); + + /* + * First, destroy all of our watches. + */ + while ((watch = avl_first(&state->ins_bywd)) != NULL) + inotify_watch_remove(state, watch); + + /* + * And now destroy our event queue. 
+ */ + while ((event = state->ins_head) != NULL) { + state->ins_head = event->ine_next; + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + zombies = state->ins_zombies; + state->ins_zombies = NULL; + mutex_exit(&state->ins_lock); + + /* + * Now that our state lock is dropped, we can synchronously wait on + * any zombies. + */ + while ((watch = zombies) != NULL) { + zombies = zombies->inw_parent; + + mutex_enter(&watch->inw_lock); + + while (watch->inw_refcnt > 1) + cv_wait(&watch->inw_cv, &watch->inw_lock); + + inotify_watch_destroy(watch); + } + + if (state->ins_cleaner != NULL) { + ddi_periodic_delete(state->ins_cleaner); + state->ins_cleaner = NULL; + } + + mutex_enter(&inotify_lock); + + /* + * Remove our state from our global list, and release our hold on + * the cred. + */ + for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next)) + VERIFY(*sp != NULL); + + *sp = (*sp)->ins_next; + crfree(state->ins_cred); + vmem_destroy(state->ins_wds); + + ddi_soft_state_free(inotify_softstate, minor); + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + + mutex_exit(&inotify_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + mutex_enter(&inotify_lock); + + if (ddi_soft_state_init(&inotify_softstate, + sizeof (inotify_state_t), 0) != 0) { + cmn_err(CE_NOTE, "/dev/inotify failed to create soft state"); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "inotify", S_IFCHR, + INOTIFYMNRN_INOTIFY, DDI_PSEUDO, NULL) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node"); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (fem_create("inotify_fem", + inotify_vnodesrc_template, &inotify_femp) != 0) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state"); + ddi_remove_minor_node(devi, NULL); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + inotify_devi = devi; + + inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE, + UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_SUSPEND: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&inotify_lock); + fem_free(inotify_femp); + vmem_destroy(inotify_minor); + + ddi_remove_minor_node(inotify_devi, NULL); + inotify_devi = NULL; + + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)inotify_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} + +static struct cb_ops inotify_cb_ops = { + inotify_open, /* open */ + inotify_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + inotify_read, /* read */ + nodev, /* write */ + inotify_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + inotify_poll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + 
D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops inotify_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + inotify_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + inotify_attach, /* attach */ + inotify_detach, /* detach */ + nodev, /* reset */ + &inotify_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "inotify support", /* name of module */ + &inotify_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/io/inotify.conf b/usr/src/uts/common/io/inotify.conf new file mode 100644 index 0000000000..ce9da6180f --- /dev/null +++ b/usr/src/uts/common/io/inotify.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014 Joyent, Inc. All rights reserved. +# + +name="inotify" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c index c10e23a8a6..cde57df235 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.c @@ -996,17 +996,20 @@ static s32 ixgbe_clear_vmdq_82598(struct ixgbe_hw *hw, u32 rar, u32 vmdq) * @vlan: VLAN id to write to VLAN filter * @vind: VMDq output index that maps queue to VLAN id in VFTA * @vlan_on: boolean flag to turn on/off VLAN in VFTA + * @vlvf_bypass: boolean flag - unused * * Turn on/off specified VLAN in the VLAN filter table. 
**/ s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on) + bool vlan_on, bool vlvf_bypass) { u32 regindex; u32 bitindex; u32 bits; u32 vftabyte; + UNREFERENCED_1PARAMETER(vlvf_bypass); + DEBUGFUNC("ixgbe_set_vfta_82598"); if (vlan > 4095) diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h index d2241c70cd..c32672187a 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_82598.h @@ -40,7 +40,8 @@ s32 ixgbe_fc_enable_82598(struct ixgbe_hw *hw); s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw); void ixgbe_enable_relaxed_ordering_82598(struct ixgbe_hw *hw); s32 ixgbe_set_vmdq_82598(struct ixgbe_hw *hw, u32 rar, u32 vmdq); -s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on); +s32 ixgbe_set_vfta_82598(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on, + bool vlvf_bypass); s32 ixgbe_read_analog_reg8_82598(struct ixgbe_hw *hw, u32 reg, u8 *val); s32 ixgbe_write_analog_reg8_82598(struct ixgbe_hw *hw, u32 reg, u8 val); s32 ixgbe_read_i2c_eeprom_82598(struct ixgbe_hw *hw, u8 byte_offset, diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c index 894d0b2ac9..c550982710 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.c @@ -1057,33 +1057,38 @@ s32 ixgbe_clear_vfta(struct ixgbe_hw *hw) * ixgbe_set_vfta - Set VLAN filter table * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFTA - * @vlan_on: boolean flag to turn on/off VLAN in VFTA + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN + * @vlvf_bypass: boolean flag indicating updating the default pool is okay * * Turn on/off specified VLAN in the VLAN filter table. **/ -s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on) +s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on, + bool vlvf_bypass) { return ixgbe_call_func(hw, hw->mac.ops.set_vfta, (hw, vlan, vind, - vlan_on), IXGBE_NOT_IMPLEMENTED); + vlan_on, vlvf_bypass), IXGBE_NOT_IMPLEMENTED); } /** * ixgbe_set_vlvf - Set VLAN Pool Filter * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFVFB - * @vlan_on: boolean flag to turn on/off VLAN in VFVF - * @vfta_changed: pointer to boolean flag which indicates whether VFTA - * should be changed + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN in VLVF + * @vfta_delta: pointer to the difference between the current value of VFTA + * and the desired value + * @vfta: the desired value of the VFTA + * @vlvf_bypass: boolean flag indicating updating the default pool is okay * * Turn on/off specified bit in VLVF table. 
**/ s32 ixgbe_set_vlvf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on, - bool *vfta_changed) + u32 *vfta_delta, u32 vfta, bool vlvf_bypass) { return ixgbe_call_func(hw, hw->mac.ops.set_vlvf, (hw, vlan, vind, - vlan_on, vfta_changed), IXGBE_NOT_IMPLEMENTED); + vlan_on, vfta_delta, vfta, vlvf_bypass), + IXGBE_NOT_IMPLEMENTED); } /** diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h index 24d507039d..3bee89e45e 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_api.h @@ -125,9 +125,10 @@ s32 ixgbe_enable_mc(struct ixgbe_hw *hw); s32 ixgbe_disable_mc(struct ixgbe_hw *hw); s32 ixgbe_clear_vfta(struct ixgbe_hw *hw); s32 ixgbe_set_vfta(struct ixgbe_hw *hw, u32 vlan, - u32 vind, bool vlan_on); + u32 vind, bool vlan_on, bool vlvf_bypass); s32 ixgbe_set_vlvf(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on, bool *vfta_changed); + bool vlan_on, u32 *vfta_delta, u32 vfta, + bool vlvf_bypass); s32 ixgbe_fc_enable(struct ixgbe_hw *hw); s32 ixgbe_setup_fc(struct ixgbe_hw *hw); s32 ixgbe_set_fw_drv_ver(struct ixgbe_hw *hw, u8 maj, u8 min, u8 build, diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c index f342eee637..656534862c 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.c @@ -3810,68 +3810,65 @@ s32 ixgbe_init_uta_tables_generic(struct ixgbe_hw *hw) * return the VLVF index where this VLAN id should be placed * **/ -s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan) +s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan, bool vlvf_bypass) { - u32 bits = 0; - u32 first_empty_slot = 0; - s32 regindex; + s32 regindex, first_empty_slot; + u32 bits; /* short cut the special case */ if (vlan == 0) return 0; - /* - * Search for the vlan id in the VLVF entries. Save off the first empty - * slot found along the way - */ - for (regindex = 1; regindex < IXGBE_VLVF_ENTRIES; regindex++) { + /* if vlvf_bypass is set we don't want to use an empty slot, we + * will simply bypass the VLVF if there are no entries present in the + * VLVF that contain our VLAN + */ + first_empty_slot = vlvf_bypass ? IXGBE_ERR_NO_SPACE : 0; + + /* add VLAN enable bit for comparison */ + vlan |= IXGBE_VLVF_VIEN; + + /* Search for the vlan id in the VLVF entries. Save off the first empty + * slot found along the way. + * + * pre-decrement loop covering (IXGBE_VLVF_ENTRIES - 1) .. 1 + */ + for (regindex = IXGBE_VLVF_ENTRIES; --regindex;) { bits = IXGBE_READ_REG(hw, IXGBE_VLVF(regindex)); - if (!bits && !(first_empty_slot)) + if (bits == vlan) + return regindex; + if (!first_empty_slot && !bits) first_empty_slot = regindex; - else if ((bits & 0x0FFF) == vlan) - break; } - /* - * If regindex is less than IXGBE_VLVF_ENTRIES, then we found the vlan - * in the VLVF. Else use the first empty VLVF register for this - * vlan id. - */ - if (regindex >= IXGBE_VLVF_ENTRIES) { - if (first_empty_slot) - regindex = first_empty_slot; - else { - ERROR_REPORT1(IXGBE_ERROR_SOFTWARE, - "No space in VLVF.\n"); - regindex = IXGBE_ERR_NO_SPACE; - } - } + /* If we are here then we didn't find the VLAN. Return first empty + * slot we found during our search, else error. + */ + if (!first_empty_slot) + ERROR_REPORT1(IXGBE_ERROR_SOFTWARE, "No space in VLVF.\n"); - return regindex; + return first_empty_slot ? 
first_empty_slot : IXGBE_ERR_NO_SPACE; } /** * ixgbe_set_vfta_generic - Set VLAN filter table * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFVFB - * @vlan_on: boolean flag to turn on/off VLAN in VFVF + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN + * @vlvf_bypass: boolean flag indicating updating default pool is okay * * Turn on/off specified VLAN in the VLAN filter table. **/ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on) + bool vlan_on, bool vlvf_bypass) { - s32 regindex; - u32 bitindex; - u32 vfta; - u32 targetbit; - s32 ret_val = IXGBE_SUCCESS; - bool vfta_changed = FALSE; + u32 regidx, vfta_delta, vfta; + s32 ret_val; DEBUGFUNC("ixgbe_set_vfta_generic"); - if (vlan > 4095) + if (vlan > 4095 || vind > 63) return IXGBE_ERR_PARAM; /* @@ -3886,33 +3883,33 @@ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, * bits[11-5]: which register * bits[4-0]: which bit in the register */ - regindex = (vlan >> 5) & 0x7F; - bitindex = vlan & 0x1F; - targetbit = (1 << bitindex); - vfta = IXGBE_READ_REG(hw, IXGBE_VFTA(regindex)); - - if (vlan_on) { - if (!(vfta & targetbit)) { - vfta |= targetbit; - vfta_changed = TRUE; - } - } else { - if ((vfta & targetbit)) { - vfta &= ~targetbit; - vfta_changed = TRUE; - } - } + regidx = vlan / 32; + vfta_delta = 1 << (vlan % 32); + vfta = IXGBE_READ_REG(hw, IXGBE_VFTA(regidx)); + + /* + * vfta_delta represents the difference between the current value + * of vfta and the value we want in the register. Since the diff + * is an XOR mask we can just update the vfta using an XOR + */ + vfta_delta &= vlan_on ? ~vfta : vfta; + vfta ^= vfta_delta; /* Part 2 * Call ixgbe_set_vlvf_generic to set VLVFB and VLVF */ - ret_val = ixgbe_set_vlvf_generic(hw, vlan, vind, vlan_on, - &vfta_changed); - if (ret_val != IXGBE_SUCCESS) + ret_val = ixgbe_set_vlvf_generic(hw, vlan, vind, vlan_on, &vfta_delta, + vfta, vlvf_bypass); + if (ret_val != IXGBE_SUCCESS) { + if (vlvf_bypass) + goto vfta_update; return ret_val; + } - if (vfta_changed) - IXGBE_WRITE_REG(hw, IXGBE_VFTA(regindex), vfta); +vfta_update: + /* Update VFTA now that we are ready for traffic */ + if (vfta_delta) + IXGBE_WRITE_REG(hw, IXGBE_VFTA(regidx), vfta); return IXGBE_SUCCESS; } @@ -3921,21 +3918,25 @@ s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, * ixgbe_set_vlvf_generic - Set VLAN Pool Filter * @hw: pointer to hardware structure * @vlan: VLAN id to write to VLAN filter - * @vind: VMDq output index that maps queue to VLAN id in VFVFB - * @vlan_on: boolean flag to turn on/off VLAN in VFVF - * @vfta_changed: pointer to boolean flag which indicates whether VFTA - * should be changed + * @vind: VMDq output index that maps queue to VLAN id in VLVFB + * @vlan_on: boolean flag to turn on/off VLAN in VLVF + * @vfta_delta: pointer to the difference between the current value of VFTA + * and the desired value + * @vfta: the desired value of the VFTA + * @vlvf_bypass: boolean flag indicating updating default pool is okay * * Turn on/off specified bit in VLVF table. 
**/ s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on, bool *vfta_changed) + bool vlan_on, u32 *vfta_delta, u32 vfta, + bool vlvf_bypass) { - u32 vt; + u32 bits; + s32 vlvf_index; DEBUGFUNC("ixgbe_set_vlvf_generic"); - if (vlan > 4095) + if (vlan > 4095 || vind > 63) return IXGBE_ERR_PARAM; /* If VT Mode is set @@ -3945,83 +3946,60 @@ s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, * Or !vlan_on * clear the pool bit and possibly the vind */ - vt = IXGBE_READ_REG(hw, IXGBE_VT_CTL); - if (vt & IXGBE_VT_CTL_VT_ENABLE) { - s32 vlvf_index; - u32 bits; - - vlvf_index = ixgbe_find_vlvf_slot(hw, vlan); - if (vlvf_index < 0) - return vlvf_index; - - if (vlan_on) { - /* set the pool bit */ - if (vind < 32) { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB(vlvf_index * 2)); - bits |= (1 << vind); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB(vlvf_index * 2), - bits); - } else { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1)); - bits |= (1 << (vind - 32)); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1), - bits); - } - } else { - /* clear the pool bit */ - if (vind < 32) { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB(vlvf_index * 2)); - bits &= ~(1 << vind); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB(vlvf_index * 2), - bits); - bits |= IXGBE_READ_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1)); - } else { - bits = IXGBE_READ_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1)); - bits &= ~(1 << (vind - 32)); - IXGBE_WRITE_REG(hw, - IXGBE_VLVFB((vlvf_index * 2) + 1), - bits); - bits |= IXGBE_READ_REG(hw, - IXGBE_VLVFB(vlvf_index * 2)); - } - } + if (!(IXGBE_READ_REG(hw, IXGBE_VT_CTL) & IXGBE_VT_CTL_VT_ENABLE)) + return IXGBE_SUCCESS; - /* - * If there are still bits set in the VLVFB registers - * for the VLAN ID indicated we need to see if the - * caller is requesting that we clear the VFTA entry bit. - * If the caller has requested that we clear the VFTA - * entry bit but there are still pools/VFs using this VLAN - * ID entry then ignore the request. We're not worried - * about the case where we're turning the VFTA VLAN ID - * entry bit on, only when requested to turn it off as - * there may be multiple pools and/or VFs using the - * VLAN ID entry. In that case we cannot clear the - * VFTA bit until all pools/VFs using that VLAN ID have also - * been cleared. This will be indicated by "bits" being - * zero. + vlvf_index = ixgbe_find_vlvf_slot(hw, vlan, vlvf_bypass); + if (vlvf_index < 0) + return vlvf_index; + + bits = IXGBE_READ_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32)); + + /* set the pool bit */ + bits |= 1 << (vind % 32); + if (vlan_on) + goto vlvf_update; + + /* clear the pool bit */ + bits ^= 1 << (vind % 32); + + if (!bits && + !IXGBE_READ_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + 1 - vind / 32))) { + /* Clear VFTA first, then disable VLVF. Otherwise + * we run the risk of stray packets leaking into + * the PF via the default pool */ - if (bits) { - IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), - (IXGBE_VLVF_VIEN | vlan)); - if ((!vlan_on) && (vfta_changed != NULL)) { - /* someone wants to clear the vfta entry - * but some pools/VFs are still using it. - * Ignore it. 
*/ - *vfta_changed = FALSE; - } - } else - IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), 0); + if (*vfta_delta) + IXGBE_WRITE_REG(hw, IXGBE_VFTA(vlan / 32), vfta); + + /* disable VLVF and clear remaining bit from pool */ + IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), 0); + IXGBE_WRITE_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32), 0); + + return IXGBE_SUCCESS; } + /* If there are still bits set in the VLVFB registers + * for the VLAN ID indicated we need to see if the + * caller is requesting that we clear the VFTA entry bit. + * If the caller has requested that we clear the VFTA + * entry bit but there are still pools/VFs using this VLAN + * ID entry then ignore the request. We're not worried + * about the case where we're turning the VFTA VLAN ID + * entry bit on, only when requested to turn it off as + * there may be multiple pools and/or VFs using the + * VLAN ID entry. In that case we cannot clear the + * VFTA bit until all pools/VFs using that VLAN ID have also + * been cleared. This will be indicated by "bits" being + * zero. + */ + *vfta_delta = 0; + +vlvf_update: + /* record pool change and enable VLAN ID if not already enabled */ + IXGBE_WRITE_REG(hw, IXGBE_VLVFB(vlvf_index * 2 + vind / 32), bits); + IXGBE_WRITE_REG(hw, IXGBE_VLVF(vlvf_index), IXGBE_VLVF_VIEN | vlan); + return IXGBE_SUCCESS; } diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h index 069fc88c96..bd18e96f82 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_common.h @@ -135,11 +135,12 @@ s32 ixgbe_clear_vmdq_generic(struct ixgbe_hw *hw, u32 rar, u32 vmdq); s32 ixgbe_insert_mac_addr_generic(struct ixgbe_hw *hw, u8 *addr, u32 vmdq); s32 ixgbe_init_uta_tables_generic(struct ixgbe_hw *hw); s32 ixgbe_set_vfta_generic(struct ixgbe_hw *hw, u32 vlan, - u32 vind, bool vlan_on); + u32 vind, bool vlan_on, bool vlvf_bypass); s32 ixgbe_set_vlvf_generic(struct ixgbe_hw *hw, u32 vlan, u32 vind, - bool vlan_on, bool *vfta_changed); + bool vlan_on, u32 *vfta_delta, u32 vfta, + bool vlvf_bypass); s32 ixgbe_clear_vfta_generic(struct ixgbe_hw *hw); -s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan); +s32 ixgbe_find_vlvf_slot(struct ixgbe_hw *hw, u32 vlan, bool vlvf_bypass); s32 ixgbe_check_mac_link_generic(struct ixgbe_hw *hw, ixgbe_link_speed *speed, diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h index 45e8a7d029..9231979ff7 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_type.h @@ -3715,8 +3715,9 @@ struct ixgbe_mac_operations { s32 (*enable_mc)(struct ixgbe_hw *); s32 (*disable_mc)(struct ixgbe_hw *); s32 (*clear_vfta)(struct ixgbe_hw *); - s32 (*set_vfta)(struct ixgbe_hw *, u32, u32, bool); - s32 (*set_vlvf)(struct ixgbe_hw *, u32, u32, bool, bool *); + s32 (*set_vfta)(struct ixgbe_hw *, u32, u32, bool, bool); + s32 (*set_vlvf)(struct ixgbe_hw *, u32, u32, bool, u32 *, u32, + bool); s32 (*init_uta_tables)(struct ixgbe_hw *); void (*set_mac_anti_spoofing)(struct ixgbe_hw *, bool, int); void (*set_vlan_anti_spoofing)(struct ixgbe_hw *, bool, int); diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c index 2ce4d32a30..66d836eb8f 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.c @@ -321,15 +321,16 @@ static s32 ixgbe_mta_vector(struct ixgbe_hw *hw, u8 *mc_addr) return vector; } -static void 
ixgbevf_write_msg_read_ack(struct ixgbe_hw *hw, - u32 *msg, u16 size) +static s32 ixgbevf_write_msg_read_ack(struct ixgbe_hw *hw, u32 *msg, + u32 *retmsg, u16 size) { struct ixgbe_mbx_info *mbx = &hw->mbx; - u32 retmsg[IXGBE_VFMAILBOX_SIZE]; s32 retval = mbx->ops.write_posted(hw, msg, size, 0); - if (!retval) - mbx->ops.read_posted(hw, retmsg, size, 0); + if (retval) + return retval; + + return mbx->ops.read_posted(hw, retmsg, size, 0); } /** @@ -415,29 +416,29 @@ s32 ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, u8 *mc_addr_list, return mbx->ops.write_posted(hw, msgbuf, IXGBE_VFMAILBOX_SIZE, 0); } -/** +/* * ixgbe_set_vfta_vf - Set/Unset vlan filter table address * @hw: pointer to the HW structure * @vlan: 12 bit VLAN ID * @vind: unused by VF drivers * @vlan_on: if TRUE then set bit, else clear bit + * @vlvf_bypass: boolean flag indicating updating default pool is okay + * + * Turn on/off specified VLAN in the VLAN filter table. **/ -s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on) +s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, + bool vlan_on, bool vlvf_bypass) { - struct ixgbe_mbx_info *mbx = &hw->mbx; u32 msgbuf[2]; s32 ret_val; - UNREFERENCED_1PARAMETER(vind); + UNREFERENCED_2PARAMETER(vind, vlvf_bypass); msgbuf[0] = IXGBE_VF_SET_VLAN; msgbuf[1] = vlan; /* Setting the 8 bit field MSG INFO to TRUE indicates "add" */ msgbuf[0] |= vlan_on << IXGBE_VT_MSGINFO_SHIFT; - ret_val = mbx->ops.write_posted(hw, msgbuf, 2, 0); - if (!ret_val) - ret_val = mbx->ops.read_posted(hw, msgbuf, 1, 0); - + ret_val = ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2); if (!ret_val && (msgbuf[0] & IXGBE_VT_MSGTYPE_ACK)) return IXGBE_SUCCESS; @@ -628,7 +629,7 @@ void ixgbevf_rlpml_set_vf(struct ixgbe_hw *hw, u16 max_size) msgbuf[0] = IXGBE_VF_SET_LPE; msgbuf[1] = max_size; - ixgbevf_write_msg_read_ack(hw, msgbuf, 2); + ixgbevf_write_msg_read_ack(hw, msgbuf, msgbuf, 2); } /** diff --git a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h index edc801367d..e9b8dc34ae 100644 --- a/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h +++ b/usr/src/uts/common/io/ixgbe/core/ixgbe_vf.h @@ -132,7 +132,8 @@ s32 ixgbevf_set_uc_addr_vf(struct ixgbe_hw *hw, u32 index, u8 *addr); s32 ixgbe_update_mc_addr_list_vf(struct ixgbe_hw *hw, u8 *mc_addr_list, u32 mc_addr_count, ixgbe_mc_addr_itr, bool clear); -s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, bool vlan_on); +s32 ixgbe_set_vfta_vf(struct ixgbe_hw *hw, u32 vlan, u32 vind, + bool vlan_on, bool vlvf_bypass); void ixgbevf_rlpml_set_vf(struct ixgbe_hw *hw, u16 max_size); int ixgbevf_negotiate_api_version(struct ixgbe_hw *hw, int api); int ixgbevf_get_queues(struct ixgbe_hw *hw, unsigned int *num_tcs, diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c index 41464050bc..82c9bcdc36 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c @@ -25,7 +25,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2012 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2013 OSN Online Service Nuernberg GmbH. All rights reserved. 
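As an aside on the VFTA arithmetic introduced in the ixgbe_set_vfta_generic() hunk above: a 12-bit VLAN ID selects one bit in one of 128 32-bit VFTA registers (regidx = vlan / 32, bit = vlan % 32), and the update is carried as an XOR delta so the register bit is flipped only when it actually needs to change. Below is a minimal standalone sketch of that bookkeeping; it assumes nothing about the driver, and vfta_shadow/vfta_set are illustrative names, not ixgbe code.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Software stand-in for the 128 VFTA registers (4096 VLAN IDs / 32 bits). */
static uint32_t vfta_shadow[128];

static void
vfta_set(uint32_t vlan, bool vlan_on)
{
        uint32_t regidx = vlan / 32;            /* which VFTA register */
        uint32_t delta = 1u << (vlan % 32);     /* which bit within it */

        /*
         * Keep the delta only when the bit must change: enabling
         * requires it to be clear, disabling requires it to be set.
         */
        delta &= vlan_on ? ~vfta_shadow[regidx] : vfta_shadow[regidx];
        vfta_shadow[regidx] ^= delta;           /* XOR applies the change, if any */
}

int
main(void)
{
        vfta_set(100, true);    /* VLAN 100 -> register 3, bit 4 */
        vfta_set(100, true);    /* a repeated enable is a no-op */
        printf("VFTA[3] = 0x%08x\n", vfta_shadow[3]);   /* prints 0x00000010 */
        vfta_set(100, false);
        printf("VFTA[3] = 0x%08x\n", vfta_shadow[3]);   /* prints 0x00000000 */
        return (0);
}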
@@ -57,8 +57,8 @@ static int ixgbe_alloc_rings(ixgbe_t *); static void ixgbe_free_rings(ixgbe_t *); static int ixgbe_alloc_rx_data(ixgbe_t *); static void ixgbe_free_rx_data(ixgbe_t *); -static void ixgbe_setup_rings(ixgbe_t *); -static void ixgbe_setup_rx(ixgbe_t *); +static int ixgbe_setup_rings(ixgbe_t *); +static int ixgbe_setup_rx(ixgbe_t *); static void ixgbe_setup_tx(ixgbe_t *); static void ixgbe_setup_rx_ring(ixgbe_rx_ring_t *); static void ixgbe_setup_tx_ring(ixgbe_tx_ring_t *); @@ -67,6 +67,7 @@ static void ixgbe_setup_vmdq(ixgbe_t *); static void ixgbe_setup_vmdq_rss(ixgbe_t *); static void ixgbe_setup_rss_table(ixgbe_t *); static void ixgbe_init_unicst(ixgbe_t *); +static int ixgbe_init_vlan(ixgbe_t *); static int ixgbe_unicst_find(ixgbe_t *, const uint8_t *); static void ixgbe_setup_multicst(ixgbe_t *); static void ixgbe_get_hw_state(ixgbe_t *); @@ -113,6 +114,8 @@ static void ixgbe_intr_other_work(ixgbe_t *, uint32_t); static void ixgbe_get_driver_control(struct ixgbe_hw *); static int ixgbe_addmac(void *, const uint8_t *); static int ixgbe_remmac(void *, const uint8_t *); +static int ixgbe_addvlan(mac_group_driver_t, uint16_t); +static int ixgbe_remvlan(mac_group_driver_t, uint16_t); static void ixgbe_release_driver_control(struct ixgbe_hw *); static int ixgbe_attach(dev_info_t *, ddi_attach_cmd_t); @@ -273,7 +276,7 @@ static adapter_info_t ixgbe_82599eb_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -304,7 +307,7 @@ static adapter_info_t ixgbe_X540_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -1149,6 +1152,8 @@ ixgbe_init_driver_settings(ixgbe_t *ixgbe) rx_group = &ixgbe->rx_groups[i]; rx_group->index = i; rx_group->ixgbe = ixgbe; + list_create(&rx_group->vlans, sizeof (ixgbe_vlan_t), + offsetof(ixgbe_vlan_t, ixvl_link)); } for (i = 0; i < ixgbe->num_tx_rings; i++) { @@ -1898,7 +1903,8 @@ ixgbe_start(ixgbe_t *ixgbe, boolean_t alloc_buffer) /* * Setup the rx/tx rings */ - ixgbe_setup_rings(ixgbe); + if (ixgbe_setup_rings(ixgbe) != IXGBE_SUCCESS) + goto start_failure; /* * ixgbe_start() will be called when resetting, however if reset @@ -1999,6 +2005,7 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, void *arg1, void *arg2) { ixgbe_t *ixgbe = (ixgbe_t *)arg1; + int prev = ixgbe->intr_cnt; switch (cbaction) { /* IRM callback */ @@ -2012,7 +2019,8 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, if (ixgbe_intr_adjust(ixgbe, cbaction, count) != DDI_SUCCESS) { ixgbe_error(ixgbe, - "IRM CB: Failed to adjust interrupts"); + "IRM CB: Failed to adjust interrupts [%d %d %d]", + cbaction, count, prev); goto cb_fail; } break; @@ -2271,6 +2279,16 @@ ixgbe_free_rings(ixgbe_t *ixgbe) ixgbe->tx_rings = NULL; } + for (uint_t i = 0; i < ixgbe->num_rx_groups; i++) { + ixgbe_vlan_t *vlp; + ixgbe_rx_group_t *rx_group = &ixgbe->rx_groups[i]; + + while ((vlp = list_remove_head(&rx_group->vlans)) != NULL) + kmem_free(vlp, sizeof (ixgbe_vlan_t)); + + list_destroy(&rx_group->vlans); + } + if (ixgbe->rx_groups != NULL) { 
kmem_free(ixgbe->rx_groups, sizeof (ixgbe_rx_group_t) * ixgbe->num_rx_groups); @@ -2325,7 +2343,7 @@ ixgbe_free_rx_data(ixgbe_t *ixgbe) /* * ixgbe_setup_rings - Setup rx/tx rings. */ -static void +static int ixgbe_setup_rings(ixgbe_t *ixgbe) { /* @@ -2335,9 +2353,12 @@ ixgbe_setup_rings(ixgbe_t *ixgbe) * 2. Initialize necessary registers for receive/transmit; * 3. Initialize software pointers/parameters for receive/transmit; */ - ixgbe_setup_rx(ixgbe); + if (ixgbe_setup_rx(ixgbe) != IXGBE_SUCCESS) + return (IXGBE_FAILURE); ixgbe_setup_tx(ixgbe); + + return (IXGBE_SUCCESS); } static void @@ -2423,7 +2444,7 @@ ixgbe_setup_rx_ring(ixgbe_rx_ring_t *rx_ring) IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rx_ring->hw_index), reg_val); } -static void +static int ixgbe_setup_rx(ixgbe_t *ixgbe) { ixgbe_rx_ring_t *rx_ring; @@ -2517,6 +2538,15 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) } /* + * Initialize VLAN SW and HW state if VLAN filtering is + * enabled. + */ + if (ixgbe->vlft_enabled) { + if (ixgbe_init_vlan(ixgbe) != IXGBE_SUCCESS) + return (IXGBE_FAILURE); + } + + /* * Enable the receive unit. This must be done after filter * control is set in FCTRL. On 82598, we disable the descriptor monitor. * 82598 is the only adapter which defines this RXCTRL option. @@ -2598,6 +2628,8 @@ ixgbe_setup_rx(ixgbe_t *ixgbe) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, reg_val); } + + return (IXGBE_SUCCESS); } static void @@ -2829,7 +2861,7 @@ static void ixgbe_setup_vmdq(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; - uint32_t vmdctl, i, vtctl; + uint32_t vmdctl, i, vtctl, vlnctl; /* * Setup the VMDq Control register, enable VMDq based on @@ -2864,10 +2896,20 @@ ixgbe_setup_vmdq(ixgbe_t *ixgbe) /* * Enable Virtualization and Replication. */ - vtctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; + vtctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL); + ixgbe->rx_def_group = vtctl & IXGBE_VT_CTL_POOL_MASK; + vtctl |= IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vtctl); /* + * Enable VLAN filtering and switching (VFTA and VLVF). + */ + vlnctl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL); + vlnctl |= IXGBE_VLNCTRL_VFE; + IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctl); + ixgbe->vlft_enabled = B_TRUE; + + /* * Enable receiving packets to all VFs */ IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), IXGBE_VFRE_ENABLE_ALL); @@ -2887,7 +2929,7 @@ ixgbe_setup_vmdq_rss(ixgbe_t *ixgbe) { struct ixgbe_hw *hw = &ixgbe->hw; uint32_t i, mrqc; - uint32_t vtctl, vmdctl; + uint32_t vtctl, vmdctl, vlnctl; /* * Initialize RETA/ERETA table @@ -2969,10 +3011,21 @@ ixgbe_setup_vmdq_rss(ixgbe_t *ixgbe) /* * Enable Virtualization and Replication. */ + vtctl = IXGBE_READ_REG(hw, IXGBE_VT_CTL); + ixgbe->rx_def_group = vtctl & IXGBE_VT_CTL_POOL_MASK; + vtctl |= IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; vtctl = IXGBE_VT_CTL_VT_ENABLE | IXGBE_VT_CTL_REPLEN; IXGBE_WRITE_REG(hw, IXGBE_VT_CTL, vtctl); /* + * Enable VLAN filtering and switching (VFTA and VLVF). + */ + vlnctl = IXGBE_READ_REG(hw, IXGBE_VLNCTRL); + vlnctl |= IXGBE_VLNCTRL_VFE; + IXGBE_WRITE_REG(hw, IXGBE_VLNCTRL, vlnctl); + ixgbe->vlft_enabled = B_TRUE; + + /* * Enable receiving packets to all VFs */ IXGBE_WRITE_REG(hw, IXGBE_VFRE(0), IXGBE_VFRE_ENABLE_ALL); @@ -3143,6 +3196,53 @@ ixgbe_unicst_find(ixgbe_t *ixgbe, const uint8_t *mac_addr) } /* + * Restore the HW state to match the SW state during restart. + */ +static int +ixgbe_init_vlan(ixgbe_t *ixgbe) +{ + /* + * The device is starting for the first time; there is nothing + * to do. 
+ */ + if (!ixgbe->vlft_init) { + ixgbe->vlft_init = B_TRUE; + return (IXGBE_SUCCESS); + } + + for (uint_t i = 0; i < ixgbe->num_rx_groups; i++) { + int ret; + boolean_t vlvf_bypass; + ixgbe_rx_group_t *rxg = &ixgbe->rx_groups[i]; + struct ixgbe_hw *hw = &ixgbe->hw; + + if (rxg->aupe) { + uint32_t vml2flt; + + vml2flt = IXGBE_READ_REG(hw, IXGBE_VMOLR(rxg->index)); + vml2flt |= IXGBE_VMOLR_AUPE; + IXGBE_WRITE_REG(hw, IXGBE_VMOLR(rxg->index), vml2flt); + } + + vlvf_bypass = (rxg->index == ixgbe->rx_def_group); + for (ixgbe_vlan_t *vlp = list_head(&rxg->vlans); vlp != NULL; + vlp = list_next(&rxg->vlans, vlp)) { + ret = ixgbe_set_vfta(hw, vlp->ixvl_vid, rxg->index, + B_TRUE, vlvf_bypass); + + if (ret != IXGBE_SUCCESS) { + ixgbe_error(ixgbe, "Failed to program VFTA" + " for group %u, VID: %u, ret: %d.", + rxg->index, vlp->ixvl_vid, ret); + return (IXGBE_FAILURE); + } + } + } + + return (IXGBE_SUCCESS); +} + +/* * ixgbe_multicst_add - Add a multicst address. */ int @@ -6152,6 +6252,7 @@ ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index, mac_group_info_t *infop, mac_group_handle_t gh) { ixgbe_t *ixgbe = (ixgbe_t *)arg; + struct ixgbe_hw *hw = &ixgbe->hw; switch (rtype) { case MAC_RING_TYPE_RX: { @@ -6165,6 +6266,20 @@ ixgbe_fill_group(void *arg, mac_ring_type_t rtype, const int index, infop->mgi_stop = NULL; infop->mgi_addmac = ixgbe_addmac; infop->mgi_remmac = ixgbe_remmac; + + if ((ixgbe->classify_mode == IXGBE_CLASSIFY_VMDQ || + ixgbe->classify_mode == IXGBE_CLASSIFY_VMDQ_RSS) && + (hw->mac.type == ixgbe_mac_82599EB || + hw->mac.type == ixgbe_mac_X540 || + hw->mac.type == ixgbe_mac_X550 || + hw->mac.type == ixgbe_mac_X550EM_x)) { + infop->mgi_addvlan = ixgbe_addvlan; + infop->mgi_remvlan = ixgbe_remvlan; + } else { + infop->mgi_addvlan = NULL; + infop->mgi_remvlan = NULL; + } + infop->mgi_count = (ixgbe->num_rx_rings / ixgbe->num_rx_groups); break; @@ -6264,6 +6379,228 @@ ixgbe_rx_ring_intr_disable(mac_intr_handle_t intrh) return (0); } +static ixgbe_vlan_t * +ixgbe_find_vlan(ixgbe_rx_group_t *rx_group, uint16_t vid) +{ + for (ixgbe_vlan_t *vlp = list_head(&rx_group->vlans); vlp != NULL; + vlp = list_next(&rx_group->vlans, vlp)) { + if (vlp->ixvl_vid == vid) + return (vlp); + } + + return (NULL); +} + +/* + * Attempt to use a VLAN HW filter for this group. If the group is + * interested in untagged packets then set AUPE only. If the group is + * the default then only set the VFTA. Leave the VLVF slots open for + * reserved groups to guarantee their use of HW filtering. + */ +static int +ixgbe_addvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)gdriver; + ixgbe_t *ixgbe = rx_group->ixgbe; + struct ixgbe_hw *hw = &ixgbe->hw; + ixgbe_vlan_t *vlp; + int ret; + boolean_t is_def_grp; + + mutex_enter(&ixgbe->gen_lock); + + if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { + mutex_exit(&ixgbe->gen_lock); + return (ECANCELED); + } + + /* + * Let's be sure VLAN filtering is enabled. + */ + VERIFY3B(ixgbe->vlft_enabled, ==, B_TRUE); + is_def_grp = (rx_group->index == ixgbe->rx_def_group); + + /* + * VLAN filtering is enabled but we want to receive untagged + * traffic on this group -- set the AUPE bit on the group and + * leave the VLAN tables alone. + */ + if (vid == MAC_VLAN_UNTAGGED) { + /* + * We never enable AUPE on the default group; it is + * redundant. Untagged traffic which passes L2 + * filtering is delivered to the default group if no + * other group is interested. 
+ */ + if (!is_def_grp) { + uint32_t vml2flt; + + vml2flt = IXGBE_READ_REG(hw, + IXGBE_VMOLR(rx_group->index)); + vml2flt |= IXGBE_VMOLR_AUPE; + IXGBE_WRITE_REG(hw, IXGBE_VMOLR(rx_group->index), + vml2flt); + rx_group->aupe = B_TRUE; + } + + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + vlp = ixgbe_find_vlan(rx_group, vid); + if (vlp != NULL) { + /* Only the default group supports multiple clients. */ + VERIFY3B(is_def_grp, ==, B_TRUE); + vlp->ixvl_refs++; + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + /* + * The default group doesn't require a VLVF entry, only a VFTA + * entry. All traffic passing L2 filtering (MPSAR + VFTA) is + * delivered to the default group if no other group is + * interested. The fourth argument, vlvf_bypass, tells the + * ixgbe common code to avoid using a VLVF slot if one isn't + * already allocated to this VLAN. + * + * This logic is meant to reserve VLVF slots for use by + * reserved groups: guaranteeing their use of HW filtering. + */ + ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_TRUE, is_def_grp); + + if (ret == IXGBE_SUCCESS) { + vlp = kmem_zalloc(sizeof (ixgbe_vlan_t), KM_SLEEP); + vlp->ixvl_vid = vid; + vlp->ixvl_refs = 1; + list_insert_tail(&rx_group->vlans, vlp); + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + /* + * We should actually never return ENOSPC because we've set + * things up so that every reserved group is guaranteed to + * have a VLVF slot. + */ + if (ret == IXGBE_ERR_PARAM) + ret = EINVAL; + else if (ret == IXGBE_ERR_NO_SPACE) + ret = ENOSPC; + else + ret = EIO; + + mutex_exit(&ixgbe->gen_lock); + return (ret); +} + +/* + * Attempt to remove the VLAN HW filter associated with this group. If + * we are removing a HW filter for the default group then we know only + * the VFTA was set (VLVF is reserved for non-default/reserved + * groups). If the group wishes to stop receiving untagged traffic + * then clear the AUPE but leave the VLAN filters alone. + */ +static int +ixgbe_remvlan(mac_group_driver_t gdriver, uint16_t vid) +{ + ixgbe_rx_group_t *rx_group = (ixgbe_rx_group_t *)gdriver; + ixgbe_t *ixgbe = rx_group->ixgbe; + struct ixgbe_hw *hw = &ixgbe->hw; + int ret; + ixgbe_vlan_t *vlp; + boolean_t is_def_grp; + + mutex_enter(&ixgbe->gen_lock); + + if (ixgbe->ixgbe_state & IXGBE_SUSPENDED) { + mutex_exit(&ixgbe->gen_lock); + return (ECANCELED); + } + + is_def_grp = (rx_group->index == ixgbe->rx_def_group); + + /* See the AUPE comment in ixgbe_addvlan(). */ + if (vid == MAC_VLAN_UNTAGGED) { + if (!is_def_grp) { + uint32_t vml2flt; + + vml2flt = IXGBE_READ_REG(hw, + IXGBE_VMOLR(rx_group->index)); + vml2flt &= ~IXGBE_VMOLR_AUPE; + IXGBE_WRITE_REG(hw, + IXGBE_VMOLR(rx_group->index), vml2flt); + rx_group->aupe = B_FALSE; + } + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + vlp = ixgbe_find_vlan(rx_group, vid); + if (vlp == NULL) { + mutex_exit(&ixgbe->gen_lock); + return (ENOENT); + } + + /* + * See the comment in ixgbe_addvlan() about is_def_grp and + * vlvf_bypass. + */ + if (vlp->ixvl_refs == 1) { + ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_FALSE, + is_def_grp); + } else { + /* + * Only the default group can have multiple clients. + * If there is more than one client, leave the + * VFTA[vid] bit alone. + */ + VERIFY3B(is_def_grp, ==, B_TRUE); + VERIFY3U(vlp->ixvl_refs, >, 1); + vlp->ixvl_refs--; + mutex_exit(&ixgbe->gen_lock); + return (0); + } + + if (ret != IXGBE_SUCCESS) { + mutex_exit(&ixgbe->gen_lock); + /* IXGBE_ERR_PARAM should be the only possible error here. 
*/ + if (ret == IXGBE_ERR_PARAM) + return (EINVAL); + else + return (EIO); + + VERIFY3U(vlp->ixvl_refs, ==, 1); + vlp->ixvl_refs = 0; + list_remove(&rx_group->vlans, vlp); + kmem_free(vlp, sizeof (ixgbe_vlan_t)); + + /* + * Calling ixgbe_set_vfta() on a non-default group may have + * cleared the VFTA[vid] bit even though the default group + * still has clients using the vid. This happens because the + * ixgbe common code doesn't ref count the use of VLANs. Check + * for any use of vid on the default group and make sure the + * VFTA[vid] bit is set. This operation is idempotent: setting + * VFTA[vid] to true if already true won't hurt anything. + */ + if (!is_def_grp) { + ixgbe_rx_group_t *defgrp; + + defgrp = &ixgbe->rx_groups[ixgbe->rx_def_group]; + vlp = ixgbe_find_vlan(defgrp, vid); + if (vlp != NULL) { + /* This shouldn't fail, but if it does return EIO. */ + ret = ixgbe_set_vfta(hw, vid, rx_group->index, B_TRUE, + B_TRUE); + if (ret != IXGBE_SUCCESS) { + mutex_exit(&ixgbe->gen_lock); + return (EIO); + } + } + } + + mutex_exit(&ixgbe->gen_lock); + return (0); +} + /* * Add a mac address. */ diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h index ca52b10c89..baa4766c0e 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_sw.h +++ b/usr/src/uts/common/io/ixgbe/ixgbe_sw.h @@ -27,7 +27,7 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 Saso Kiselkov. All rights reserved. * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _IXGBE_SW_H @@ -91,6 +91,8 @@ extern "C" { #define MAX_NUM_UNICAST_ADDRESSES 0x80 #define MAX_NUM_MULTICAST_ADDRESSES 0x1000 +#define MAX_NUM_VLAN_FILTERS 0x40 + #define IXGBE_INTR_NONE 0 #define IXGBE_INTR_MSIX 1 #define IXGBE_INTR_MSI 2 @@ -387,6 +389,15 @@ typedef union ixgbe_ether_addr { } mac; } ixgbe_ether_addr_t; +/* + * The list of VLANs an Rx group will accept. + */ +typedef struct ixgbe_vlan { + list_node_t ixvl_link; + uint16_t ixvl_vid; /* The VLAN ID */ + uint_t ixvl_refs; /* Number of users of this VLAN */ +} ixgbe_vlan_t; + typedef enum { USE_NONE, USE_COPY, @@ -589,6 +600,7 @@ typedef struct ixgbe_rx_ring { struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ } ixgbe_rx_ring_t; + /* * Software Receive Ring Group */ @@ -596,6 +608,8 @@ typedef struct ixgbe_rx_group { uint32_t index; /* Group index */ mac_group_handle_t group_handle; /* call back group handle */ struct ixgbe *ixgbe; /* Pointer to ixgbe struct */ + boolean_t aupe; /* AUPE bit */ + list_t vlans; /* list of VLANs to allow */ } ixgbe_rx_group_t; /* @@ -662,6 +676,7 @@ typedef struct ixgbe { */ ixgbe_rx_group_t *rx_groups; /* Array of rx groups */ uint32_t num_rx_groups; /* Number of rx groups in use */ + uint32_t rx_def_group; /* Default Rx group index */ /* * Transmit Rings @@ -715,6 +730,9 @@ typedef struct ixgbe { uint32_t mcast_count; struct ether_addr mcast_table[MAX_NUM_MULTICAST_ADDRESSES]; + boolean_t vlft_enabled; /* VLAN filtering enabled? */ + boolean_t vlft_init; /* VLAN filtering initialized? */ + ulong_t sys_page_size; boolean_t link_check_complete; diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index 17ac19b612..d96bfd3d75 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -22,7 +22,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. */ #include <sys/file.h> @@ -932,3 +932,15 @@ ksocket_rele(ksocket_t ks) cv_signal(&so->so_closing_cv); } } + +int +ksocket_krecv_set(ksocket_t ks, ksocket_krecv_f cb, void *arg) +{ + return (so_krecv_set(KSTOSO(ks), (so_krecv_f)cb, arg)); +} + +void +ksocket_krecv_unblock(ksocket_t ks) +{ + so_krecv_unblock(KSTOSO(ks)); +} diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h index ac5251540f..516a68d358 100644 --- a/usr/src/uts/common/io/ksocket/ksocket_impl.h +++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h @@ -22,11 +22,17 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ #ifndef _INET_KSOCKET_KSOCKET_IMPL_H #define _INET_KSOCKET_KSOCKET_IMPL_H +/* + * Note that if this relationship ever changes, the logic in ksocket_krecv_set + * must be updated and we must maintain local state about this on whatever the + * new ksocket object is. + */ #define KSTOSO(ks) ((struct sonode *)(ks)) #define SOTOKS(so) ((ksocket_t)(uintptr_t)(so)) diff --git a/usr/src/uts/common/io/ksyms.c b/usr/src/uts/common/io/ksyms.c index c9f0c63b69..5233fcd0b4 100644 --- a/usr/src/uts/common/io/ksyms.c +++ b/usr/src/uts/common/io/ksyms.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -219,6 +220,14 @@ ksyms_open(dev_t *devp, int flag, int otyp, struct cred *cred) char *addr; void *hptr = NULL; ksyms_buflist_hdr_t hdr; + + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); + bzero(&hdr, sizeof (struct ksyms_buflist_hdr)); list_create(&hdr.blist, PAGESIZE, offsetof(ksyms_buflist_t, buflist_node)); diff --git a/usr/src/uts/common/io/lofi.c b/usr/src/uts/common/io/lofi.c index 95f4cd7254..1169f3fdfc 100644 --- a/usr/src/uts/common/io/lofi.c +++ b/usr/src/uts/common/io/lofi.c @@ -24,6 +24,7 @@ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Andrey Sokolov * Copyright 2016 Toomas Soome <tsoome@me.com> + * Copyright 2019 Joyent, Inc. */ /* @@ -173,7 +174,7 @@ #define SIZE_PROP_NAME "Size" #define ZONE_PROP_NAME "zone" -#define SETUP_C_DATA(cd, buf, len) \ +#define SETUP_C_DATA(cd, buf, len) \ (cd).cd_format = CRYPTO_DATA_RAW; \ (cd).cd_offset = 0; \ (cd).cd_miscdata = NULL; \ @@ -553,7 +554,7 @@ lofi_destroy(struct lofi_state *lsp, cred_t *credp) } if (lsp->ls_vp != NULL) { - (void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_INVAL, credp, NULL); + (void) VOP_PUTPAGE(lsp->ls_vp, 0, 0, B_FREE, credp, NULL); (void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0, credp, NULL); VN_RELE(lsp->ls_vp); @@ -2934,7 +2935,7 @@ err: lofi_destroy(lsp, credp); } else { if (vp != NULL) { - (void) VOP_PUTPAGE(vp, 0, 0, B_INVAL, credp, NULL); + (void) VOP_PUTPAGE(vp, 0, 0, B_FREE, credp, NULL); (void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL); VN_RELE(vp); } diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index c386eb2941..a63a6a5c61 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc. 
* Copyright 2015 Garrett D'Amore <garrett@damore.org> */ @@ -158,7 +158,7 @@ * perimeter) across a call to any other layer from the mac layer. The call to * any other layer could be via mi_* entry points, classifier entry points into * the driver or via upcall pointers into layers above. The mac perimeter may - * be acquired or held only in the down direction, for e.g. when calling into + * be acquired or held only in the down direction, e.g. when calling into * a mi_* driver enty point to provide atomicity of the operation. * * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across @@ -207,7 +207,7 @@ * number whenever the ring's stop routine is invoked. * See comments in mac_rx_ring(); * - * R17 Similarly mi_stop is another synchronization point and the driver must + * R17. Similarly mi_stop is another synchronization point and the driver must * ensure that all upcalls are done and there won't be any future upcall * before returning from mi_stop. * @@ -227,7 +227,7 @@ * * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind] * - * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename] + * mac perim -> i_dls_devnet_lock [dls_devnet_rename] * * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac * client to driver. In the case of clients that explictly use the mac provided @@ -460,7 +460,7 @@ mac_init(void) mac_logging_interval = 20; mac_flow_log_enable = B_FALSE; mac_link_log_enable = B_FALSE; - mac_logging_timer = 0; + mac_logging_timer = NULL; /* Register to be notified of noteworthy pools events */ mac_pool_event_reg.pec_func = mac_pool_event_cb; @@ -1115,9 +1115,10 @@ mac_start(mac_handle_t mh) if ((defgrp = MAC_DEFAULT_RX_GROUP(mip)) != NULL) { /* - * Start the default ring, since it will be needed - * to receive broadcast and multicast traffic for - * both primary and non-primary MAC clients. + * Start the default group which is responsible + * for receiving broadcast and multicast + * traffic for both primary and non-primary + * MAC clients. */ ASSERT(defgrp->mrg_state == MAC_GROUP_STATE_REGISTERED); err = mac_start_group_and_rings(defgrp); @@ -1456,7 +1457,7 @@ mac_rx_group_unmark(mac_group_t *grp, uint_t flag) * used by the aggr driver to access and control the underlying HW Rx group * and rings. In this case, the aggr driver has exclusive control of the * underlying HW Rx group/rings, it calls the following functions to - * start/stop the HW Rx rings, disable/enable polling, add/remove mac' + * start/stop the HW Rx rings, disable/enable polling, add/remove MAC * addresses, or set up the Rx callback. */ /* ARGSUSED */ @@ -1501,8 +1502,9 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, ASSERT(B_FALSE); return (-1); } + /* - * The mac client did not reserve any RX group, return directly. + * The MAC client did not reserve an Rx group, return directly. * This is probably because the underlying MAC does not support * any groups. */ @@ -1511,7 +1513,7 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, if (grp == NULL) return (0); /* - * This group must be reserved by this mac client. + * This group must be reserved by this MAC client. */ ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) && (mcip == MAC_GROUP_ONLY_CLIENT(grp))); @@ -1527,6 +1529,77 @@ mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh, } /* + * Get the HW ring handles of the given group index. 
If the MAC + * doesn't have a group at this index, or any groups at all, then 0 is + * returned and hwgh is set to NULL. This is a private client API. The + * MAC perimeter must be held when calling this function. + * + * mh: A handle to the MAC that owns the group. + * + * idx: The index of the HW group to be read. + * + * hwgh: If non-NULL, contains a handle to the HW group on return. + * + * hwrh: An array of ring handles pointing to the HW rings in the + * group. The array must be large enough to hold a handle to each ring + * in the group. To be safe, this array should be of size MAX_RINGS_PER_GROUP. + * + * rtype: Used to determine if we are fetching Rx or Tx rings. + * + * Returns the number of rings in the group. + */ +uint_t +mac_hwrings_idx_get(mac_handle_t mh, uint_t idx, mac_group_handle_t *hwgh, + mac_ring_handle_t *hwrh, mac_ring_type_t rtype) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + mac_group_t *grp; + mac_ring_t *ring; + uint_t cnt = 0; + + /* + * The MAC perimeter must be held when accessing the + * mi_{rx,tx}_groups fields. + */ + ASSERT(MAC_PERIM_HELD(mh)); + ASSERT(rtype == MAC_RING_TYPE_RX || rtype == MAC_RING_TYPE_TX); + + if (rtype == MAC_RING_TYPE_RX) { + grp = mip->mi_rx_groups; + } else if (rtype == MAC_RING_TYPE_TX) { + grp = mip->mi_tx_groups; + } + + while (grp != NULL && grp->mrg_index != idx) + grp = grp->mrg_next; + + /* + * If the MAC doesn't have a group at this index or doesn't + * implement RINGS capab, then set hwgh to NULL and return 0. + */ + if (hwgh != NULL) + *hwgh = NULL; + + if (grp == NULL) + return (0); + + ASSERT3U(idx, ==, grp->mrg_index); + + for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next, cnt++) { + ASSERT3U(cnt, <, MAX_RINGS_PER_GROUP); + hwrh[cnt] = (mac_ring_handle_t)ring; + } + + /* A group should always have at least one ring. */ + ASSERT3U(cnt, >, 0); + + if (hwgh != NULL) + *hwgh = (mac_group_handle_t)grp; + + return (cnt); +} + +/* * This function is called to get info about Tx/Rx rings. * * Return value: returns uint_t which will have various bits set @@ -1542,6 +1615,69 @@ mac_hwring_getinfo(mac_ring_handle_t rh) } /* + * Set the passthru callback on the hardware ring. + */ +void +mac_hwring_set_passthru(mac_ring_handle_t hwrh, mac_rx_t fn, void *arg1, + mac_resource_handle_t arg2) +{ + mac_ring_t *hwring = (mac_ring_t *)hwrh; + + ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX); + + hwring->mr_classify_type = MAC_PASSTHRU_CLASSIFIER; + + hwring->mr_pt_fn = fn; + hwring->mr_pt_arg1 = arg1; + hwring->mr_pt_arg2 = arg2; +} + +/* + * Clear the passthru callback on the hardware ring. 
+ */ +void +mac_hwring_clear_passthru(mac_ring_handle_t hwrh) +{ + mac_ring_t *hwring = (mac_ring_t *)hwrh; + + ASSERT3S(hwring->mr_type, ==, MAC_RING_TYPE_RX); + + hwring->mr_classify_type = MAC_NO_CLASSIFIER; + + hwring->mr_pt_fn = NULL; + hwring->mr_pt_arg1 = NULL; + hwring->mr_pt_arg2 = NULL; +} + +void +mac_client_set_flow_cb(mac_client_handle_t mch, mac_rx_t func, void *arg1) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + + mutex_enter(&flent->fe_lock); + flent->fe_cb_fn = (flow_fn_t)func; + flent->fe_cb_arg1 = arg1; + flent->fe_cb_arg2 = NULL; + flent->fe_flags &= ~FE_MC_NO_DATAPATH; + mutex_exit(&flent->fe_lock); +} + +void +mac_client_clear_flow_cb(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + flow_entry_t *flent = mcip->mci_flent; + + mutex_enter(&flent->fe_lock); + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; + flent->fe_cb_arg1 = NULL; + flent->fe_cb_arg2 = NULL; + flent->fe_flags |= FE_MC_NO_DATAPATH; + mutex_exit(&flent->fe_lock); +} + +/* * Export ddi interrupt handles from the HW ring to the pseudo ring and * setup the RX callback of the mac client which exclusively controls * HW ring. @@ -1613,17 +1749,56 @@ mac_hwring_enable_intr(mac_ring_handle_t rh) return (intr->mi_enable(intr->mi_handle)); } +/* + * Start the HW ring pointed to by rh. + * + * This is used by special MAC clients that are MAC themselves and + * need to exert control over the underlying HW rings of the NIC. + */ int mac_hwring_start(mac_ring_handle_t rh) { mac_ring_t *rr_ring = (mac_ring_t *)rh; + int rv = 0; + + if (rr_ring->mr_state != MR_INUSE) + rv = mac_start_ring(rr_ring); + + return (rv); +} + +/* + * Stop the HW ring pointed to by rh. Also see mac_hwring_start(). + */ +void +mac_hwring_stop(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; + + if (rr_ring->mr_state != MR_FREE) + mac_stop_ring(rr_ring); +} + +/* + * Remove the quiesced flag from the HW ring pointed to by rh. + * + * This is used by special MAC clients that are MAC themselves and + * need to exert control over the underlying HW rings of the NIC. + */ +int +mac_hwring_activate(mac_ring_handle_t rh) +{ + mac_ring_t *rr_ring = (mac_ring_t *)rh; MAC_RING_UNMARK(rr_ring, MR_QUIESCE); return (0); } +/* + * Quiesce the HW ring pointed to by rh. Also see mac_hwring_activate(). + */ void -mac_hwring_stop(mac_ring_handle_t rh) +mac_hwring_quiesce(mac_ring_handle_t rh) { mac_ring_t *rr_ring = (mac_ring_t *)rh; @@ -1730,6 +1905,68 @@ mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr) } /* + * Program the group's HW VLAN filter if it has such support. + * Otherwise, the group will implicitly accept tagged traffic and + * there is nothing to do. + */ +int +mac_hwgroup_addvlan(mac_group_handle_t gh, uint16_t vid) +{ + mac_group_t *group = (mac_group_t *)gh; + + if (!MAC_GROUP_HW_VLAN(group)) + return (0); + + return (mac_group_addvlan(group, vid)); +} + +int +mac_hwgroup_remvlan(mac_group_handle_t gh, uint16_t vid) +{ + mac_group_t *group = (mac_group_t *)gh; + + if (!MAC_GROUP_HW_VLAN(group)) + return (0); + + return (mac_group_remvlan(group, vid)); +} + +/* + * Determine if a MAC has HW VLAN support. This is a private API + * consumed by aggr. In the future it might be nice to have a bitfield + * in mac_capab_rings_t to track which forms of HW filtering are + * supported by the MAC. 
+ */ +boolean_t +mac_has_hw_vlan(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + return (MAC_GROUP_HW_VLAN(mip->mi_rx_groups)); +} + +/* + * Get the number of Rx HW groups on this MAC. + */ +uint_t +mac_get_num_rx_groups(mac_handle_t mh) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + return (mip->mi_rx_group_count); +} + +int +mac_set_promisc(mac_handle_t mh, boolean_t value) +{ + mac_impl_t *mip = (mac_impl_t *)mh; + + ASSERT(MAC_PERIM_HELD(mh)); + return (i_mac_promisc_set(mip, value)); +} + +/* * Set the RX group to be shared/reserved. Note that the group must be * started/stopped outside of this function. */ @@ -2416,7 +2653,6 @@ mac_disable(mac_handle_t mh) /* * Called when the MAC instance has a non empty flow table, to de-multiplex * incoming packets to the right flow. - * The MAC's rw lock is assumed held as a READER. */ /* ARGSUSED */ static mblk_t * @@ -2426,19 +2662,6 @@ mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp) uint_t flags = FLOW_INBOUND; int err; - /* - * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN - * to mac_flow_lookup() so that the VLAN packets can be successfully - * passed to the non-VLAN aggregation flows. - * - * Note that there is possibly a race between this and - * mac_unicast_remove/add() and VLAN packets could be incorrectly - * classified to non-VLAN flows of non-aggregation mac clients. These - * VLAN packets will be then filtered out by the mac module. - */ - if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0) - flags |= FLOW_IGNORE_VLAN; - err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent); if (err != 0) { /* no registered receive function */ @@ -3772,9 +3995,27 @@ mac_start_group_and_rings(mac_group_t *group) for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) { ASSERT(ring->mr_state == MR_FREE); + if ((rv = mac_start_ring(ring)) != 0) goto error; - ring->mr_classify_type = MAC_SW_CLASSIFIER; + + /* + * When aggr_set_port_sdu() is called, it will remove + * the port client's unicast address. This will cause + * MAC to stop the default group's rings on the port + * MAC. After it modifies the SDU, it will then re-add + * the unicast address. At which time, this function is + * called to start the default group's rings. Normally + * this function would set the classify type to + * MAC_SW_CLASSIFIER; but that will break aggr which + * relies on the passthru classify mode being set for + * correct delivery (see mac_rx_common()). To avoid + * that, we check for a passthru callback and set the + * classify type to MAC_PASSTHRU_CLASSIFIER; as it was + * before the rings were stopped. + */ + ring->mr_classify_type = (ring->mr_pt_fn != NULL) ? + MAC_PASSTHRU_CLASSIFIER : MAC_SW_CLASSIFIER; } return (0); @@ -4077,12 +4318,15 @@ mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype) /* - * Driver must register group->mgi_addmac/remmac() for rx groups - * to support multiple MAC addresses. + * The driver must register some form of hardware MAC + * filter in order for Rx groups to support multiple + * MAC addresses. 
*/ if (rtype == MAC_RING_TYPE_RX && - ((group_info.mgi_addmac == NULL) || - (group_info.mgi_remmac == NULL))) { + (group_info.mgi_addmac == NULL || + group_info.mgi_remmac == NULL)) { + DTRACE_PROBE1(mac__init__rings__no__mac__filter, + char *, mip->mi_name); err = EINVAL; goto bail; } @@ -4129,8 +4373,9 @@ mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype) /* Update this group's status */ mac_set_group_state(group, MAC_GROUP_STATE_REGISTERED); - } else + } else { group->mrg_rings = NULL; + } ASSERT(ring_left == 0); @@ -4320,6 +4565,38 @@ mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype) } /* + * Associate the VLAN filter to the receive group. + */ +int +mac_group_addvlan(mac_group_t *group, uint16_t vlan) +{ + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_addvlan, !=, NULL); + + if (vlan > VLAN_ID_MAX) + return (EINVAL); + + vlan = MAC_VLAN_UNTAGGED_VID(vlan); + return (group->mrg_info.mgi_addvlan(group->mrg_info.mgi_driver, vlan)); +} + +/* + * Dissociate the VLAN from the receive group. + */ +int +mac_group_remvlan(mac_group_t *group, uint16_t vlan) +{ + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_remvlan, !=, NULL); + + if (vlan > VLAN_ID_MAX) + return (EINVAL); + + vlan = MAC_VLAN_UNTAGGED_VID(vlan); + return (group->mrg_info.mgi_remvlan(group->mrg_info.mgi_driver, vlan)); +} + +/* * Associate a MAC address with a receive group. * * The return value of this function should always be checked properly, because @@ -4335,8 +4612,8 @@ mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype) int mac_group_addmac(mac_group_t *group, const uint8_t *addr) { - ASSERT(group->mrg_type == MAC_RING_TYPE_RX); - ASSERT(group->mrg_info.mgi_addmac != NULL); + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_addmac, !=, NULL); return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr)); } @@ -4347,8 +4624,8 @@ mac_group_addmac(mac_group_t *group, const uint8_t *addr) int mac_group_remmac(mac_group_t *group, const uint8_t *addr) { - ASSERT(group->mrg_type == MAC_RING_TYPE_RX); - ASSERT(group->mrg_info.mgi_remmac != NULL); + VERIFY3S(group->mrg_type, ==, MAC_RING_TYPE_RX); + VERIFY3P(group->mrg_info.mgi_remmac, !=, NULL); return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr)); } @@ -4523,28 +4800,20 @@ i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index) switch (ring->mr_type) { case MAC_RING_TYPE_RX: /* - * Setup SRS on top of the new ring if the group is - * reserved for someones exclusive use. + * Setup an SRS on top of the new ring if the group is + * reserved for someone's exclusive use. */ if (group->mrg_state == MAC_GROUP_STATE_RESERVED) { - mac_client_impl_t *mcip; + mac_client_impl_t *mcip = MAC_GROUP_ONLY_CLIENT(group); - mcip = MAC_GROUP_ONLY_CLIENT(group); - /* - * Even though this group is reserved we migth still - * have multiple clients, i.e a VLAN shares the - * group with the primary mac client. 
- */ - if (mcip != NULL) { - flent = mcip->mci_flent; - ASSERT(flent->fe_rx_srs_cnt > 0); - mac_rx_srs_group_setup(mcip, flent, SRST_LINK); - mac_fanout_setup(mcip, flent, - MCIP_RESOURCE_PROPS(mcip), mac_rx_deliver, - mcip, NULL, NULL); - } else { - ring->mr_classify_type = MAC_SW_CLASSIFIER; - } + VERIFY3P(mcip, !=, NULL); + flent = mcip->mci_flent; + VERIFY3S(flent->fe_rx_srs_cnt, >, 0); + mac_rx_srs_group_setup(mcip, flent, SRST_LINK); + mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip), + mac_rx_deliver, mcip, NULL, NULL); + } else { + ring->mr_classify_type = MAC_SW_CLASSIFIER; } break; case MAC_RING_TYPE_TX: @@ -4570,7 +4839,7 @@ i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index) mcip = mgcp->mgc_client; flent = mcip->mci_flent; - is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR); + is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT); mac_srs = MCIP_TX_SRS(mcip); tx = &mac_srs->srs_tx; mac_tx_client_quiesce((mac_client_handle_t)mcip); @@ -4714,7 +4983,7 @@ i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring, mcip = MAC_GROUP_ONLY_CLIENT(group); ASSERT(mcip != NULL); - ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR); + ASSERT(mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT); mac_srs = MCIP_TX_SRS(mcip); ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_AGGR || mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR); @@ -4922,12 +5191,12 @@ mac_free_macaddr(mac_address_t *map) mac_impl_t *mip = map->ma_mip; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - ASSERT(mip->mi_addresses != NULL); - - map = mac_find_macaddr(mip, map->ma_addr); + VERIFY3P(mip->mi_addresses, !=, NULL); - ASSERT(map != NULL); - ASSERT(map->ma_nusers == 0); + VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr)); + VERIFY3P(map, !=, NULL); + VERIFY3S(map->ma_nusers, ==, 0); + VERIFY3P(map->ma_vlans, ==, NULL); if (map == mip->mi_addresses) { mip->mi_addresses = map->ma_next; @@ -4943,85 +5212,201 @@ mac_free_macaddr(mac_address_t *map) kmem_free(map, sizeof (mac_address_t)); } +static mac_vlan_t * +mac_find_vlan(mac_address_t *map, uint16_t vid) +{ + mac_vlan_t *mvp; + + for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next) { + if (mvp->mv_vid == vid) + return (mvp); + } + + return (NULL); +} + +static mac_vlan_t * +mac_add_vlan(mac_address_t *map, uint16_t vid) +{ + mac_vlan_t *mvp; + + /* + * We should never add the same {addr, VID} tuple more + * than once, but let's be sure. + */ + for (mvp = map->ma_vlans; mvp != NULL; mvp = mvp->mv_next) + VERIFY3U(mvp->mv_vid, !=, vid); + + /* Add the VLAN to the head of the VLAN list. */ + mvp = kmem_zalloc(sizeof (mac_vlan_t), KM_SLEEP); + mvp->mv_vid = vid; + mvp->mv_next = map->ma_vlans; + map->ma_vlans = mvp; + + return (mvp); +} + +static void +mac_rem_vlan(mac_address_t *map, mac_vlan_t *mvp) +{ + mac_vlan_t *pre; + + if (map->ma_vlans == mvp) { + map->ma_vlans = mvp->mv_next; + } else { + pre = map->ma_vlans; + while (pre->mv_next != mvp) { + pre = pre->mv_next; + + /* + * We've reached the end of the list without + * finding mvp. + */ + VERIFY3P(pre, !=, NULL); + } + pre->mv_next = mvp->mv_next; + } + + kmem_free(mvp, sizeof (mac_vlan_t)); +} + /* - * Add a MAC address reference for a client. If the desired MAC address - * exists, add a reference to it. Otherwise, add the new address by adding - * it to a reserved group or setting promiscuous mode. Won't try different - * group is the group is non-NULL, so the caller must explictly share - * default group when needed. 
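/*
 * Illustrative sketch (not part of this change): how the {addr, VID}
 * tuple list kept on each address record can be consulted.  An
 * address accepts a frame when the frame is untagged and the untagged
 * flag is set, or when its VID appears on the VLAN list.  The types
 * are simplified stand-ins for mac_address_t and mac_vlan_t.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

#define	SKETCH_VID_NONE	0

typedef struct sketch_vlan {
	uint16_t		sv_vid;
	struct sketch_vlan	*sv_next;
} sketch_vlan_t;

typedef struct {
	bool		sa_untagged;	/* accept untagged frames */
	sketch_vlan_t	*sa_vlans;	/* accepted VIDs */
} sketch_addr_t;

static bool
sketch_addr_accepts(const sketch_addr_t *addr, uint16_t vid)
{
	if (vid == SKETCH_VID_NONE)
		return (addr->sa_untagged);

	for (const sketch_vlan_t *v = addr->sa_vlans; v != NULL;
	    v = v->sv_next) {
		if (v->sv_vid == vid)
			return (true);
	}
	return (false);
}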
- * - * Note, the primary MAC address is initialized at registration time, so - * to add it to default group only need to activate it if its reference - * count is still zero. Also, some drivers may not have advertised RINGS - * capability. + * Create a new mac_address_t if this is the first use of the address + * or add a VID to an existing address. In either case, the + * mac_address_t acts as a list of {addr, VID} tuples where each tuple + * shares the same addr. If group is non-NULL then attempt to program + * the MAC's HW filters for this group. Otherwise, if group is NULL, + * then the MAC has no rings and there is nothing to program. */ int -mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr, - boolean_t use_hw) +mac_add_macaddr_vlan(mac_impl_t *mip, mac_group_t *group, uint8_t *addr, + uint16_t vid, boolean_t use_hw) { - mac_address_t *map; - int err = 0; - boolean_t allocated_map = B_FALSE; + mac_address_t *map; + mac_vlan_t *mvp; + int err = 0; + boolean_t allocated_map = B_FALSE; + boolean_t hw_mac = B_FALSE; + boolean_t hw_vlan = B_FALSE; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); - map = mac_find_macaddr(mip, mac_addr); + map = mac_find_macaddr(mip, addr); /* - * If the new MAC address has not been added. Allocate a new one - * and set it up. + * If this is the first use of this MAC address then allocate + * and initialize a new structure. */ if (map == NULL) { map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP); map->ma_len = mip->mi_type->mt_addr_length; - bcopy(mac_addr, map->ma_addr, map->ma_len); + bcopy(addr, map->ma_addr, map->ma_len); map->ma_nusers = 0; map->ma_group = group; map->ma_mip = mip; + map->ma_untagged = B_FALSE; - /* add the new MAC address to the head of the address list */ + /* Add the new MAC address to the head of the address list. */ map->ma_next = mip->mi_addresses; mip->mi_addresses = map; allocated_map = B_TRUE; } - ASSERT(map->ma_group == NULL || map->ma_group == group); + VERIFY(map->ma_group == NULL || map->ma_group == group); if (map->ma_group == NULL) map->ma_group = group; + if (vid == VLAN_ID_NONE) { + map->ma_untagged = B_TRUE; + mvp = NULL; + } else { + mvp = mac_add_vlan(map, vid); + } + + /* + * Set the VLAN HW filter if: + * + * o the MAC's VLAN HW filtering is enabled, and + * o the address does not currently rely on promisc mode. + * + * This is called even when the client specifies an untagged + * address (VLAN_ID_NONE) because some MAC providers require + * setting additional bits to accept untagged traffic when + * VLAN HW filtering is enabled. + */ + if (MAC_GROUP_HW_VLAN(group) && + map->ma_type != MAC_ADDRESS_TYPE_UNICAST_PROMISC) { + if ((err = mac_group_addvlan(group, vid)) != 0) + goto bail; + + hw_vlan = B_TRUE; + } + + VERIFY3S(map->ma_nusers, >=, 0); + map->ma_nusers++; + /* - * If the MAC address is already in use, simply account for the - * new client. + * If this MAC address already has a HW filter then simply + * increment the counter. */ - if (map->ma_nusers++ > 0) + if (map->ma_nusers > 1) return (0); /* + * All logic from here on out is executed during initial + * creation only. + */ + VERIFY3S(map->ma_nusers, ==, 1); + + /* * Activate this MAC address by adding it to the reserved group. 
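/*
 * Illustrative sketch (not part of this change): the reference-count
 * rule used above.  Only the first user of an address record goes on
 * to program hardware (or fall back to promiscuous mode); later users
 * simply take a hold.  Names are invented for illustration.
 */
#include <stdbool.h>

typedef struct {
	int	sa_nusers;	/* clients using this address */
} sketch_addr_ref_t;

/*
 * Returns true when the caller is the first user and must therefore
 * program the HW classifier.
 */
static bool
sketch_addr_hold(sketch_addr_ref_t *addr)
{
	addr->sa_nusers++;
	return (addr->sa_nusers == 1);
}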
*/ if (group != NULL) { - err = mac_group_addmac(group, (const uint8_t *)mac_addr); - if (err == 0) { - map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED; - return (0); + err = mac_group_addmac(group, (const uint8_t *)addr); + + /* + * If the driver is out of filters then we can + * continue and use promisc mode. For any other error, + * assume the driver is in a state where we can't + * program the filters or use promisc mode; so we must + * bail. + */ + if (err != 0 && err != ENOSPC) { + map->ma_nusers--; + goto bail; } + + hw_mac = (err == 0); + } + + if (hw_mac) { + map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED; + return (0); } /* * The MAC address addition failed. If the client requires a - * hardware classified MAC address, fail the operation. + * hardware classified MAC address, fail the operation. This + * feature is only used by sun4v vsw. */ - if (use_hw) { + if (use_hw && !hw_mac) { err = ENOSPC; + map->ma_nusers--; goto bail; } /* - * Try promiscuous mode. - * - * For drivers that don't advertise RINGS capability, do - * nothing for the primary address. + * If we reach this point then either the MAC doesn't have + * RINGS capability or we are out of MAC address HW filters. + * In any case we must put the MAC into promiscuous mode. + */ + VERIFY(group == NULL || !hw_mac); + + /* + * The one exception is the primary address. A non-RINGS + * driver filters the primary address by default; promisc mode + * is not needed. */ if ((group == NULL) && (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) { @@ -5030,53 +5415,76 @@ mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr, } /* - * Enable promiscuous mode in order to receive traffic - * to the new MAC address. + * Enable promiscuous mode in order to receive traffic to the + * new MAC address. All existing HW filters still send their + * traffic to their respective group/SRSes. But with promisc + * enabled all unknown traffic is delivered to the default + * group where it is SW classified via mac_rx_classify(). */ if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) { map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC; return (0); } - /* - * Free the MAC address that could not be added. Don't free - * a pre-existing address, it could have been the entry - * for the primary MAC address which was pre-allocated by - * mac_init_macaddr(), and which must remain on the list. - */ bail: - map->ma_nusers--; + if (hw_vlan) { + int err2 = mac_group_remvlan(group, vid); + + if (err2 != 0) { + cmn_err(CE_WARN, "Failed to remove VLAN %u from group" + " %d on MAC %s: %d.", vid, group->mrg_index, + mip->mi_name, err2); + } + } + + if (mvp != NULL) + mac_rem_vlan(map, mvp); + if (allocated_map) mac_free_macaddr(map); + return (err); } -/* - * Remove a reference to a MAC address. This may cause to remove the MAC - * address from an associated group or to turn off promiscuous mode. - * The caller needs to handle the failure properly. 
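/*
 * Illustrative sketch (not part of this change): the fallback ladder
 * used when programming a unicast filter above.  ENOSPC means "out of
 * HW filters, try promiscuous mode"; any other failure aborts.  The
 * callback parameters and names are invented, and the sketch assumes
 * a group (i.e. rings) exists.
 */
#include <errno.h>
#include <stdbool.h>

typedef enum {
	SKETCH_ADDR_CLASSIFIED,		/* HW filter programmed */
	SKETCH_ADDR_PROMISC		/* relying on promisc mode */
} sketch_addr_type_t;

static int
sketch_program_unicast(int (*hw_addmac)(void), int (*set_promisc)(void),
    bool require_hw, sketch_addr_type_t *typep)
{
	int err = hw_addmac();

	if (err == 0) {
		*typep = SKETCH_ADDR_CLASSIFIED;
		return (0);
	}
	if (err != ENOSPC)
		return (err);		/* driver is wedged; bail */
	if (require_hw)
		return (ENOSPC);	/* client insists on HW classification */

	/* Out of filters: fall back to promiscuous mode. */
	if ((err = set_promisc()) == 0)
		*typep = SKETCH_ADDR_PROMISC;
	return (err);
}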
- */ int -mac_remove_macaddr(mac_address_t *map) +mac_remove_macaddr_vlan(mac_address_t *map, uint16_t vid) { - mac_impl_t *mip = map->ma_mip; - int err = 0; + mac_vlan_t *mvp; + mac_impl_t *mip = map->ma_mip; + mac_group_t *group = map->ma_group; + int err = 0; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); + VERIFY3P(map, ==, mac_find_macaddr(mip, map->ma_addr)); + + if (vid == VLAN_ID_NONE) { + map->ma_untagged = B_FALSE; + mvp = NULL; + } else { + mvp = mac_find_vlan(map, vid); + VERIFY3P(mvp, !=, NULL); + } + + if (MAC_GROUP_HW_VLAN(group) && + map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED && + ((err = mac_group_remvlan(group, vid)) != 0)) + return (err); - ASSERT(map == mac_find_macaddr(mip, map->ma_addr)); + if (mvp != NULL) + mac_rem_vlan(map, mvp); /* * If it's not the last client using this MAC address, only update * the MAC clients count. */ - if (--map->ma_nusers > 0) + map->ma_nusers--; + if (map->ma_nusers > 0) return (0); /* - * The MAC address is no longer used by any MAC client, so remove - * it from its associated group, or turn off promiscuous mode - * if it was enabled for the MAC address. + * The MAC address is no longer used by any MAC client, so + * remove it from its associated group. Turn off promiscuous + * mode if this is the last address relying on it. */ switch (map->ma_type) { case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED: @@ -5084,18 +5492,44 @@ mac_remove_macaddr(mac_address_t *map) * Don't free the preset primary address for drivers that * don't advertise RINGS capability. */ - if (map->ma_group == NULL) + if (group == NULL) return (0); - err = mac_group_remmac(map->ma_group, map->ma_addr); - if (err == 0) - map->ma_group = NULL; + if ((err = mac_group_remmac(group, map->ma_addr)) != 0) { + if (vid == VLAN_ID_NONE) + map->ma_untagged = B_TRUE; + else + (void) mac_add_vlan(map, vid); + + /* + * If we fail to remove the MAC address HW + * filter but then also fail to re-add the + * VLAN HW filter then we are in a busted + * state and should just crash. + */ + if (MAC_GROUP_HW_VLAN(group)) { + int err2; + + err2 = mac_group_addvlan(group, vid); + if (err2 != 0) { + cmn_err(CE_WARN, "Failed to readd VLAN" + " %u to group %d on MAC %s: %d.", + vid, group->mrg_index, mip->mi_name, + err2); + } + } + + return (err); + } + + map->ma_group = NULL; break; case MAC_ADDRESS_TYPE_UNICAST_PROMISC: err = i_mac_promisc_set(mip, B_FALSE); break; default: - ASSERT(B_FALSE); + panic("Unexpected ma_type 0x%x, file: %s, line %d", + map->ma_type, __FILE__, __LINE__); } if (err != 0) @@ -5252,8 +5686,9 @@ mac_fini_macaddr(mac_impl_t *mip) * If mi_addresses is initialized, there should be exactly one * entry left on the list with no users. */ - ASSERT(map->ma_nusers == 0); - ASSERT(map->ma_next == NULL); + VERIFY3S(map->ma_nusers, ==, 0); + VERIFY3P(map->ma_next, ==, NULL); + VERIFY3P(map->ma_vlans, ==, NULL); kmem_free(map, sizeof (mac_address_t)); mip->mi_addresses = NULL; @@ -5815,7 +6250,7 @@ mac_stop_logusage(mac_logtype_t type) mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_walker, &estate); (void) untimeout(mac_logging_timer); - mac_logging_timer = 0; + mac_logging_timer = NULL; /* Write log entries for each mac_impl in the list */ i_mac_log_info(&net_log_list, &lstate); @@ -5933,7 +6368,7 @@ mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring) } /* - * For a reserved group with multiple clients, return the primary client. + * For a non-default group with multiple clients, return the primary client. 
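/*
 * Illustrative sketch (not part of this change): the unwind pattern
 * used in mac_remove_macaddr_vlan() above.  The VLAN filter comes out
 * first; if removing the unicast filter then fails, the VLAN state is
 * put back so the data path stays consistent.  All names are invented
 * stand-ins.
 */
#include <stdio.h>

static int
sketch_remove_filters(int (*remvlan)(void), int (*addvlan)(void),
    int (*remmac)(void))
{
	int err;

	if ((err = remvlan()) != 0)
		return (err);

	if ((err = remmac()) != 0) {
		/*
		 * Roll back: re-add the VLAN filter.  If that also
		 * fails the state is inconsistent and can only be
		 * reported.
		 */
		if (addvlan() != 0)
			(void) fprintf(stderr, "failed to restore VLAN\n");
		return (err);
	}
	return (0);
}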
*/ static mac_client_impl_t * mac_get_grp_primary(mac_group_t *grp) @@ -6292,13 +6727,12 @@ mac_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip) break; } - VERIFY(mgcp == NULL); + ASSERT(mgcp == NULL); mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP); mgcp->mgc_client = mcip; mgcp->mgc_next = grp->mrg_clients; grp->mrg_clients = mgcp; - } void @@ -6319,8 +6753,27 @@ mac_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip) } /* - * mac_reserve_rx_group() - * + * Return true if any client on this group explicitly asked for HW + * rings (of type mask) or have a bound share. + */ +static boolean_t +i_mac_clients_hw(mac_group_t *grp, uint32_t mask) +{ + mac_grp_client_t *mgcip; + mac_client_impl_t *mcip; + mac_resource_props_t *mrp; + + for (mgcip = grp->mrg_clients; mgcip != NULL; mgcip = mgcip->mgc_next) { + mcip = mgcip->mgc_client; + mrp = MCIP_RESOURCE_PROPS(mcip); + if (mcip->mci_share != NULL || (mrp->mrp_mask & mask) != 0) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* * Finds an available group and exclusively reserves it for a client. * The group is chosen to suit the flow's resource controls (bandwidth and * fanout requirements) and the address type. @@ -6343,7 +6796,6 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) int need_rings = 0; mac_group_t *candidate_grp = NULL; mac_client_impl_t *gclient; - mac_resource_props_t *gmrp; mac_group_t *donorgrp = NULL; boolean_t rxhw = mrp->mrp_mask & MRP_RX_RINGS; boolean_t unspec = mrp->mrp_mask & MRP_RXRINGS_UNSPEC; @@ -6354,18 +6806,20 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC; /* - * Check if a group already has this mac address (case of VLANs) + * Check if a group already has this MAC address (case of VLANs) * unless we are moving this MAC client from one group to another. */ if (!move && (map = mac_find_macaddr(mip, mac_addr)) != NULL) { if (map->ma_group != NULL) return (map->ma_group); } + if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0) return (NULL); + /* - * If exclusive open, return NULL which will enable the - * caller to use the default group. + * If this client is requesting exclusive MAC access then + * return NULL to ensure the client uses the default group. */ if (mcip->mci_state_flags & MCIS_EXCLUSIVE) return (NULL); @@ -6375,6 +6829,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) { mrp->mrp_nrxrings = 1; } + /* * For static grouping we allow only specifying rings=0 and * unspecified @@ -6383,6 +6838,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) mip->mi_rx_group_type == MAC_GROUP_TYPE_STATIC) { return (NULL); } + if (rxhw) { /* * We have explicitly asked for a group (with nrxrings, @@ -6444,25 +6900,19 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) * that didn't ask for an exclusive group, but got * one and it has enough rings (combined with what * the donor group can donate) for the new MAC - * client + * client. */ if (grp->mrg_state >= MAC_GROUP_STATE_RESERVED) { /* - * If the primary/donor group is not the default - * group, don't bother looking for a candidate group. - * If we don't have enough rings we will check - * if the primary group can be vacated. + * If the donor group is not the default + * group, don't bother looking for a candidate + * group. 
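/*
 * Illustrative sketch (not part of this change): the predicate behind
 * i_mac_clients_hw() above, in stand-alone form.  A group may only be
 * borrowed from its current clients when none of them explicitly
 * asked for HW rings or holds a bound share.  The types are simplified
 * stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

typedef struct sketch_client {
	void			*sc_share;	/* non-NULL if bound share */
	uint32_t		sc_prop_mask;	/* requested resources */
	struct sketch_client	*sc_next;
} sketch_client_t;

static bool
sketch_clients_want_hw(const sketch_client_t *clients, uint32_t ring_mask)
{
	for (const sketch_client_t *c = clients; c != NULL; c = c->sc_next) {
		if (c->sc_share != NULL || (c->sc_prop_mask & ring_mask) != 0)
			return (true);
	}
	return (false);
}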
If we don't have enough rings we + * will check if the primary group can be + * vacated. */ if (candidate_grp == NULL && donorgrp == MAC_DEFAULT_RX_GROUP(mip)) { - ASSERT(!MAC_GROUP_NO_CLIENT(grp)); - gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); - ASSERT(gclient != NULL); - gmrp = MCIP_RESOURCE_PROPS(gclient); - if (gclient->mci_share == NULL && - (gmrp->mrp_mask & MRP_RX_RINGS) == 0 && + if (!i_mac_clients_hw(grp, MRP_RX_RINGS) && (unspec || (grp->mrg_cur_count + donor_grp_rcnt >= need_rings))) { @@ -6528,6 +6978,7 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) */ mac_stop_group(grp); } + /* We didn't find an exclusive group for this MAC client */ if (i >= mip->mi_rx_group_count) { @@ -6535,12 +6986,12 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) return (NULL); /* - * If we found a candidate group then we switch the - * MAC client from the candidate_group to the default - * group and give the group to this MAC client. If - * we didn't find a candidate_group, check if the - * primary is in its own group and if it can make way - * for this MAC client. + * If we found a candidate group then move the + * existing MAC client from the candidate_group to the + * default group and give the candidate_group to the + * new MAC client. If we didn't find a candidate + * group, then check if the primary is in its own + * group and if it can make way for this MAC client. */ if (candidate_grp == NULL && donorgrp != MAC_DEFAULT_RX_GROUP(mip) && @@ -6551,15 +7002,15 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) boolean_t prim_grp = B_FALSE; /* - * Switch the MAC client from the candidate group - * to the default group.. If this group was the - * donor group, then after the switch we need - * to update the donor group too. + * Switch the existing MAC client from the + * candidate group to the default group. If + * the candidate group is the donor group, + * then after the switch we need to update the + * donor group too. */ grp = candidate_grp; - gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); + gclient = grp->mrg_clients->mgc_client; + VERIFY3P(gclient, !=, NULL); if (grp == mip->mi_rx_donor_grp) prim_grp = B_TRUE; if (mac_rx_switch_group(gclient, grp, @@ -6572,7 +7023,6 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) donorgrp = MAC_DEFAULT_RX_GROUP(mip); } - /* * Now give this group with the required rings * to this MAC client. @@ -6620,10 +7070,10 @@ mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr, boolean_t move) /* * mac_rx_release_group() * - * This is called when there are no clients left for the group. - * The group is stopped and marked MAC_GROUP_STATE_REGISTERED, - * and if it is a non default group, the shares are removed and - * all rings are assigned back to default group. + * Release the group when it has no remaining clients. The group is + * stopped and its shares are removed and all rings are assigned back + * to default group. This should never be called against the default + * group. 
*/ void mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group) @@ -6632,6 +7082,7 @@ mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group) mac_ring_t *ring; ASSERT(group != MAC_DEFAULT_RX_GROUP(mip)); + ASSERT(MAC_GROUP_NO_CLIENT(group) == B_TRUE); if (mip->mi_rx_donor_grp == group) mip->mi_rx_donor_grp = MAC_DEFAULT_RX_GROUP(mip); @@ -6683,56 +7134,7 @@ mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group) } /* - * When we move the primary's mac address between groups, we need to also - * take all the clients sharing the same mac address along with it (VLANs) - * We remove the mac address for such clients from the group after quiescing - * them. When we add the mac address we restart the client. Note that - * the primary's mac address is removed from the group after all the - * other clients sharing the address are removed. Similarly, the primary's - * mac address is added before all the other client's mac address are - * added. While grp is the group where the clients reside, tgrp is - * the group where the addresses have to be added. - */ -static void -mac_rx_move_macaddr_prim(mac_client_impl_t *mcip, mac_group_t *grp, - mac_group_t *tgrp, uint8_t *maddr, boolean_t add) -{ - mac_impl_t *mip = mcip->mci_mip; - mac_grp_client_t *mgcp = grp->mrg_clients; - mac_client_impl_t *gmcip; - boolean_t prim; - - prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; - - /* - * If the clients are in a non-default group, we just have to - * walk the group's client list. If it is in the default group - * (which will be shared by other clients as well, we need to - * check if the unicast address matches mcip's unicast. - */ - while (mgcp != NULL) { - gmcip = mgcp->mgc_client; - if (gmcip != mcip && - (grp != MAC_DEFAULT_RX_GROUP(mip) || - mcip->mci_unicast == gmcip->mci_unicast)) { - if (!add) { - mac_rx_client_quiesce( - (mac_client_handle_t)gmcip); - (void) mac_remove_macaddr(mcip->mci_unicast); - } else { - (void) mac_add_macaddr(mip, tgrp, maddr, prim); - mac_rx_client_restart( - (mac_client_handle_t)gmcip); - } - } - mgcp = mgcp->mgc_next; - } -} - - -/* - * Move the MAC address from fgrp to tgrp. If this is the primary client, - * we need to take any VLANs etc. together too. + * Move the MAC address from fgrp to tgrp. */ static int mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp, @@ -6741,56 +7143,86 @@ mac_rx_move_macaddr(mac_client_impl_t *mcip, mac_group_t *fgrp, mac_impl_t *mip = mcip->mci_mip; uint8_t maddr[MAXMACADDRLEN]; int err = 0; - boolean_t prim; - boolean_t multiclnt = B_FALSE; + uint16_t vid; + mac_unicast_impl_t *muip; + boolean_t use_hw; mac_rx_client_quiesce((mac_client_handle_t)mcip); - ASSERT(mcip->mci_unicast != NULL); + VERIFY3P(mcip->mci_unicast, !=, NULL); bcopy(mcip->mci_unicast->ma_addr, maddr, mcip->mci_unicast->ma_len); - prim = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; - if (mcip->mci_unicast->ma_nusers > 1) { - mac_rx_move_macaddr_prim(mcip, fgrp, NULL, maddr, B_FALSE); - multiclnt = B_TRUE; - } - ASSERT(mcip->mci_unicast->ma_nusers == 1); - err = mac_remove_macaddr(mcip->mci_unicast); + /* + * Does the client require MAC address hardware classifiction? + */ + use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; + vid = i_mac_flow_vid(mcip->mci_flent); + + /* + * You can never move an address that is shared by multiple + * clients. mac_datapath_setup() ensures that clients sharing + * an address are placed on the default group. 
This guarantees + * that a non-default group will only ever have one client and + * thus make full use of HW filters. + */ + if (mac_check_macaddr_shared(mcip->mci_unicast)) + return (EINVAL); + + err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid); + if (err != 0) { mac_rx_client_restart((mac_client_handle_t)mcip); - if (multiclnt) { - mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr, - B_TRUE); - } return (err); } + + /* + * If this isn't the primary MAC address then the + * mac_address_t has been freed by the last call to + * mac_remove_macaddr_vlan(). In any case, NULL the reference + * to avoid a dangling pointer. + */ + mcip->mci_unicast = NULL; + /* - * Program the H/W Classifier first, if this fails we need - * not proceed with the other stuff. + * We also have to NULL all the mui_map references -- sun4v + * strikes again! */ - if ((err = mac_add_macaddr(mip, tgrp, maddr, prim)) != 0) { + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + for (muip = mcip->mci_unicast_list; muip != NULL; muip = muip->mui_next) + muip->mui_map = NULL; + rw_exit(&mcip->mci_rw_lock); + + /* + * Program the H/W Classifier first, if this fails we need not + * proceed with the other stuff. + */ + if ((err = mac_add_macaddr_vlan(mip, tgrp, maddr, vid, use_hw)) != 0) { + int err2; + /* Revert back the H/W Classifier */ - if ((err = mac_add_macaddr(mip, fgrp, maddr, prim)) != 0) { - /* - * This should not fail now since it worked earlier, - * should we panic? - */ - cmn_err(CE_WARN, - "mac_rx_switch_group: switching %p back" - " to group %p failed!!", (void *)mcip, - (void *)fgrp); + err2 = mac_add_macaddr_vlan(mip, fgrp, maddr, vid, use_hw); + + if (err2 != 0) { + cmn_err(CE_WARN, "Failed to revert HW classification" + " on MAC %s, for client %s: %d.", mip->mi_name, + mcip->mci_name, err2); } + mac_rx_client_restart((mac_client_handle_t)mcip); - if (multiclnt) { - mac_rx_move_macaddr_prim(mcip, fgrp, fgrp, maddr, - B_TRUE); - } return (err); } + + /* + * Get a reference to the new mac_address_t and update the + * client's reference. Then restart the client and add the + * other clients of this MAC addr (if they exsit). + */ mcip->mci_unicast = mac_find_macaddr(mip, maddr); + rw_enter(&mcip->mci_rw_lock, RW_WRITER); + for (muip = mcip->mci_unicast_list; muip != NULL; muip = muip->mui_next) + muip->mui_map = mcip->mci_unicast; + rw_exit(&mcip->mci_rw_lock); mac_rx_client_restart((mac_client_handle_t)mcip); - if (multiclnt) - mac_rx_move_macaddr_prim(mcip, fgrp, tgrp, maddr, B_TRUE); - return (err); + return (0); } /* @@ -6811,19 +7243,34 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, mac_impl_t *mip = mcip->mci_mip; mac_grp_client_t *mgcp; - ASSERT(fgrp == mcip->mci_flent->fe_rx_ring_group); + VERIFY3P(fgrp, ==, mcip->mci_flent->fe_rx_ring_group); if ((err = mac_rx_move_macaddr(mcip, fgrp, tgrp)) != 0) return (err); /* - * The group might be reserved, but SRSs may not be set up, e.g. - * primary and its vlans using a reserved group. + * If the group is marked as reserved and in use by a single + * client, then there is an SRS to teardown. */ if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED && MAC_GROUP_ONLY_CLIENT(fgrp) != NULL) { mac_rx_srs_group_teardown(mcip->mci_flent, B_TRUE); } + + /* + * If we are moving the client from a non-default group, then + * we know that any additional clients on this group share the + * same MAC address. Since we moved the MAC address filter, we + * need to move these clients too. 
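/*
 * Illustrative sketch (not part of this change): once the old address
 * record is freed by the remove above, every per-VID handle that
 * pointed at it must be NULLed and later repointed at the record
 * created by the re-add, so no handle is left dangling across the
 * move.  The types are stand-ins for mac_unicast_impl_t and its map.
 */
#include <stddef.h>

struct sketch_addr_rec;				/* opaque address record */

typedef struct sketch_handle {
	struct sketch_addr_rec	*sh_map;	/* current address record */
	struct sketch_handle	*sh_next;
} sketch_handle_t;

static void
sketch_repoint_handles(sketch_handle_t *list, struct sketch_addr_rec *newmap)
{
	/* Pass NULL while the record is gone; the new record afterwards. */
	for (sketch_handle_t *h = list; h != NULL; h = h->sh_next)
		h->sh_map = newmap;
}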
+ * + * If we are moving the client from the default group and its + * MAC address has VLAN clients, then we must move those + * clients as well. + * + * In both cases the idea is the same: we moved the MAC + * address filter to the tgrp, so we must move all clients + * using that MAC address to tgrp as well. + */ if (fgrp != MAC_DEFAULT_RX_GROUP(mip)) { mgcp = fgrp->mrg_clients; while (mgcp != NULL) { @@ -6834,20 +7281,21 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, gmcip->mci_flent->fe_rx_ring_group = tgrp; } mac_release_rx_group(mcip, fgrp); - ASSERT(MAC_GROUP_NO_CLIENT(fgrp)); + VERIFY3B(MAC_GROUP_NO_CLIENT(fgrp), ==, B_TRUE); mac_set_group_state(fgrp, MAC_GROUP_STATE_REGISTERED); } else { mac_group_remove_client(fgrp, mcip); mac_group_add_client(tgrp, mcip); mcip->mci_flent->fe_rx_ring_group = tgrp; + /* * If there are other clients (VLANs) sharing this address - * we should be here only for the primary. + * then move them too. */ - if (mcip->mci_unicast->ma_nusers > 1) { + if (mac_check_macaddr_shared(mcip->mci_unicast)) { /* * We need to move all the clients that are using - * this h/w address. + * this MAC address. */ mgcp = fgrp->mrg_clients; while (mgcp != NULL) { @@ -6861,20 +7309,24 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, } } } + /* - * The default group will still take the multicast, - * broadcast traffic etc., so it won't go to + * The default group still handles multicast and + * broadcast traffic; it won't transition to * MAC_GROUP_STATE_REGISTERED. */ if (fgrp->mrg_state == MAC_GROUP_STATE_RESERVED) mac_rx_group_unmark(fgrp, MR_CONDEMNED); mac_set_group_state(fgrp, MAC_GROUP_STATE_SHARED); } + next_state = mac_group_next_state(tgrp, &group_only_mcip, MAC_DEFAULT_RX_GROUP(mip), B_TRUE); mac_set_group_state(tgrp, next_state); + /* - * If the destination group is reserved, setup the SRSs etc. + * If the destination group is reserved, then setup the SRSes. + * Otherwise make sure to use SW classification. */ if (tgrp->mrg_state == MAC_GROUP_STATE_RESERVED) { mac_rx_srs_group_setup(mcip, mcip->mci_flent, SRST_LINK); @@ -6885,6 +7337,7 @@ mac_rx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, } else { mac_rx_switch_grp_to_sw(tgrp); } + return (0); } @@ -6915,6 +7368,7 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move) boolean_t isprimary; isprimary = mcip->mci_flent->fe_type & FLOW_PRIMARY_MAC; + /* * When we come here for a VLAN on the primary (dladm create-vlan), * we need to pair it along with the primary (to keep it consistent @@ -6996,8 +7450,7 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move) if (grp->mrg_state == MAC_GROUP_STATE_RESERVED && candidate_grp == NULL) { gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); + VERIFY3P(gclient, !=, NULL); gmrp = MCIP_RESOURCE_PROPS(gclient); if (gclient->mci_share == NULL && (gmrp->mrp_mask & MRP_TX_RINGS) == 0 && @@ -7034,13 +7487,14 @@ mac_reserve_tx_group(mac_client_impl_t *mcip, boolean_t move) */ if (need_exclgrp && candidate_grp != NULL) { /* - * Switch the MAC client from the candidate group - * to the default group. + * Switch the MAC client from the candidate + * group to the default group. We know the + * candidate_grp came from a reserved group + * and thus only has one client. 
*/ grp = candidate_grp; gclient = MAC_GROUP_ONLY_CLIENT(grp); - if (gclient == NULL) - gclient = mac_get_grp_primary(grp); + VERIFY3P(gclient, !=, NULL); mac_tx_client_quiesce((mac_client_handle_t)gclient); mac_tx_switch_group(gclient, grp, defgrp); mac_tx_client_restart((mac_client_handle_t)gclient); @@ -7208,7 +7662,7 @@ mac_tx_switch_group(mac_client_impl_t *mcip, mac_group_t *fgrp, */ mac_group_remove_client(fgrp, mcip); mac_tx_dismantle_soft_rings(fgrp, flent); - if (mcip->mci_unicast->ma_nusers > 1) { + if (mac_check_macaddr_shared(mcip->mci_unicast)) { mgcp = fgrp->mrg_clients; while (mgcp != NULL) { gmcip = mgcp->mgc_client; @@ -7454,7 +7908,7 @@ mac_no_active(mac_handle_t mh) * changes and update the mac_resource_props_t for the VLAN's client. * We need to do this since we don't support setting these properties * on the primary's VLAN clients, but the VLAN clients have to - * follow the primary w.r.t the rings property; + * follow the primary w.r.t the rings property. */ void mac_set_prim_vlan_rings(mac_impl_t *mip, mac_resource_props_t *mrp) @@ -7603,13 +8057,10 @@ mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group, MAC_GROUP_STATE_RESERVED) { continue; } - mcip = MAC_GROUP_ONLY_CLIENT(tgrp); - if (mcip == NULL) - mcip = mac_get_grp_primary(tgrp); - ASSERT(mcip != NULL); - mrp = MCIP_RESOURCE_PROPS(mcip); - if ((mrp->mrp_mask & MRP_RX_RINGS) != 0) + if (i_mac_clients_hw(tgrp, MRP_RX_RINGS)) continue; + mcip = tgrp->mrg_clients->mgc_client; + VERIFY3P(mcip, !=, NULL); if ((tgrp->mrg_cur_count + defgrp->mrg_cur_count) < (modify + 1)) { continue; @@ -7624,12 +8075,10 @@ mac_group_ring_modify(mac_client_impl_t *mcip, mac_group_t *group, MAC_GROUP_STATE_RESERVED) { continue; } - mcip = MAC_GROUP_ONLY_CLIENT(tgrp); - if (mcip == NULL) - mcip = mac_get_grp_primary(tgrp); - mrp = MCIP_RESOURCE_PROPS(mcip); - if ((mrp->mrp_mask & MRP_TX_RINGS) != 0) + if (i_mac_clients_hw(tgrp, MRP_TX_RINGS)) continue; + mcip = tgrp->mrg_clients->mgc_client; + VERIFY3P(mcip, !=, NULL); if ((tgrp->mrg_cur_count + defgrp->mrg_cur_count) < (modify + 1)) { continue; @@ -7899,10 +8348,10 @@ mac_pool_event_cb(pool_event_t what, poolid_t id, void *arg) * Set effective rings property. This could be called from datapath_setup/ * datapath_teardown or set-linkprop. * If the group is reserved we just go ahead and set the effective rings. - * Additionally, for TX this could mean the default group has lost/gained + * Additionally, for TX this could mean the default group has lost/gained * some rings, so if the default group is reserved, we need to adjust the * effective rings for the default group clients. For RX, if we are working - * with the non-default group, we just need * to reset the effective props + * with the non-default group, we just need to reset the effective props * for the default group clients. */ void @@ -8032,6 +8481,7 @@ mac_check_primary_relocation(mac_client_impl_t *mcip, boolean_t rxhw) * the first non-primary. */ ASSERT(mip->mi_nactiveclients == 2); + /* * OK, now we have the primary that needs to be relocated. */ diff --git a/usr/src/uts/common/io/mac/mac_bcast.c b/usr/src/uts/common/io/mac/mac_bcast.c index 1ff33c3578..3b674be1d0 100644 --- a/usr/src/uts/common/io/mac/mac_bcast.c +++ b/usr/src/uts/common/io/mac/mac_bcast.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/types.h> @@ -146,7 +147,7 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) uint64_t gen; uint_t i; mblk_t *mp_chain1; - flow_entry_t *flent; + flow_entry_t *flent; int err; rw_enter(&mip->mi_rw_lock, RW_READER); @@ -182,13 +183,6 @@ mac_bcast_send(void *arg1, void *arg2, mblk_t *mp_chain, boolean_t is_loopback) */ if ((mp_chain1 = mac_copymsgchain_cksum(mp_chain)) == NULL) break; - /* - * Fix the checksum for packets originating - * from the local machine. - */ - if ((src_mcip != NULL) && - (mp_chain1 = mac_fix_cksum(mp_chain1)) == NULL) - break; FLOW_TRY_REFHOLD(flent, err); if (err != 0) { diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index efc3cb7f07..de5ef6121f 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2017 RackTop Systems. */ @@ -114,6 +114,7 @@ #include <sys/stream.h> #include <sys/strsun.h> #include <sys/strsubr.h> +#include <sys/pattr.h> #include <sys/dlpi.h> #include <sys/modhash.h> #include <sys/mac_impl.h> @@ -865,9 +866,12 @@ mac_unicast_update_client_flow(mac_client_impl_t *mcip) mac_protect_update_mac_token(mcip); /* - * A MAC client could have one MAC address but multiple - * VLANs. In that case update the flow entries corresponding - * to all VLANs of the MAC client. + * When there are multiple VLANs sharing the same MAC address, + * each gets its own MAC client, except when running on sun4v + * vsw. In that case the mci_flent_list is used to place + * multiple VLAN flows on one MAC client. If we ever get rid + * of vsw then this code can go, but until then we need to + * update all flow entries. */ for (flent = mcip->mci_flent_list; flent != NULL; flent = flent->fe_client_next) { @@ -1025,7 +1029,7 @@ mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr) return (0); } - if (mac_find_macaddr(mip, (uint8_t *)addr) != 0) { + if (mac_find_macaddr(mip, (uint8_t *)addr) != NULL) { i_mac_perim_exit(mip); return (EBUSY); } @@ -1040,9 +1044,9 @@ mac_unicast_primary_set(mac_handle_t mh, const uint8_t *addr) mac_capab_aggr_t aggr_cap; /* - * If the mac is an aggregation, other than the unicast + * If the MAC is an aggregation, other than the unicast * addresses programming, aggr must be informed about this - * primary unicst address change to change its mac address + * primary unicst address change to change its MAC address * policy to be user-specified. 
*/ ASSERT(map->ma_type == MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED); @@ -1353,7 +1357,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_mip = mip; mcip->mci_upper_mip = NULL; - mcip->mci_rx_fn = mac_pkt_drop; + mcip->mci_rx_fn = mac_rx_def; mcip->mci_rx_arg = NULL; mcip->mci_rx_p_fn = NULL; mcip->mci_rx_p_arg = NULL; @@ -1374,7 +1378,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_state_flags |= MCIS_IS_AGGR_PORT; if (mip->mi_state_flags & MIS_IS_AGGR) - mcip->mci_state_flags |= MCIS_IS_AGGR; + mcip->mci_state_flags |= MCIS_IS_AGGR_CLIENT; if ((flags & MAC_OPEN_FLAGS_USE_DATALINK_NAME) != 0) { datalink_id_t linkid; @@ -1433,6 +1437,7 @@ mac_client_open(mac_handle_t mh, mac_client_handle_t *mchp, char *name, mcip->mci_flent = flent; FLOW_MARK(flent, FE_MC_NO_DATAPATH); flent->fe_mcip = mcip; + /* * Place initial creation reference on the flow. This reference * is released in the corresponding delete action viz. @@ -1539,7 +1544,8 @@ mac_client_close(mac_client_handle_t mch, uint16_t flags) } /* - * Set the rx bypass receive callback. + * Set the Rx bypass receive callback and return B_TRUE. Return + * B_FALSE if it's not possible to enable bypass. */ boolean_t mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1) @@ -1550,11 +1556,11 @@ mac_rx_bypass_set(mac_client_handle_t mch, mac_direct_rx_t rx_fn, void *arg1) ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); /* - * If the mac_client is a VLAN, we should not do DLS bypass and - * instead let the packets come up via mac_rx_deliver so the vlan - * header can be stripped. + * If the client has more than one VLAN then process packets + * through DLS. This should happen only when sun4v vsw is on + * the scene. */ - if (mcip->mci_nvids > 0) + if (mcip->mci_nvids > 1) return (B_FALSE); /* @@ -1608,8 +1614,8 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) i_mac_perim_exit(mip); /* - * If we're changing the rx function on the primary mac of a vnic, - * make sure any secondary macs on the vnic are updated as well. + * If we're changing the Rx function on the primary MAC of a VNIC, + * make sure any secondary addresses on the VNIC are updated as well. */ if (umip != NULL) { ASSERT((umip->mi_state_flags & MIS_IS_VNIC) != 0); @@ -1623,7 +1629,7 @@ mac_rx_set(mac_client_handle_t mch, mac_rx_t rx_fn, void *arg) void mac_rx_clear(mac_client_handle_t mch) { - mac_rx_set(mch, mac_pkt_drop, NULL); + mac_rx_set(mch, mac_rx_def, NULL); } void @@ -1787,6 +1793,14 @@ mac_client_set_rings_prop(mac_client_impl_t *mcip, mac_resource_props_t *mrp, } /* Let check if we can give this an excl group */ } else if (group == defgrp) { + /* + * If multiple clients share an + * address then they must stay on the + * default group. + */ + if (mac_check_macaddr_shared(mcip->mci_unicast)) + return (0); + ngrp = mac_reserve_rx_group(mcip, mac_addr, B_TRUE); /* Couldn't give it a group, that's fine */ @@ -1809,6 +1823,16 @@ mac_client_set_rings_prop(mac_client_impl_t *mcip, mac_resource_props_t *mrp, } if (group == defgrp && ((mrp->mrp_nrxrings > 0) || unspec)) { + /* + * We are requesting Rx rings. Try to reserve + * a non-default group. + * + * If multiple clients share an address then + * they must stay on the default group. 
+ */ + if (mac_check_macaddr_shared(mcip->mci_unicast)) + return (EINVAL); + ngrp = mac_reserve_rx_group(mcip, mac_addr, B_TRUE); if (ngrp == NULL) return (ENOSPC); @@ -2166,10 +2190,10 @@ mac_unicast_flow_create(mac_client_impl_t *mcip, uint8_t *mac_addr, flent_flags = FLOW_VNIC_MAC; /* - * For the first flow we use the mac client's name - mci_name, for - * subsequent ones we just create a name with the vid. This is + * For the first flow we use the MAC client's name - mci_name, for + * subsequent ones we just create a name with the VID. This is * so that we can add these flows to the same flow table. This is - * fine as the flow name (except for the one with the mac client's + * fine as the flow name (except for the one with the MAC client's * name) is not visible. When the first flow is removed, we just replace * its fdesc with another from the list, so we will still retain the * flent with the MAC client's flow name. @@ -2327,6 +2351,7 @@ mac_client_datapath_setup(mac_client_impl_t *mcip, uint16_t vid, * The unicast MAC address must have been added successfully. */ ASSERT(mcip->mci_unicast != NULL); + /* * Push down the sub-flows that were defined on this link * hitherto. The flows are added to the active flow table @@ -2338,15 +2363,23 @@ mac_client_datapath_setup(mac_client_impl_t *mcip, uint16_t vid, ASSERT(!no_unicast); /* - * A unicast flow already exists for that MAC client, - * this flow must be the same mac address but with - * different VID. It has been checked by mac_addr_in_use(). + * A unicast flow already exists for that MAC client + * so this flow must be the same MAC address but with + * a different VID. It has been checked by + * mac_addr_in_use(). + * + * We will use the SRS etc. from the initial + * mci_flent. We don't need to create a kstat for + * this, as except for the fdesc, everything will be + * used from the first flent. * - * We will use the SRS etc. from the mci_flent. Note that - * We don't need to create kstat for this as except for - * the fdesc, everything will be used from in the 1st flent. + * The only time we should see multiple flents on the + * same MAC client is on the sun4v vsw. If we removed + * that code we should be able to remove the entire + * notion of multiple flents on a MAC client (this + * doesn't affect sub/user flows because they have + * their own list unrelated to mci_flent_list). */ - if (bcmp(mac_addr, map->ma_addr, map->ma_len) != 0) { err = EINVAL; goto bail; @@ -2406,7 +2439,17 @@ done_setup: if (flent->fe_rx_ring_group != NULL) mac_rx_group_unmark(flent->fe_rx_ring_group, MR_INCIPIENT); FLOW_UNMARK(flent, FE_INCIPIENT); - FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + + /* + * If this is an aggr port client, don't enable the flow's + * datapath at this stage. Otherwise, bcast traffic could + * arrive while the aggr port is in the process of + * initializing. Instead, the flow's datapath is started later + * when mac_client_set_flow_cb() is called. + */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) == 0) + FLOW_UNMARK(flent, FE_MC_NO_DATAPATH); + mac_tx_client_unblock(mcip); return (0); bail: @@ -2475,8 +2518,12 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, boolean_t is_vnic_primary = (flags & MAC_UNICAST_VNIC_PRIMARY); - /* when VID is non-zero, the underlying MAC can not be VNIC */ - ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != 0))); + /* + * When the VID is non-zero the underlying MAC cannot be a + * VNIC. 
I.e., dladm create-vlan cannot take a VNIC as + * argument, only the primary MAC client. + */ + ASSERT(!((mip->mi_state_flags & MIS_IS_VNIC) && (vid != VLAN_ID_NONE))); /* * Can't unicast add if the client asked only for minimal datapath @@ -2489,18 +2536,19 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, * Check for an attempted use of the current Port VLAN ID, if enabled. * No client may use it. */ - if (mip->mi_pvid != 0 && vid == mip->mi_pvid) + if (mip->mi_pvid != VLAN_ID_NONE && vid == mip->mi_pvid) return (EBUSY); /* * Check whether it's the primary client and flag it. */ - if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && vid == 0) + if (!(mcip->mci_state_flags & MCIS_IS_VNIC) && is_primary && + vid == VLAN_ID_NONE) mcip->mci_flags |= MAC_CLIENT_FLAGS_PRIMARY; /* * is_vnic_primary is true when we come here as a VLAN VNIC - * which uses the primary mac client's address but with a non-zero + * which uses the primary MAC client's address but with a non-zero * VID. In this case the MAC address is not specified by an upper * MAC client. */ @@ -2552,7 +2600,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, /* * Create a handle for vid 0. */ - ASSERT(vid == 0); + ASSERT(vid == VLAN_ID_NONE); muip = kmem_zalloc(sizeof (mac_unicast_impl_t), KM_SLEEP); muip->mui_vid = vid; *mah = (mac_unicast_handle_t)muip; @@ -2572,7 +2620,9 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, } /* - * If this is a VNIC/VLAN, disable softmac fast-path. + * If this is a VNIC/VLAN, disable softmac fast-path. This is + * only relevant to legacy devices which use softmac to + * interface with GLDv3. */ if (mcip->mci_state_flags & MCIS_IS_VNIC) { err = mac_fastpath_disable((mac_handle_t)mip); @@ -2620,9 +2670,11 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, (void) mac_client_set_resources(mch, mrp); } else if (mcip->mci_state_flags & MCIS_IS_VNIC) { /* - * This is a primary VLAN client, we don't support - * specifying rings property for this as it inherits the - * rings property from its MAC. + * This is a VLAN client sharing the address of the + * primary MAC client; i.e., one created via dladm + * create-vlan. We don't support specifying ring + * properties for this type of client as it inherits + * these from the primary MAC client. */ if (is_vnic_primary) { mac_resource_props_t *vmrp; @@ -2681,7 +2733,7 @@ i_mac_unicast_add(mac_client_handle_t mch, uint8_t *mac_addr, uint16_t flags, /* * Set the flags here so that if this is a passive client, we - * can return and set it when we call mac_client_datapath_setup + * can return and set it when we call mac_client_datapath_setup * when this becomes the active client. If we defer to using these * flags to mac_client_datapath_setup, then for a passive client, * we'd have to store the flags somewhere (probably fe_flags) @@ -2918,7 +2970,7 @@ mac_client_datapath_teardown(mac_client_handle_t mch, mac_unicast_impl_t *muip, mac_misc_stat_delete(flent); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_cb_arg1 = NULL; flent->fe_cb_arg2 = NULL; @@ -2984,14 +3036,14 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) i_mac_perim_enter(mip); if (mcip->mci_flags & MAC_CLIENT_FLAGS_VNIC_PRIMARY) { /* - * Called made by the upper MAC client of a VNIC. + * Call made by the upper MAC client of a VNIC. 
* There's nothing much to do, the unicast address will * be removed by the VNIC driver when the VNIC is deleted, * but let's ensure that all our transmit is done before * the client does a mac_client_stop lest it trigger an * assert in the driver. */ - ASSERT(muip->mui_vid == 0); + ASSERT(muip->mui_vid == VLAN_ID_NONE); mac_tx_client_flush(mcip); @@ -3055,6 +3107,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) i_mac_perim_exit(mip); return (0); } + /* * Remove the VID from the list of client's VIDs. */ @@ -3081,7 +3134,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) * flows. */ flent = mac_client_get_flow(mcip, muip); - ASSERT(flent != NULL); + VERIFY3P(flent, !=, NULL); /* * The first one is disappearing, need to make sure @@ -3109,6 +3162,7 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) FLOW_FINAL_REFRELE(flent); ASSERT(!(mcip->mci_state_flags & MCIS_EXCLUSIVE)); + /* * Enable fastpath if this is a VNIC or a VLAN. */ @@ -3122,7 +3176,8 @@ mac_unicast_remove(mac_client_handle_t mch, mac_unicast_handle_t mah) mui_vid = muip->mui_vid; mac_client_datapath_teardown(mch, muip, flent); - if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && mui_vid == 0) { + if ((mcip->mci_flags & MAC_CLIENT_FLAGS_PRIMARY) && + mui_vid == VLAN_ID_NONE) { mcip->mci_flags &= ~MAC_CLIENT_FLAGS_PRIMARY; } else { i_mac_perim_exit(mip); @@ -3264,6 +3319,11 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mac_cb_info_t *mcbi; int rc; + if ((flags & MAC_PROMISC_FLAGS_NO_COPY) && + (flags & MAC_PROMISC_FLAGS_DO_FIXUPS)) { + return (EINVAL); + } + i_mac_perim_enter(mip); if ((rc = mac_start((mac_handle_t)mip)) != 0) { @@ -3310,6 +3370,7 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mpip->mpi_strip_vlan_tag = ((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0); mpip->mpi_no_copy = ((flags & MAC_PROMISC_FLAGS_NO_COPY) != 0); + mpip->mpi_do_fixups = ((flags & MAC_PROMISC_FLAGS_DO_FIXUPS) != 0); mcbi = &mip->mi_promisc_cb_info; mutex_enter(mcbi->mcbi_lockp); @@ -3530,6 +3591,13 @@ mac_tx(mac_client_handle_t mch, mblk_t *mp_chain, uintptr_t hint, obytes = (mp_chain->b_cont == NULL ? MBLKL(mp_chain) : msgdsize(mp_chain)); + /* + * There's a chance this primary client might be part + * of a bridge and the packet forwarded to a local + * receiver -- mark the packet accordingly. + */ + DB_CKSUMFLAGS(mp_chain) |= HW_LOCAL_MAC; + MAC_TX(mip, srs_tx->st_arg2, mp_chain, mcip); if (mp_chain == NULL) { cookie = NULL; @@ -3943,33 +4011,63 @@ mac_client_get_effective_resources(mac_client_handle_t mch, * The unicast packets of MAC_CLIENT_PROMISC_FILTER callbacks are dispatched * after classification by mac_rx_deliver(). */ - static void mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, boolean_t loopback) { - mblk_t *mp_copy, *mp_next; + mblk_t *mp_next; + boolean_t local = (DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0; + + if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag || + (mpip->mpi_do_fixups && local)) { + mblk_t *mp_copy; - if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) { mp_copy = copymsg(mp); if (mp_copy == NULL) return; + /* + * The consumer has requested we emulate HW offloads + * for host-local packets. + */ + if (mpip->mpi_do_fixups && local) { + /* + * Remember that copymsg() doesn't copy + * b_next, so we are only passing a single + * packet to mac_hw_emul(). Also keep in mind + * that mp_copy will become an mblk chain if + * the argument is an LSO message. 
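/*
 * Illustrative sketch (not part of this change): the per-callback
 * decision made in mac_promisc_dispatch_one() above.  A private copy
 * is needed whenever the consumer did not opt out of copying, wants
 * the VLAN tag stripped, or asked for HW-offload emulation on packets
 * that originated on this host.  Names are invented stand-ins.
 */
#include <stdbool.h>

typedef struct {
	bool	sp_no_copy;	/* consumer allows sharing the message */
	bool	sp_strip_vlan;	/* consumer wants the VLAN tag removed */
	bool	sp_do_fixups;	/* consumer wants offloads emulated */
} sketch_promisc_t;

static bool
sketch_needs_copy(const sketch_promisc_t *p, bool local_packet)
{
	return (!p->sp_no_copy || p->sp_strip_vlan ||
	    (p->sp_do_fixups && local_packet));
}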
+ */ + mac_hw_emul(&mp_copy, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); + + if (mp_copy == NULL) + return; + } + if (mpip->mpi_strip_vlan_tag) { mp_copy = mac_strip_vlan_tag_chain(mp_copy); if (mp_copy == NULL) return; } - mp_next = NULL; - } else { - mp_copy = mp; - mp_next = mp->b_next; + + /* + * There is code upstack that can't deal with message + * chains. + */ + for (mblk_t *tmp = mp_copy; tmp != NULL; tmp = mp_next) { + mp_next = tmp->b_next; + tmp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, tmp, loopback); + } + + return; } - mp_copy->b_next = NULL; - mpip->mpi_fn(mpip->mpi_arg, NULL, mp_copy, loopback); - if (mp_copy == mp) - mp->b_next = mp_next; + mp_next = mp->b_next; + mp->b_next = NULL; + mpip->mpi_fn(mpip->mpi_arg, NULL, mp, loopback); + mp->b_next = mp_next; } /* @@ -4051,8 +4149,9 @@ mac_promisc_dispatch(mac_impl_t *mip, mblk_t *mp_chain, if (is_sender || mpip->mpi_type == MAC_CLIENT_PROMISC_ALL || - is_mcast) + is_mcast) { mac_promisc_dispatch_one(mpip, mp, is_sender); + } } } MAC_PROMISC_WALKER_DCR(mip); @@ -4152,16 +4251,15 @@ mac_info_get(const char *name, mac_info_t *minfop) /* * To get the capabilities that MAC layer cares about, such as rings, factory * mac address, vnic or not, it should directly invoke this function. If the - * link is part of a bridge, then the only "capability" it has is the inability - * to do zero copy. + * link is part of a bridge, then the link is unable to do zero copy. */ boolean_t i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) { mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_bridge_link != NULL) - return (cap == MAC_CAPAB_NO_ZCOPY); + if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) + return (B_TRUE); else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); else @@ -4180,8 +4278,9 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) mac_impl_t *mip = (mac_impl_t *)mh; /* - * if mi_nactiveclients > 1, only MAC_CAPAB_LEGACY, MAC_CAPAB_HCKSUM, - * MAC_CAPAB_NO_NATIVEVLAN and MAC_CAPAB_NO_ZCOPY can be advertised. + * Some capabilities are restricted when there are more than one active + * clients on the MAC resource. The ones noted below are safe, + * independent of that count. */ if (mip->mi_nactiveclients > 1) { switch (cap) { @@ -4189,6 +4288,7 @@ mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) return (B_TRUE); case MAC_CAPAB_LEGACY: case MAC_CAPAB_HCKSUM: + case MAC_CAPAB_LSO: case MAC_CAPAB_NO_NATIVEVLAN: break; default: @@ -4340,7 +4440,13 @@ mac_addr_len(mac_handle_t mh) boolean_t mac_is_vnic(mac_handle_t mh) { - return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC); + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0); +} + +boolean_t +mac_is_overlay(mac_handle_t mh) +{ + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0); } mac_handle_t diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index 8a8f9a9269..70585df698 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -604,6 +604,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg) * * TODO: Cleanup and tighten some of the assumptions. 
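/*
 * Illustrative sketch (not part of this change): the chain-splitting
 * loop used above.  Offload emulation can turn one LSO message into a
 * chain, but promiscuous consumers expect single packets, so the
 * chain is walked and each node unlinked before delivery.  The list
 * type here is a stand-in for an mblk chain linked by b_next.
 */
#include <stddef.h>

typedef struct sketch_pkt {
	struct sketch_pkt	*sp_next;
	/* packet payload omitted */
} sketch_pkt_t;

static void
sketch_deliver_each(sketch_pkt_t *chain, void (*deliver)(sketch_pkt_t *))
{
	sketch_pkt_t *next;

	for (sketch_pkt_t *p = chain; p != NULL; p = next) {
		next = p->sp_next;
		p->sp_next = NULL;	/* hand off exactly one packet */
		deliver(p);
	}
}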
*/ +boolean_t mac_check_overlay = B_TRUE; boolean_t mac_use_bw_heuristic = B_TRUE; static int mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) @@ -611,6 +612,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) uint64_t cpu_speed, bw = 0; int srings = 0; boolean_t bw_enabled = B_FALSE; + mac_client_impl_t *mcip = flent->fe_mcip; ASSERT(!(flent->fe_type & FLOW_USER)); if (flent->fe_resource_props.mrp_mask & MRP_MAXBW && @@ -638,7 +640,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) */ if (mac_soft_ring_enable) srings = srings * 2; + } else if (mac_check_overlay == B_TRUE && + (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) { + /* Is this a VNIC on an overlay? */ + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; + if (mac_is_overlay(mh) == B_TRUE) { + srings = mac_rx_soft_ring_10gig_count; + } } + + } else { /* * Soft ring computation using CPU speed and specified @@ -1186,7 +1197,7 @@ mac_srs_fanout_list_alloc(mac_soft_ring_set_t *mac_srs) mac_srs->srs_tx_soft_rings = (mac_soft_ring_t **) kmem_zalloc(sizeof (mac_soft_ring_t *) * MAX_RINGS_PER_GROUP, KM_SLEEP); - if (mcip->mci_state_flags & MCIS_IS_AGGR) { + if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) { mac_srs_tx_t *tx = &mac_srs->srs_tx; tx->st_soft_rings = (mac_soft_ring_t **) @@ -1595,13 +1606,13 @@ mac_srs_update_bwlimit(flow_entry_t *flent, mac_resource_props_t *mrp) /* * When the first sub-flow is added to a link, we disable polling on the - * link and also modify the entry point to mac_rx_srs_subflow_process. + * link and also modify the entry point to mac_rx_srs_subflow_process(). * (polling is disabled because with the subflow added, accounting * for polling needs additional logic, it is assumed that when a subflow is * added, we can take some hit as a result of disabling polling rather than * adding more complexity - if this becomes a perf. issue we need to * re-rvaluate this logic). When the last subflow is removed, we turn back - * polling and also reset the entry point to mac_rx_srs_process. + * polling and also reset the entry point to mac_rx_srs_process(). * * In the future if there are multiple SRS, we can simply * take one and give it to the flow rather than disabling polling and @@ -1646,7 +1657,7 @@ mac_client_update_classifier(mac_client_impl_t *mcip, boolean_t enable) * Change the S/W classifier so that we can land in the * correct processing function with correct argument. * If all subflows have been removed we can revert to - * mac_rx_srsprocess, else we need mac_rx_srs_subflow_process. + * mac_rx_srs_process(), else we need mac_rx_srs_subflow_process(). */ mutex_enter(&flent->fe_lock); flent->fe_cb_fn = (flow_fn_t)rx_func; @@ -1977,8 +1988,6 @@ no_softrings: } /* - * mac_fanout_setup: - * * Calls mac_srs_fanout_init() or modify() depending upon whether * the SRS is getting initialized or re-initialized. */ @@ -1991,14 +2000,14 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent, int i, rx_srs_cnt; ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + /* - * This is an aggregation port. Fanout will be setup - * over the aggregation itself. + * Aggr ports do not have SRSes. This function should never be + * called on an aggr port. */ - if (mcip->mci_state_flags & MCIS_EXCLUSIVE) - return; - + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); mac_rx_srs = flent->fe_rx_srs[0]; + /* * Set up the fanout on the tx side only once, with the * first rx SRS. 
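/*
 * Illustrative sketch (not part of this change): the shape of the
 * soft ring count decision above.  Without a bandwidth cap, the count
 * is driven by link speed, with an extra case so a VNIC sitting on an
 * overlay is sized like a 10 Gb link instead of falling through to
 * the CPU-speed heuristic.  The constants are placeholders, not the
 * kernel's tunables, and the bandwidth- and CPU-based paths are
 * elided.
 */
#include <stdbool.h>
#include <stdint.h>

#define	SKETCH_RINGS_DEFAULT	2	/* placeholder */
#define	SKETCH_RINGS_10G	8	/* placeholder */

static int
sketch_soft_ring_count(bool bw_capped, uint64_t ifspeed_mbps,
    bool vnic_on_overlay, bool fanout_doubling)
{
	if (bw_capped) {
		/* Bandwidth-based sizing elided in this sketch. */
		return (SKETCH_RINGS_DEFAULT);
	}

	if (ifspeed_mbps > 1000) {
		int srings = SKETCH_RINGS_10G;

		/* Soft ring fanout may double the count on fast links. */
		return (fanout_doubling ? srings * 2 : srings);
	}

	if (vnic_on_overlay)
		return (SKETCH_RINGS_10G);

	/* Otherwise fall back to the CPU-speed based heuristic. */
	return (SKETCH_RINGS_DEFAULT);
}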
The CPU binding, fanout, and bandwidth @@ -2054,8 +2063,6 @@ mac_fanout_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } /* - * mac_srs_create: - * * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is * SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side * processing is created. @@ -2187,7 +2194,7 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, * find nothing plus we have an existing backlog * (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll * the H/W for packets anymore (let the polling thread go to sleep). - * 5) Once the backlog is relived (packets are processed) we reenable + * 5) Once the backlog is relieved (packets are processed) we reenable * polling (by signalling the poll thread) only when the backlog * dips below sr_poll_thres. * 6) sr_hiwat is used exclusively when we are not polling capable @@ -2210,7 +2217,14 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, mac_srs->srs_state |= SRS_SOFTRING_QUEUE; } - mac_srs->srs_worker = thread_create(NULL, 0, + /* + * Create the srs_worker with twice the stack of a normal kernel thread + * to reduce the likelihood of stack overflows in receive-side + * processing. (The larger stacks are not the only precaution taken + * against stack overflows; see the use of the MAC_RX_SRS_TOODEEP + * macro for details.) + */ + mac_srs->srs_worker = thread_create(NULL, default_stksize << 1, mac_srs_worker, mac_srs, 0, &p0, TS_RUN, mac_srs->srs_pri); if (is_tx_srs) { @@ -2258,8 +2272,8 @@ mac_srs_create(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t srs_type, /* * Some drivers require serialization and don't send * packet chains in interrupt context. For such - * drivers, we should always queue in soft ring - * so that we get a chance to switch into a polling + * drivers, we should always queue in the soft ring + * so that we get a chance to switch into polling * mode under backlog. */ ring_info = mac_hwring_getinfo((mac_ring_handle_t)ring); @@ -2357,6 +2371,10 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_rx_srs_group_setup(mcip, flent, link_type); mac_tx_srs_group_setup(mcip, flent, link_type); + /* Aggr ports don't have SRSes; thus there is no soft ring fanout. */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0) + return; + pool_lock(); cpupart = mac_pset_find(mrp, &use_default); mac_fanout_setup(mcip, flent, MCIP_RESOURCE_PROPS(mcip), @@ -2366,9 +2384,11 @@ mac_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } /* - * Set up the RX SRSs. If the S/W SRS is not set, set it up, if there - * is a group associated with this MAC client, set up SRSs for individual - * h/w rings. + * Set up the Rx SRSes. If there is no group associated with the + * client, then only setup SW classification. If the client has + * exlusive (MAC_GROUP_STATE_RESERVED) use of the group, then create an + * SRS for each HW ring. If the client is sharing a group, then make + * sure to teardown the HW SRSes. */ void mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, @@ -2379,13 +2399,37 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_ring_t *ring; uint32_t fanout_type; mac_group_t *rx_group = flent->fe_rx_ring_group; + boolean_t no_unicast; + + /* + * If this is an an aggr port, then don't setup Rx SRS and Rx + * soft rings as they won't be used. However, we still need to + * start the rings to receive data on them. 
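The doubled worker stack goes hand in hand with the stack-depth guard mentioned in the new comment. A hedged sketch of how such a guard is typically consulted on the receive path; only the MAC_RX_SRS_TOODEEP() macro (redefined later in mac_sched.c) comes from the patch, the surrounding control flow is illustrative:

	/*
	 * Sketch only: when little stack remains on the current (often
	 * interrupt) thread, queue the chain and let srs_worker -- which
	 * runs on the larger stack created above -- drain it later.
	 */
	if (MAC_RX_SRS_TOODEEP()) {
		/* append the chain to the SRS queue and wake srs_worker */
	} else {
		/* enough stack remains; process the chain in situ */
	}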
+ */ + if (mcip->mci_state_flags & MCIS_IS_AGGR_PORT) { + if (rx_group == NULL) + return; + + for (ring = rx_group->mrg_rings; ring != NULL; + ring = ring->mr_next) { + if (ring->mr_state != MR_INUSE) + (void) mac_start_ring(ring); + } + + return; + } + + /* + * Aggr ports should never have SRSes. + */ + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); fanout_type = mac_find_fanout(flent, link_type); + no_unicast = (mcip->mci_state_flags & MCIS_NO_UNICAST_ADDR) != 0; - /* Create the SRS for S/W classification if none exists */ + /* Create the SRS for SW classification if none exists */ if (flent->fe_rx_srs[0] == NULL) { ASSERT(flent->fe_rx_srs_cnt == 0); - /* Setup the Rx SRS */ mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type, mac_rx_deliver, mcip, NULL, NULL); mutex_enter(&flent->fe_lock); @@ -2397,15 +2441,17 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, if (rx_group == NULL) return; + /* - * fanout for default SRS is done when default SRS are created - * above. As each ring is added to the group, we setup the - * SRS and fanout to it. + * If the group is marked RESERVED then setup an SRS and + * fanout for each HW ring. */ switch (rx_group->mrg_state) { case MAC_GROUP_STATE_RESERVED: for (ring = rx_group->mrg_rings; ring != NULL; ring = ring->mr_next) { + uint16_t vid = i_mac_flow_vid(mcip->mci_flent); + switch (ring->mr_state) { case MR_INUSE: case MR_FREE: @@ -2415,20 +2461,23 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, (void) mac_start_ring(ring); /* - * Since the group is exclusively ours create - * an SRS for this ring to allow the - * individual SRS to dynamically poll the - * ring. Do this only if the client is not - * a VLAN MAC client, since for VLAN we do - * s/w classification for the VID check, and - * if it has a unicast address. + * If a client requires SW VLAN + * filtering or has no unicast address + * then we don't create any HW ring + * SRSes. */ - if ((mcip->mci_state_flags & - MCIS_NO_UNICAST_ADDR) || - i_mac_flow_vid(mcip->mci_flent) != - VLAN_ID_NONE) { + if ((!MAC_GROUP_HW_VLAN(rx_group) && + vid != VLAN_ID_NONE) || no_unicast) break; - } + + /* + * When a client has exclusive use of + * a group, and that group's traffic + * is fully HW classified, we create + * an SRS for each HW ring in order to + * make use of dynamic polling of said + * HW rings. + */ mac_srs = mac_srs_create(mcip, flent, fanout_type | link_type, mac_rx_deliver, mcip, NULL, ring); @@ -2444,14 +2493,9 @@ mac_rx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, break; case MAC_GROUP_STATE_SHARED: /* - * Set all rings of this group to software classified. - * - * If the group is current RESERVED, the existing mac - * client (the only client on this group) is using - * this group exclusively. In that case we need to - * disable polling on the rings of the group (if it - * was enabled), and free the SRS associated with the - * rings. + * When a group is shared by multiple clients, we must + * use SW classifiction to ensure packets are + * delivered to the correct client. */ mac_rx_switch_grp_to_sw(rx_group); break; @@ -2468,46 +2512,49 @@ void mac_tx_srs_group_setup(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t link_type) { - int cnt; - int ringcnt; - mac_ring_t *ring; - mac_group_t *grp; - /* - * If we are opened exclusively (like aggr does for aggr_ports), - * don't set up Tx SRS and Tx soft rings as they won't be used. - * The same thing has to be done for Rx side also. 
See bug: - * 6880080 + * If this is an exclusive client (e.g. an aggr port), then + * don't setup Tx SRS and Tx soft rings as they won't be used. + * However, we still need to start the rings to send data + * across them. */ if (mcip->mci_state_flags & MCIS_EXCLUSIVE) { - /* - * If we have rings, start them here. - */ - if (flent->fe_tx_ring_group == NULL) - return; + mac_ring_t *ring; + mac_group_t *grp; + grp = (mac_group_t *)flent->fe_tx_ring_group; - ringcnt = grp->mrg_cur_count; - ring = grp->mrg_rings; - for (cnt = 0; cnt < ringcnt; cnt++) { - if (ring->mr_state != MR_INUSE) { + + if (grp == NULL) + return; + + for (ring = grp->mrg_rings; ring != NULL; + ring = ring->mr_next) { + if (ring->mr_state != MR_INUSE) (void) mac_start_ring(ring); - } - ring = ring->mr_next; } + return; } + + /* + * Aggr ports should never have SRSes. + */ + ASSERT3U((mcip->mci_state_flags & MCIS_IS_AGGR_PORT), ==, 0); + if (flent->fe_tx_srs == NULL) { (void) mac_srs_create(mcip, flent, SRST_TX | link_type, NULL, mcip, NULL, NULL); } + mac_tx_srs_setup(mcip, flent); } /* - * Remove all the RX SRSs. If we want to remove only the SRSs associated - * with h/w rings, leave the S/W SRS alone. This is used when we want to - * move the MAC client from one group to another, so we need to teardown - * on the h/w SRSs. + * Teardown all the Rx SRSes. Unless hwonly is set, then only teardown + * the Rx HW SRSes and leave the SW SRS alone. The hwonly flag is set + * when we wish to move a MAC client from one group to another. In + * that case, we need to release the current HW SRSes but keep the SW + * SRS for continued traffic classifiction. */ void mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly) @@ -2525,8 +2572,16 @@ mac_rx_srs_group_teardown(flow_entry_t *flent, boolean_t hwonly) flent->fe_rx_srs[i] = NULL; flent->fe_rx_srs_cnt--; } - ASSERT(!hwonly || flent->fe_rx_srs_cnt == 1); - ASSERT(hwonly || flent->fe_rx_srs_cnt == 0); + + /* + * If we are only tearing down the HW SRSes then there must be + * one SRS left for SW classification. Otherwise we are tearing + * down both HW and SW and there should be no SRSes left. + */ + if (hwonly) + VERIFY3S(flent->fe_rx_srs_cnt, ==, 1); + else + VERIFY3S(flent->fe_rx_srs_cnt, ==, 0); } /* @@ -2828,6 +2883,7 @@ mac_group_next_state(mac_group_t *grp, mac_client_impl_t **group_only_mcip, * even if this is the only client in the default group, we will * leave group as shared). */ + int mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, uint32_t link_type) @@ -2839,6 +2895,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, mac_group_t *default_tgroup; int err; uint8_t *mac_addr; + uint16_t vid; mac_group_state_t next_state; mac_client_impl_t *group_only_mcip; mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); @@ -2850,6 +2907,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, boolean_t no_unicast; boolean_t isprimary = flent->fe_type & FLOW_PRIMARY_MAC; mac_client_impl_t *reloc_pmcip = NULL; + boolean_t use_hw; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); @@ -2881,15 +2939,19 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, (mrp->mrp_mask & MRP_TXRINGS_UNSPEC)); /* - * By default we have given the primary all the rings - * i.e. the default group. Let's see if the primary - * needs to be relocated so that the addition of this - * client doesn't impact the primary's performance, - * i.e. if the primary is in the default group and - * we add this client, the primary will lose polling. 
- * We do this only for NICs supporting dynamic ring - * grouping and only when this is the first client - * after the primary (i.e. nactiveclients is 2) + * All the rings initially belong to the default group + * under dynamic grouping. The primary client uses the + * default group when it is the only client. The + * default group is also used as the destination for + * all multicast and broadcast traffic of all clients. + * Therefore, the primary client loses its ability to + * poll the softrings on addition of a second client. + * To avoid a performance penalty, MAC will move the + * primary client to a dedicated group when it can. + * + * When using static grouping, the primary client + * begins life on a non-default group. There is + * no moving needed upon addition of a second client. */ if (!isprimary && mip->mi_nactiveclients == 2 && (group_only_mcip = mac_primary_client_handle(mip)) != @@ -2897,6 +2959,7 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, reloc_pmcip = mac_check_primary_relocation( group_only_mcip, rxhw); } + /* * Check to see if we can get an exclusive group for * this mac address or if there already exists a @@ -2910,6 +2973,26 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, } else if (rgroup == NULL) { rgroup = default_rgroup; } + + /* + * If we are adding a second client to a + * non-default group then we need to move the + * existing client to the default group and + * add the new client to the default group as + * well. + */ + if (rgroup != default_rgroup && + rgroup->mrg_state == MAC_GROUP_STATE_RESERVED) { + group_only_mcip = MAC_GROUP_ONLY_CLIENT(rgroup); + err = mac_rx_switch_group(group_only_mcip, rgroup, + default_rgroup); + + if (err != 0) + goto setup_failed; + + rgroup = default_rgroup; + } + /* * Check to see if we can get an exclusive group for * this mac client. If no groups are available, use @@ -2941,14 +3024,17 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, rgroup->mrg_cur_count); } } + flent->fe_rx_ring_group = rgroup; /* - * Add the client to the group. This could cause - * either this group to move to the shared state or - * cause the default group to move to the shared state. - * The actions on this group are done here, while the - * actions on the default group are postponed to - * the end of this function. + * Add the client to the group and update the + * group's state. If rgroup != default_group + * then the rgroup should only ever have one + * client and be in the RESERVED state. But no + * matter what, the default_rgroup will enter + * the SHARED state since it has to receive + * all broadcast and multicast traffic. This + * case is handled later in the function. */ mac_group_add_client(rgroup, mcip); next_state = mac_group_next_state(rgroup, @@ -2973,28 +3059,37 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, &group_only_mcip, default_tgroup, B_FALSE); tgroup->mrg_state = next_state; } - /* - * Setup the Rx and Tx SRSes. If we got a pristine group - * exclusively above, mac_srs_group_setup would simply create - * the required SRSes. If we ended up sharing a previously - * reserved group, mac_srs_group_setup would also dismantle the - * SRSes of the previously exclusive group - */ - mac_srs_group_setup(mcip, flent, link_type); /* We are setting up minimal datapath only */ - if (no_unicast) + if (no_unicast) { + mac_srs_group_setup(mcip, flent, link_type); break; - /* Program the S/W Classifer */ + } + + /* Program software classification. 
*/ if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0) goto setup_failed; - /* Program the H/W Classifier */ - if ((err = mac_add_macaddr(mip, rgroup, mac_addr, - (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0)) != 0) + /* Program hardware classification. */ + vid = i_mac_flow_vid(flent); + use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0; + err = mac_add_macaddr_vlan(mip, rgroup, mac_addr, vid, use_hw); + + if (err != 0) goto setup_failed; + mcip->mci_unicast = mac_find_macaddr(mip, mac_addr); - ASSERT(mcip->mci_unicast != NULL); + VERIFY3P(mcip->mci_unicast, !=, NULL); + + /* + * Setup the Rx and Tx SRSes. If the client has a + * reserved group, then mac_srs_group_setup() creates + * the required SRSes for the HW rings. If we have a + * shared group, mac_srs_group_setup() dismantles the + * HW SRSes of the previously exclusive group. + */ + mac_srs_group_setup(mcip, flent, link_type); + /* (Re)init the v6 token & local addr used by link protection */ mac_protect_update_mac_token(mcip); break; @@ -3038,17 +3133,23 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent, ASSERT(default_rgroup->mrg_state == MAC_GROUP_STATE_SHARED); } + /* - * If we get an exclusive group for a VLAN MAC client we - * need to take the s/w path to make the additional check for - * the vid. Disable polling and set it to s/w classification. - * Similarly for clients that don't have a unicast address. + * A VLAN MAC client on a reserved group still + * requires SW classification if the MAC doesn't + * provide VLAN HW filtering. + * + * Clients with no unicast address also require SW + * classification. */ if (rgroup->mrg_state == MAC_GROUP_STATE_RESERVED && - (i_mac_flow_vid(flent) != VLAN_ID_NONE || no_unicast)) { + ((!MAC_GROUP_HW_VLAN(rgroup) && vid != VLAN_ID_NONE) || + no_unicast)) { mac_rx_switch_grp_to_sw(rgroup); } + } + mac_set_rings_effective(mcip); return (0); @@ -3074,6 +3175,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, boolean_t check_default_group = B_FALSE; mac_group_state_t next_state; mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); + uint16_t vid; ASSERT(MAC_PERIM_HELD((mac_handle_t)mip)); @@ -3086,16 +3188,24 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, case SRST_LINK: /* Stop sending packets */ mac_tx_client_block(mcip); + group = flent->fe_rx_ring_group; + vid = i_mac_flow_vid(flent); - /* Stop the packets coming from the H/W */ + /* + * Stop the packet flow from the hardware by disabling + * any hardware filters assigned to this client. + */ if (mcip->mci_unicast != NULL) { int err; - err = mac_remove_macaddr(mcip->mci_unicast); + + err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid); + if (err != 0) { - cmn_err(CE_WARN, "%s: failed to remove a MAC" - " address because of error 0x%x", + cmn_err(CE_WARN, "%s: failed to remove a MAC HW" + " filters because of error 0x%x", mip->mi_name, err); } + mcip->mci_unicast = NULL; } @@ -3103,12 +3213,12 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, mac_flow_remove(mip->mi_flow_tab, flent, B_FALSE); mac_flow_wait(flent, FLOW_DRIVER_UPCALL); - /* Now quiesce and destroy all SRS and soft rings */ + /* Quiesce and destroy all the SRSes. */ mac_rx_srs_group_teardown(flent, B_FALSE); mac_tx_srs_group_teardown(mcip, flent, SRST_LINK); - ASSERT((mcip->mci_flent == flent) && - (flent->fe_next == NULL)); + ASSERT3P(mcip->mci_flent, ==, flent); + ASSERT3P(flent->fe_next, ==, NULL); /* * Release our hold on the group as well. 
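Because the unicast programming interface changes shape here (the VID now travels with the address), a condensed sketch of the add/remove pairing may help. It only rearranges calls that appear in the surrounding hunks, with declarations and error handling elided:

	/*
	 * Sketch only: setup programs the address and, when the group can
	 * filter VLANs in hardware, the VID as well; teardown must pass
	 * the same VID back so the matching filter is released.
	 */
	vid = i_mac_flow_vid(flent);
	use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;

	if ((err = mac_add_macaddr_vlan(mip, rgroup, mac_addr, vid,
	    use_hw)) != 0)
		goto setup_failed;

	mcip->mci_unicast = mac_find_macaddr(mip, mac_addr);

	/* ... and on datapath teardown ... */
	err = mac_remove_macaddr_vlan(mcip->mci_unicast, vid);
	mcip->mci_unicast = NULL;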
We need @@ -3116,17 +3226,17 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, * left who can use it exclusively. Also, if we * were the last client, release the group. */ - group = flent->fe_rx_ring_group; default_group = MAC_DEFAULT_RX_GROUP(mip); if (group != NULL) { mac_group_remove_client(group, mcip); next_state = mac_group_next_state(group, &grp_only_mcip, default_group, B_TRUE); + if (next_state == MAC_GROUP_STATE_RESERVED) { /* * Only one client left on this RX group. */ - ASSERT(grp_only_mcip != NULL); + VERIFY3P(grp_only_mcip, !=, NULL); mac_set_group_state(group, MAC_GROUP_STATE_RESERVED); group_only_flent = grp_only_mcip->mci_flent; @@ -3151,7 +3261,7 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, * to see if the primary client can get * exclusive access to the default group. */ - ASSERT(group != MAC_DEFAULT_RX_GROUP(mip)); + VERIFY3P(group, !=, MAC_DEFAULT_RX_GROUP(mip)); if (mrp->mrp_mask & MRP_RX_RINGS) { MAC_RX_GRP_RELEASED(mip); if (mip->mi_rx_group_type == @@ -3165,7 +3275,8 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, MAC_GROUP_STATE_REGISTERED); check_default_group = B_TRUE; } else { - ASSERT(next_state == MAC_GROUP_STATE_SHARED); + VERIFY3S(next_state, ==, + MAC_GROUP_STATE_SHARED); mac_set_group_state(group, MAC_GROUP_STATE_SHARED); mac_rx_group_unmark(group, MR_CONDEMNED); @@ -3254,12 +3365,12 @@ mac_datapath_teardown(mac_client_impl_t *mcip, flow_entry_t *flent, */ if (check_default_group) { default_group = MAC_DEFAULT_RX_GROUP(mip); - ASSERT(default_group->mrg_state == MAC_GROUP_STATE_SHARED); + VERIFY3S(default_group->mrg_state, ==, MAC_GROUP_STATE_SHARED); next_state = mac_group_next_state(default_group, &grp_only_mcip, default_group, B_TRUE); if (next_state == MAC_GROUP_STATE_RESERVED) { - ASSERT(grp_only_mcip != NULL && - mip->mi_nactiveclients == 1); + VERIFY3P(grp_only_mcip, !=, NULL); + VERIFY3U(mip->mi_nactiveclients, ==, 1); mac_set_group_state(default_group, MAC_GROUP_STATE_RESERVED); mac_rx_srs_group_setup(grp_only_mcip, @@ -3385,7 +3496,7 @@ mac_srs_free(mac_soft_ring_set_t *mac_srs) ASSERT((mac_srs->srs_state & (SRS_CONDEMNED | SRS_CONDEMNED_DONE | SRS_PROC | SRS_PROC_FAST)) == (SRS_CONDEMNED | SRS_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, mac_srs->srs_first, B_FALSE); + mac_drop_chain(mac_srs->srs_first, "SRS free"); mac_srs_ring_free(mac_srs); mac_srs_soft_rings_free(mac_srs); mac_srs_fanout_list_free(mac_srs); @@ -3783,7 +3894,7 @@ mac_tx_srs_del_ring(mac_soft_ring_set_t *mac_srs, mac_ring_t *tx_ring) * is also stored in st_soft_rings[] array. That entry should * be removed. */ - if (mcip->mci_state_flags & MCIS_IS_AGGR) { + if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) { mac_srs_tx_t *tx = &mac_srs->srs_tx; ASSERT(tx->st_soft_rings[tx_ring->mr_index] == remove_sring); @@ -3812,7 +3923,7 @@ mac_tx_srs_setup(mac_client_impl_t *mcip, flow_entry_t *flent) boolean_t is_aggr; uint_t ring_info = 0; - is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR) != 0; + is_aggr = (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) != 0; grp = flent->fe_tx_ring_group; if (grp == NULL) { ring = (mac_ring_t *)mip->mi_default_tx_ring; @@ -3956,8 +4067,8 @@ mac_fanout_recompute_client(mac_client_impl_t *mcip, cpupart_t *cpupart) } /* - * Walk through the list of mac clients for the MAC. - * For each active mac client, recompute the number of soft rings + * Walk through the list of MAC clients for the MAC. 
+ * For each active MAC client, recompute the number of soft rings * associated with every client, only if current speed is different * from the speed that was previously used for soft ring computation. * If the cable is disconnected whlie the NIC is started, we would get @@ -3980,6 +4091,10 @@ mac_fanout_recompute(mac_impl_t *mip) for (mcip = mip->mi_clients_list; mcip != NULL; mcip = mcip->mci_client_next) { + /* Aggr port clients don't have SRSes. */ + if ((mcip->mci_state_flags & MCIS_IS_AGGR_PORT) != 0) + continue; + if ((mcip->mci_state_flags & MCIS_SHARE_BOUND) != 0 || !MCIP_DATAPATH_SETUP(mcip)) continue; @@ -3992,6 +4107,7 @@ mac_fanout_recompute(mac_impl_t *mip) mac_set_pool_effective(use_default, cpupart, mrp, emrp); pool_unlock(); } + i_mac_perim_exit(mip); } diff --git a/usr/src/uts/common/io/mac/mac_flow.c b/usr/src/uts/common/io/mac/mac_flow.c index aa4985fe4c..62612122d6 100644 --- a/usr/src/uts/common/io/mac/mac_flow.c +++ b/usr/src/uts/common/io/mac/mac_flow.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ #include <sys/strsun.h> @@ -229,7 +230,7 @@ mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name, cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL); /* Initialize the receiver function to a safe routine */ - flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop; + flent->fe_cb_fn = (flow_fn_t)mac_rx_def; flent->fe_index = -1; } (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN); diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c index da83dc643e..ee493bbca1 100644 --- a/usr/src/uts/common/io/mac/mac_protect.c +++ b/usr/src/uts/common/io/mac/mac_protect.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved. */ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 
@@ -209,7 +209,7 @@ typedef struct slaac_addr { } slaac_addr_t; static void start_txn_cleanup_timer(mac_client_impl_t *); -static boolean_t allowed_ips_set(mac_resource_props_t *, uint32_t); +static boolean_t dynamic_method_set(mac_protect_t *, uint32_t); #define BUMP_STAT(m, s) (m)->mci_misc_stat.mms_##s++ @@ -580,8 +580,7 @@ intercept_dhcpv4_outbound(mac_client_impl_t *mcip, ipha_t *ipha, uchar_t *end) if (get_dhcpv4_info(ipha, end, &dh4) != 0) return (B_TRUE); - /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */ - if (allowed_ips_set(mrp, IPV4_VERSION)) + if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV4)) return (B_FALSE); if (get_dhcpv4_option(dh4, end, CD_DHCP_TYPE, &opt, &opt_len) != 0 || @@ -1310,8 +1309,7 @@ intercept_dhcpv6_outbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end) if (get_dhcpv6_info(ip6h, end, &dh6) != 0) return (B_TRUE); - /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */ - if (allowed_ips_set(mrp, IPV6_VERSION)) + if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV6)) return (B_FALSE); /* @@ -1517,6 +1515,10 @@ intercept_ra_inbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end, { struct nd_opt_hdr *opt; int len, optlen; + mac_protect_t *protect = &MCIP_RESOURCE_PROPS(mcip)->mrp_protect; + + if (!dynamic_method_set(protect, MPT_DYN_SLAAC)) + return; if (ip6h->ip6_hlim != 255) { DTRACE_PROBE1(invalid__hoplimit, uint8_t, ip6h->ip6_hlim); @@ -1755,6 +1757,7 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect, if (*addr == INADDR_ANY) return (B_TRUE); + /* If any specific addresses or subnets are allowed, check them */ for (i = 0; i < protect->mp_ipaddrcnt; i++) { mac_ipaddr_t *v4addr = &protect->mp_ipaddrs[i]; @@ -1775,14 +1778,19 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect, return (B_TRUE); } } - return (protect->mp_ipaddrcnt == 0 ? 
- check_dhcpv4_dyn_ip(mcip, *addr) : B_FALSE); + + if (dynamic_method_set(protect, MPT_DYN_DHCPV4)) { + return (check_dhcpv4_dyn_ip(mcip, *addr)); + } + + return (B_FALSE); } static boolean_t ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, in6_addr_t *addr) { + boolean_t slaac_enabled, dhcpv6_enabled; uint_t i; /* @@ -1793,7 +1801,7 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, IN6_ARE_ADDR_EQUAL(&mcip->mci_v6_local_addr, addr))) return (B_TRUE); - + /* If any specific addresses or subnets are allowed, check them */ for (i = 0; i < protect->mp_ipaddrcnt; i++) { mac_ipaddr_t *v6addr = &protect->mp_ipaddrs[i]; @@ -1804,12 +1812,15 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, return (B_TRUE); } - if (protect->mp_ipaddrcnt == 0) { - return (check_slaac_ip(mcip, addr) || - check_dhcpv6_dyn_ip(mcip, addr)); - } else { - return (B_FALSE); - } + slaac_enabled = dynamic_method_set(protect, MPT_DYN_SLAAC); + if (slaac_enabled && check_slaac_ip(mcip, addr)) + return (B_TRUE); + + dhcpv6_enabled = dynamic_method_set(protect, MPT_DYN_DHCPV6); + if (dhcpv6_enabled && check_dhcpv6_dyn_ip(mcip, addr)) + return (B_TRUE); + + return (B_FALSE); } /* @@ -2025,6 +2036,9 @@ dhcpnospoof_check_cid(mac_protect_t *p, uchar_t *cid, uint_t cidlen) bcmp(dcid->dc_id, cid, cidlen) == 0) return (B_TRUE); } + + DTRACE_PROBE3(missing__cid, mac_protect_t *, p, + uchar_t *, cid, uint_t, cidlen); return (B_FALSE); } @@ -2046,6 +2060,12 @@ dhcpnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *p, bcmp(mcip->mci_unicast->ma_addr, dh4->chaddr, maclen) != 0) { return (B_FALSE); } + + /* Everything after here is checking the Client Identifier */ + if (p->mp_allcids == MPT_TRUE) { + return (B_TRUE); + } + if (get_dhcpv4_option(dh4, end, CD_CLIENT_ID, &cid, &optlen) == 0) cidlen = optlen; @@ -2082,6 +2102,11 @@ dhcpnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *p, mtype == DHCPV6_MSG_RECONFIGURE) return (B_TRUE); + /* Everything after here is checking the Client Identifier */ + if (p->mp_allcids == MPT_TRUE) { + return (B_TRUE); + } + d6o = get_dhcpv6_option(&dh6[1], end - (uchar_t *)&dh6[1], NULL, DHCPV6_OPT_CLIENTID, &cidlen); if (d6o == NULL || (uchar_t *)d6o + cidlen > end) @@ -2159,7 +2184,6 @@ dhcpnospoof_check(mac_client_impl_t *mcip, mac_protect_t *protect, return (0); fail: - /* increment dhcpnospoof stat here */ freemsg(nmp); return (err); } @@ -2487,6 +2511,11 @@ mac_protect_validate(mac_resource_props_t *mrp) if ((err = validate_cids(p)) != 0) return (err); + if (p->mp_allcids != MPT_FALSE && p->mp_allcids != MPT_TRUE && + p->mp_allcids != MPT_RESET) { + return (EINVAL); + } + return (0); } @@ -2554,6 +2583,16 @@ mac_protect_update(mac_resource_props_t *new, mac_resource_props_t *curr) cp->mp_cidcnt = 0; } } + if (np->mp_allcids == MPT_RESET) { + cp->mp_allcids = MPT_FALSE; + } else if (np->mp_allcids != 0) { + cp->mp_allcids = MPT_TRUE; + } + if (np->mp_dynamic == MPT_RESET) { + cp->mp_dynamic = 0; + } else if (np->mp_dynamic != 0) { + cp->mp_dynamic = np->mp_dynamic; + } } void @@ -2597,15 +2636,50 @@ mac_protect_fini(mac_client_impl_t *mcip) } static boolean_t -allowed_ips_set(mac_resource_props_t *mrp, uint32_t af) +dynamic_method_set(mac_protect_t *mpt, uint32_t method) +{ + if (mpt->mp_dynamic != 0) { + return ((mpt->mp_dynamic & method) != 0); + } else { + return (mpt->mp_ipaddrcnt == 0); + } +} + +boolean_t +mac_protect_check_addr(mac_client_handle_t mch, boolean_t isv6, + in6_addr_t *v6addr) { - int i; + mac_perim_handle_t 
perim; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; - for (i = 0; i < mrp->mrp_protect.mp_ipaddrcnt; i++) { - if (mrp->mrp_protect.mp_ipaddrs[i].ip_version == af) - return (B_TRUE); + mac_perim_enter_by_mh(mh, &perim); + + mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); + mac_protect_t *p; + boolean_t allowed; + + ASSERT(mrp != NULL); + + p = &mrp->mrp_protect; + + /* If mac protection/ipnospoof isn't enabled, return true */ + if ((mrp->mrp_mask & MRP_PROTECT) == 0 || + (p->mp_types & MPT_IPNOSPOOF) == 0) { + allowed = B_TRUE; + goto done; } - return (B_FALSE); + + if (isv6) { + allowed = ipnospoof_check_v6(mcip, p, v6addr); + } else { + in_addr_t *v4addr = &V4_PART_OF_V6((*v6addr)); + allowed = ipnospoof_check_v4(mcip, p, v4addr); + } + +done: + mac_perim_exit(perim); + return (allowed); } mac_protect_t * diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index 07201afdec..cb1a76aef6 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -56,6 +57,7 @@ #include <sys/sdt.h> #include <sys/pattr.h> #include <sys/strsun.h> +#include <sys/vlan.h> /* * MAC Provider Interface. @@ -351,6 +353,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp) if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL)) mip->mi_state_flags |= MIS_IS_AGGR; + if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL)) + mip->mi_state_flags |= MIS_IS_OVERLAY; + mac_addr_factory_init(mip); mac_transceiver_init(mip); @@ -697,7 +702,6 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) mac_ring_t *mr = (mac_ring_t *)mrh; mac_soft_ring_set_t *mac_srs; mblk_t *bp = mp_chain; - boolean_t hw_classified = B_FALSE; /* * If there are any promiscuous mode callbacks defined for @@ -709,7 +713,7 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) if (mr != NULL) { /* * If the SRS teardown has started, just return. The 'mr' - * continues to be valid until the driver unregisters the mac. + * continues to be valid until the driver unregisters the MAC. * Hardware classified packets will not make their way up * beyond this point once the teardown has started. The driver * is never passed a pointer to a flow entry or SRS or any @@ -722,11 +726,25 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) freemsgchain(mp_chain); return; } - if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { - hw_classified = B_TRUE; + + /* + * The ring is in passthru mode; pass the chain up to + * the pseudo ring. + */ + if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) { MR_REFHOLD_LOCKED(mr); + mutex_exit(&mr->mr_lock); + mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain, + B_FALSE); + MR_REFRELE(mr); + return; } - mutex_exit(&mr->mr_lock); + + /* + * The passthru callback should only be set when in + * MAC_PASSTHRU_CLASSIFIER mode. + */ + ASSERT3P(mr->mr_pt_fn, ==, NULL); /* * We check if an SRS is controlling this ring. @@ -734,19 +752,24 @@ mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) * routine otherwise we need to go through mac_rx_classify * to reach the right place. 
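mac_protect_check_addr() is a new entry point, so a usage sketch may be helpful. The wrapper name source_addr_allowed() is hypothetical; the convention of carrying an IPv4 address in the v4 part of an in6_addr_t follows from the implementation above, and a client with no protection configured simply gets B_TRUE back:

	/*
	 * Sketch only: ask whether ipnospoof would permit a given IPv4
	 * source address for this client.
	 */
	static boolean_t
	source_addr_allowed(mac_client_handle_t mch, ipaddr_t v4src)
	{
		in6_addr_t v6;

		IN6_IPADDR_TO_V4MAPPED(v4src, &v6);
		return (mac_protect_check_addr(mch, B_FALSE, &v6));
	}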
*/ - if (hw_classified) { + if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { + MR_REFHOLD_LOCKED(mr); + mutex_exit(&mr->mr_lock); + ASSERT3P(mr->mr_srs, !=, NULL); mac_srs = mr->mr_srs; + /* - * This is supposed to be the fast path. - * All packets received though here were steered by - * the hardware classifier, and share the same - * MAC header info. + * This is the fast path. All packets received + * on this ring are hardware classified and + * share the same MAC header info. */ mac_srs->srs_rx.sr_lower_proc(mh, (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE); MR_REFRELE(mr); return; } + + mutex_exit(&mr->mr_lock); /* We'll fall through to software classification */ } else { flow_entry_t *flent; @@ -1472,7 +1495,8 @@ mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm) pr->pr_flags |= MAC_PROP_INFO_PERM; } -void mac_hcksum_get(mblk_t *mp, uint32_t *start, uint32_t *stuff, +void +mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff, uint32_t *end, uint32_t *value, uint32_t *flags_ptr) { uint32_t flags; @@ -1497,8 +1521,9 @@ void mac_hcksum_get(mblk_t *mp, uint32_t *start, uint32_t *stuff, *flags_ptr = flags; } -void mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, - uint32_t end, uint32_t value, uint32_t flags) +void +mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end, + uint32_t value, uint32_t flags) { ASSERT(DB_TYPE(mp) == M_DATA); @@ -1510,6 +1535,31 @@ void mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, } void +mac_hcksum_clone(const mblk_t *src, mblk_t *dst) +{ + ASSERT3U(DB_TYPE(src), ==, M_DATA); + ASSERT3U(DB_TYPE(dst), ==, M_DATA); + + /* + * Do these assignments unconditionally, rather than only when + * flags is non-zero. This protects a situation where zeroed + * hcksum data does not make the jump onto an mblk_t with + * stale data in those fields. It's important to copy all + * possible flags (HCK_* as well as HW_*) and not just the + * checksum specific flags. Dropping flags during a clone + * could result in dropped packets. If the caller has good + * reason to drop those flags then it should do it manually, + * after the clone. + */ + DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src); + DB_CKSUMSTART(dst) = DB_CKSUMSTART(src); + DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src); + DB_CKSUMEND(dst) = DB_CKSUMEND(src); + DB_CKSUM16(dst) = DB_CKSUM16(src); + DB_LSOMSS(dst) = DB_LSOMSS(src); +} + +void mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags) { ASSERT(DB_TYPE(mp) == M_DATA); diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index d91d07db09..e42cbd1320 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. */ @@ -300,9 +300,8 @@ * * Otherwise, all fanout is performed by software. MAC divides incoming frames * into one of three buckets -- IPv4 TCP traffic, IPv4 UDP traffic, and - * everything else. Note, VLAN tagged traffic is considered other, regardless of - * the interior EtherType. Regardless of the type of fanout, these three - * categories or buckets are always used. + * everything else. Regardless of the type of fanout, these three categories + * or buckets are always used. 
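mac_hcksum_clone() is new with this change, so a short usage sketch follows. The copymsg() pairing mirrors what mac_copymsg_cksum() does later in mac_util.c; the wrapper name dup_with_cksum() is illustrative:

	/*
	 * Sketch only: duplicate a packet and carry its offload metadata
	 * across, so a downstream consumer does not see stale or missing
	 * checksum and LSO flags on the copy.
	 */
	static mblk_t *
	dup_with_cksum(mblk_t *mp)
	{
		mblk_t *dup;

		if ((dup = copymsg(mp)) == NULL)
			return (NULL);

		mac_hcksum_clone(mp, dup);
		return (dup);
	}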
* * The difference between protocol level fanout and full software ring protocol * fanout is the number of software rings that end up getting created. The @@ -969,6 +968,7 @@ #include <sys/types.h> #include <sys/callb.h> +#include <sys/pattr.h> #include <sys/sdt.h> #include <sys/strsubr.h> #include <sys/strsun.h> @@ -1328,7 +1328,7 @@ int mac_srs_worker_wakeup_ticks = 0; * b_prev may be set to the fanout hint \ * hence can't use freemsg directly \ */ \ - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ + mac_drop_chain(mp_chain, "SRS Tx max queue"); \ DTRACE_PROBE1(tx_queued_hiwat, \ mac_soft_ring_set_t *, srs); \ enqueue = 0; \ @@ -1347,11 +1347,11 @@ int mac_srs_worker_wakeup_ticks = 0; if (!(srs->srs_type & SRST_TX)) \ mutex_exit(&srs->srs_bw->mac_bw_lock); -#define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ - mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ +#define MAC_TX_SRS_DROP_MESSAGE(srs, chain, cookie, s) { \ + mac_drop_pkt((chain), (s)); \ /* increment freed stats */ \ - mac_srs->srs_tx.st_stat.mts_sdrops++; \ - cookie = (mac_tx_cookie_t)srs; \ + (srs)->srs_tx.st_stat.mts_sdrops++; \ + (cookie) = (mac_tx_cookie_t)(srs); \ } #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ @@ -1367,11 +1367,11 @@ int mac_srs_worker_wakeup_ticks = 0; * can occur in situ (in the interrupt thread) or if it should be left to a * worker thread. Note that the constant used to make this determination is * not entirely made-up, and is a result of some emprical validation. That - * said, the constant is left as a static variable to allow it to be + * said, the constant is left as a global variable to allow it to be * dynamically tuned in the field if and as needed. */ -static uintptr_t mac_rx_srs_stack_needed = 10240; -static uint_t mac_rx_srs_stack_toodeep; +uintptr_t mac_rx_srs_stack_needed = 14336; +uint_t mac_rx_srs_stack_toodeep; #ifndef STACK_GROWTH_DOWN #error Downward stack growth assumed. @@ -1379,7 +1379,7 @@ static uint_t mac_rx_srs_stack_toodeep; #define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ (uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \ - ++mac_rx_srs_stack_toodeep) + (++mac_rx_srs_stack_toodeep || (mac_rx_srs_stack_toodeep = 1))) /* @@ -1475,16 +1475,15 @@ enum pkt_type { #define PORTS_SIZE 4 /* - * mac_rx_srs_proto_fanout - * - * This routine delivers packets destined to an SRS into one of the + * This routine delivers packets destined for an SRS into one of the * protocol soft rings. * - * Given a chain of packets we need to split it up into multiple sub chains - * destined into TCP, UDP or OTH soft ring. Instead of entering - * the soft ring one packet at a time, we want to enter it in the form of a - * chain otherwise we get this start/stop behaviour where the worker thread - * goes to sleep and then next packets comes in forcing it to wake up etc. + * Given a chain of packets we need to split it up into multiple sub + * chains: TCP, UDP or OTH soft ring. Instead of entering the soft + * ring one packet at a time, we want to enter it in the form of a + * chain otherwise we get this start/stop behaviour where the worker + * thread goes to sleep and then next packet comes in forcing it to + * wake up. */ static void mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) @@ -1523,9 +1522,9 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; /* - * Special clients (eg. VLAN, non ether, etc) need DLS - * processing in the Rx path. 
SRST_DLS_BYPASS will be clear for - * such SRSs. Another way of disabling bypass is to set the + * Some clients, such as non-ethernet, need DLS processing in + * the Rx path. Such clients clear the SRST_DLS_BYPASS flag. + * DLS bypass may also be disabled via the * MCIS_RX_BYPASS_DISABLE flag. */ dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && @@ -1537,10 +1536,11 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) bzero(sz, MAX_SR_TYPES * sizeof (size_t)); /* - * We got a chain from SRS that we need to send to the soft rings. - * Since squeues for TCP & IPv4 sap poll their soft rings (for - * performance reasons), we need to separate out v4_tcp, v4_udp - * and the rest goes in other. + * We have a chain from SRS that we need to split across the + * soft rings. The squeues for the TCP and IPv4 SAPs use their + * own soft rings to allow polling from the squeue. The rest of + * the packets are delivered on the OTH soft ring which cannot + * be polled. */ while (head != NULL) { mp = head; @@ -1568,9 +1568,14 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) evhp = (struct ether_vlan_header *)mp->b_rptr; sap = ntohs(evhp->ether_type); hdrsize = sizeof (struct ether_vlan_header); + /* - * Check if the VID of the packet, if any, - * belongs to this client. + * Check if the VID of the packet, if + * any, belongs to this client. + * Technically, if this packet came up + * via a HW classified ring then we + * don't need to perform this check. + * Perhaps a future optimization. */ if (!mac_client_check_flow_vid(mcip, VLAN_ID(ntohs(evhp->ether_tci)))) { @@ -1635,7 +1640,6 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) * performance and may bypass DLS. All other cases go through * the 'OTH' type path without DLS bypass. */ - ipha = (ipha_t *)(mp->b_rptr + hdrsize); if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) type = OTH; @@ -1647,11 +1651,13 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) } ASSERT(type == UNDEF); + /* - * We look for at least 4 bytes past the IP header to get - * the port information. If we get an IP fragment, we don't - * have the port information, and we use just the protocol - * information. + * Determine the type from the IP protocol value. If + * classified as TCP or UDP, then update the read + * pointer to the beginning of the IP header. + * Otherwise leave the message as is for further + * processing by DLS. */ switch (ipha->ipha_protocol) { case IPPROTO_TCP: @@ -1695,11 +1701,10 @@ mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) int fanout_unaligned = 0; /* - * mac_rx_srs_long_fanout - * - * The fanout routine for VLANs, and for anything else that isn't performing - * explicit dls bypass. Returns -1 on an error (drop the packet due to a - * malformed packet), 0 on success, with values written in *indx and *type. + * The fanout routine for any clients with DLS bypass disabled or for + * traffic classified as "other". Returns -1 on an error (drop the + * packet due to a malformed packet), 0 on success, with values + * written in *indx and *type. */ static int mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, @@ -1865,16 +1870,15 @@ src_dst_based_fanout: } /* - * mac_rx_srs_fanout - * - * This routine delivers packets destined to an SRS into a soft ring member + * This routine delivers packets destined for an SRS into a soft ring member * of the set. 
* - * Given a chain of packets we need to split it up into multiple sub chains - * destined for one of the TCP, UDP or OTH soft rings. Instead of entering - * the soft ring one packet at a time, we want to enter it in the form of a - * chain otherwise we get this start/stop behaviour where the worker thread - * goes to sleep and then next packets comes in forcing it to wake up etc. + * Given a chain of packets we need to split it up into multiple sub + * chains: TCP, UDP or OTH soft ring. Instead of entering the soft + * ring one packet at a time, we want to enter it in the form of a + * chain otherwise we get this start/stop behaviour where the worker + * thread goes to sleep and then next packet comes in forcing it to + * wake up. * * Note: * Since we know what is the maximum fanout possible, we create a 2D array @@ -1935,10 +1939,11 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; /* - * Special clients (eg. VLAN, non ether, etc) need DLS - * processing in the Rx path. SRST_DLS_BYPASS will be clear for - * such SRSs. Another way of disabling bypass is to set the - * MCIS_RX_BYPASS_DISABLE flag. + * Some clients, such as non Ethernet, need DLS processing in + * the Rx path. Such clients clear the SRST_DLS_BYPASS flag. + * DLS bypass may also be disabled via the + * MCIS_RX_BYPASS_DISABLE flag, but this is only consumed by + * sun4v vsw currently. */ dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); @@ -1960,7 +1965,7 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) /* * We got a chain from SRS that we need to send to the soft rings. - * Since squeues for TCP & IPv4 sap poll their soft rings (for + * Since squeues for TCP & IPv4 SAP poll their soft rings (for * performance reasons), we need to separate out v4_tcp, v4_udp * and the rest goes in other. */ @@ -1990,9 +1995,14 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) evhp = (struct ether_vlan_header *)mp->b_rptr; sap = ntohs(evhp->ether_type); hdrsize = sizeof (struct ether_vlan_header); + /* - * Check if the VID of the packet, if any, - * belongs to this client. + * Check if the VID of the packet, if + * any, belongs to this client. + * Technically, if this packet came up + * via a HW classified ring then we + * don't need to perform this check. + * Perhaps a future optimization. */ if (!mac_client_check_flow_vid(mcip, VLAN_ID(ntohs(evhp->ether_tci)))) { @@ -2032,7 +2042,6 @@ mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) continue; } - /* * If we are using the default Rx ring where H/W or S/W * classification has not happened, we need to verify if @@ -2621,7 +2630,6 @@ again: mac_srs->srs_state |= (SRS_PROC|proc_type); - /* * mcip is NULL for broadcast and multicast flows. The promisc * callbacks for broadcast and multicast packets are delivered from @@ -2641,10 +2649,8 @@ again: } /* - * Check if SRS itself is doing the processing - * This direct path does not apply when subflows are present. In this - * case, packets need to be dispatched to a soft ring according to the - * flow's bandwidth and other resources contraints. + * Check if SRS itself is doing the processing. This direct + * path applies only when subflows are present. 
*/ if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { mac_direct_rx_t proc; @@ -2888,7 +2894,7 @@ again: mac_srs->srs_bw->mac_bw_sz -= sz; mac_srs->srs_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_srs->srs_bw->mac_bw_lock); - mac_pkt_drop(NULL, NULL, head, B_FALSE); + mac_drop_chain(head, "Rx no bandwidth"); goto leave_poll; } else { mutex_exit(&mac_srs->srs_bw->mac_bw_lock); @@ -3270,9 +3276,10 @@ mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs, } /* - * mac_rx_srs_process - * - * Receive side routine called from the interrupt path. + * MAC SRS receive side routine. If the data is coming from the + * network (i.e. from a NIC) then this is called in interrupt context. + * If the data is coming from a local sender (e.g. mac_tx_send() or + * bridge_forward()) then this is not called in interrupt context. * * loopback is set to force a context switch on the loopback * path between MAC clients. @@ -3332,7 +3339,7 @@ mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain, mac_bw->mac_bw_drop_bytes += sz; mutex_exit(&mac_bw->mac_bw_lock); mutex_exit(&mac_srs->srs_lock); - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Rx no bandwidth"); return; } else { if ((mac_bw->mac_bw_sz + sz) <= @@ -3454,7 +3461,8 @@ mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); if (flag & MAC_DROP_ON_NO_DESC) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx no desc"); } else { if (mac_srs->srs_first != NULL) wakeup_worker = B_FALSE; @@ -3517,7 +3525,8 @@ mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx SRS hiwat"); } else { MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, cnt, sz); @@ -3890,7 +3899,8 @@ mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, cookie = (mac_tx_cookie_t)mac_srs; *ret_mp = mp_chain; } else { - MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); + MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie, + "Tx no bandwidth"); } mutex_exit(&mac_srs->srs_lock); return (cookie); @@ -4336,6 +4346,14 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, obytes += (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); + /* + * Mark all packets as local so that a + * receiver can determine if a packet arrived + * from a local source or from the network. + * This allows some consumers to avoid + * unecessary work like checksum computation. + */ + DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC; CHECK_VID_AND_ADD_TAG(mp); MAC_TX(mip, ring, mp, src_mcip); @@ -4368,7 +4386,6 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, flow_entry_t *dst_flow_ent; void *flow_cookie; size_t pkt_size; - mblk_t *mp1; next = mp->b_next; mp->b_next = NULL; @@ -4378,49 +4395,25 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, CHECK_VID_AND_ADD_TAG(mp); /* + * Mark all packets as local so that a receiver can + * determine if a packet arrived from a local source + * or from the network. This allows some consumers to + * avoid unecessary work like checksum computation. + */ + DB_CKSUMFLAGS(mp) |= HW_LOCAL_MAC; + + /* * Find the destination. 
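The HW_LOCAL_MAC marking above has a consumer side that is easy to get wrong: mac_hcksum_get() masks the flag out (as the mac_util.c changes below note), so an interested receiver must read the dblk flags directly. A hedged sketch; the surrounding condition and its placement are illustrative:

	/*
	 * Sketch only: a receiver that wants to skip checksum
	 * verification for traffic that never left the machine tests the
	 * dblk flags directly rather than via mac_hcksum_get().
	 */
	if ((DB_CKSUMFLAGS(mp) & HW_LOCAL_MAC) != 0) {
		/* locally sourced frame; verification can be elided */
	}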
*/ dst_flow_ent = mac_tx_classify(mip, mp); if (dst_flow_ent != NULL) { - size_t hdrsize; - int err = 0; - - if (mip->mi_info.mi_nativemedia == DL_ETHER) { - struct ether_vlan_header *evhp = - (struct ether_vlan_header *)mp->b_rptr; - - if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) - hdrsize = sizeof (*evhp); - else - hdrsize = sizeof (struct ether_header); - } else { - mac_header_info_t mhi; - - err = mac_header_info((mac_handle_t)mip, - mp, &mhi); - if (err == 0) - hdrsize = mhi.mhi_hdrsize; - } - /* * Got a matching flow. It's either another * MAC client, or a broadcast/multicast flow. - * Make sure the packet size is within the - * allowed size. If not drop the packet and - * move to next packet. */ - if (err != 0 || - (pkt_size - hdrsize) > mip->mi_sdu_max) { - oerrors++; - DTRACE_PROBE2(loopback__drop, size_t, pkt_size, - mblk_t *, mp); - freemsg(mp); - mp = next; - FLOW_REFRELE(dst_flow_ent); - continue; - } flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); + if (flow_cookie != NULL) { /* * The vnic_bcast_send function expects @@ -4438,6 +4431,7 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * bypass is set. */ boolean_t do_switch; + mac_client_impl_t *dst_mcip = dst_flow_ent->fe_mcip; @@ -4453,19 +4447,19 @@ mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, * check is done inside the MAC_TX() * macro. */ - if (mip->mi_promisc_list != NULL) + if (mip->mi_promisc_list != NULL) { mac_promisc_dispatch(mip, mp, src_mcip); + } do_switch = ((src_mcip->mci_state_flags & dst_mcip->mci_state_flags & MCIS_CLIENT_POLL_CAPABLE) != 0); - if ((mp1 = mac_fix_cksum(mp)) != NULL) { - (dst_flow_ent->fe_cb_fn)( - dst_flow_ent->fe_cb_arg1, - dst_flow_ent->fe_cb_arg2, - mp1, do_switch); - } + (dst_flow_ent->fe_cb_fn)( + dst_flow_ent->fe_cb_arg1, + dst_flow_ent->fe_cb_arg2, + mp, do_switch); + } FLOW_REFRELE(dst_flow_ent); } else { @@ -4656,6 +4650,9 @@ mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, * the packet to the promiscuous listeners of the * client, since they expect to see the whole * frame including the VLAN headers. + * + * The MCIS_STRIP_DISABLE is only issued when sun4v + * vsw is in play. */ mp_chain = mac_strip_vlan_tag_chain(mp_chain); } @@ -4664,13 +4661,11 @@ mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, } /* - * mac_rx_soft_ring_process - * - * process a chain for a given soft ring. The number of packets queued - * in the SRS and its associated soft rings (including this one) is - * very small (tracked by srs_poll_pkt_cnt), then allow the entering - * thread (interrupt or poll thread) to do inline processing. This - * helps keep the latency down under low load. + * Process a chain for a given soft ring. If the number of packets + * queued in the SRS and its associated soft rings (including this + * one) is very small (tracked by srs_poll_pkt_cnt) then allow the + * entering thread (interrupt or poll thread) to process the chain + * inline. This is meant to reduce latency under low load. * * The proc and arg for each mblk is already stored in the mblk in * appropriate places. @@ -4729,13 +4724,13 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); /* - * If we have a soft ring set which is doing - * bandwidth control, we need to decrement - * srs_size and count so it the SRS can have a - * accurate idea of what is the real data - * queued between SRS and its soft rings. 
We - * decrement the counters only when the packet - * gets processed by both SRS and the soft ring. + * If we have an SRS performing bandwidth + * control then we need to decrement the size + * and count so the SRS has an accurate count + * of the data queued between the SRS and its + * soft rings. We decrement the counters only + * when the packet is processed by both the + * SRS and the soft ring. */ mutex_enter(&mac_srs->srs_lock); MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); @@ -4751,8 +4746,8 @@ mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, if ((ringp->s_ring_first == NULL) || (ringp->s_ring_state & S_RING_BLANK)) { /* - * We processed inline our packet and - * nothing new has arrived or our + * We processed a single packet inline + * and nothing new has arrived or our * receiver doesn't want to receive * any packets. We are done. */ @@ -4821,7 +4816,7 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); if (flag & MAC_DROP_ON_NO_DESC) { - mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); + mac_drop_chain(mp_chain, "Tx softring no desc"); /* increment freed stats */ ringp->s_ring_drops += cnt; cookie = (mac_tx_cookie_t)ringp; @@ -4865,8 +4860,8 @@ mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, * b_prev may be set to the fanout hint * hence can't use freemsg directly */ - mac_pkt_drop(NULL, NULL, - mp_chain, B_FALSE); + mac_drop_chain(mp_chain, + "Tx softring max queue"); DTRACE_PROBE1(tx_queued_hiwat, mac_soft_ring_t *, ringp); enqueue = B_FALSE; diff --git a/usr/src/uts/common/io/mac/mac_soft_ring.c b/usr/src/uts/common/io/mac/mac_soft_ring.c index dc8cfdd145..4655631dc1 100644 --- a/usr/src/uts/common/io/mac/mac_soft_ring.c +++ b/usr/src/uts/common/io/mac/mac_soft_ring.c @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -207,7 +207,7 @@ mac_soft_ring_create(int id, clock_t wait, uint16_t type, ringp->s_ring_tx_hiwat = (mac_tx_soft_ring_hiwat > mac_tx_soft_ring_max_q_cnt) ? mac_tx_soft_ring_max_q_cnt : mac_tx_soft_ring_hiwat; - if (mcip->mci_state_flags & MCIS_IS_AGGR) { + if (mcip->mci_state_flags & MCIS_IS_AGGR_CLIENT) { mac_srs_tx_t *tx = &mac_srs->srs_tx; ASSERT(tx->st_soft_rings[ @@ -242,7 +242,7 @@ mac_soft_ring_free(mac_soft_ring_t *softring) ASSERT((softring->s_ring_state & (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) == (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE)); - mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE); + mac_drop_chain(softring->s_ring_first, "softring free"); softring->s_ring_tx_arg2 = NULL; mac_soft_ring_stat_delete(softring); mac_callback_free(softring->s_ring_notify_cb_list); @@ -339,15 +339,14 @@ mac_soft_ring_fire(void *arg) } /* - * mac_rx_soft_ring_drain + * Drain the soft ring pointed to by ringp. * - * Called when worker thread model (ST_RING_WORKER_ONLY) of processing - * incoming packets is used. s_ring_first contain the queued packets. - * s_ring_rx_func contains the upper level (client) routine where the - * packets are destined and s_ring_rx_arg1/s_ring_rx_arg2 are the - * cookie meant for the client. + * o s_ring_first: pointer to the queued packet chain. + * + * o s_ring_rx_func: pointer to to the client's Rx routine. + * + * o s_ring_rx_{arg1,arg2}: opaque values specific to the client. 
*/ -/* ARGSUSED */ static void mac_rx_soft_ring_drain(mac_soft_ring_t *ringp) { @@ -392,13 +391,12 @@ mac_rx_soft_ring_drain(mac_soft_ring_t *ringp) (*proc)(arg1, arg2, mp, NULL); /* - * If we have a soft ring set which is doing - * bandwidth control, we need to decrement its - * srs_size so it can have a accurate idea of - * what is the real data queued between SRS and - * its soft rings. We decrement the size for a - * packet only when it gets processed by both - * SRS and the soft ring. + * If we have an SRS performing bandwidth control, then + * we need to decrement the size and count so the SRS + * has an accurate measure of the data queued between + * the SRS and its soft rings. We decrement the + * counters only when the packet is processed by both + * the SRS and the soft ring. */ mutex_enter(&mac_srs->srs_lock); MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); @@ -414,12 +412,10 @@ mac_rx_soft_ring_drain(mac_soft_ring_t *ringp) } /* - * mac_soft_ring_worker - * * The soft ring worker routine to process any queued packets. In - * normal case, the worker thread is bound to a CPU. It the soft - * ring is dealing with TCP packets, then the worker thread will - * be bound to the same CPU as the TCP squeue. + * normal case, the worker thread is bound to a CPU. If the soft ring + * handles TCP packets then the worker thread is bound to the same CPU + * as the TCP squeue. */ static void mac_soft_ring_worker(mac_soft_ring_t *ringp) @@ -604,7 +600,7 @@ mac_soft_ring_dls_bypass(void *arg, mac_direct_rx_t rx_func, void *rx_arg1) mac_soft_ring_t *softring = arg; mac_soft_ring_set_t *srs; - ASSERT(rx_func != NULL); + VERIFY3P(rx_func, !=, NULL); mutex_enter(&softring->s_ring_lock); softring->s_ring_rx_func = rx_func; diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c index 31972f94d8..2244218f20 100644 --- a/usr/src/uts/common/io/mac/mac_stat.c +++ b/usr/src/uts/common/io/mac/mac_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
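The VERIFY3P() above is one instance of a pattern applied throughout this patch (the SRS count checks in mac_datapath_setup.c are another). A small illustration of the difference, reusing a check from earlier in the diff; no new interfaces are assumed:

	/*
	 * ASSERT() compiles away on non-DEBUG kernels, while the
	 * VERIFY3*() forms are always evaluated and print both operands
	 * on failure, which is why several datapath invariants are
	 * upgraded here.
	 */
	ASSERT(flent->fe_rx_srs_cnt == 1);	/* DEBUG builds only */
	VERIFY3S(flent->fe_rx_srs_cnt, ==, 1);	/* always enforced */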
*/ /* @@ -390,8 +391,8 @@ i_mac_stat_create(void *handle, const char *modname, const char *statname, kstat_t *ksp; kstat_named_t *knp; - ksp = kstat_create(modname, 0, statname, "net", - KSTAT_TYPE_NAMED, count, 0); + ksp = kstat_create_zone(modname, 0, statname, "net", + KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return (NULL); @@ -948,9 +949,9 @@ mac_driver_stat_create(mac_impl_t *mip) major_t major = getmajor(mip->mi_phy_dev); count = MAC_MOD_NKSTAT + MAC_NKSTAT + mip->mi_type->mt_statcount; - ksp = kstat_create((const char *)ddi_major_to_name(major), + ksp = kstat_create_zone((const char *)ddi_major_to_name(major), getminor(mip->mi_phy_dev) - 1, MAC_KSTAT_NAME, - MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0); + MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return; @@ -1003,6 +1004,7 @@ void mac_ring_stat_create(mac_ring_t *ring) { mac_impl_t *mip = ring->mr_mip; + mac_group_t *grp = (mac_group_t *)ring->mr_gh; char statname[MAXNAMELEN]; char modname[MAXNAMELEN]; @@ -1014,8 +1016,8 @@ mac_ring_stat_create(mac_ring_t *ring) switch (ring->mr_type) { case MAC_RING_TYPE_RX: - (void) snprintf(statname, sizeof (statname), "mac_rx_ring%d", - ring->mr_index); + (void) snprintf(statname, sizeof (statname), + "mac_rx_ring_%d_%d", grp->mrg_index, ring->mr_index); i_mac_rx_ring_stat_create(ring, modname, statname); break; diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index e83af37f16..111b323f81 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. */ /* @@ -47,6 +48,74 @@ #include <inet/sadb.h> #include <inet/ipsecesp.h> #include <inet/ipsecah.h> +#include <inet/tcp.h> +#include <inet/udp_impl.h> + +/* + * The next two functions are used for dropping packets or chains of + * packets, respectively. We could use one function for both but + * separating the use cases allows us to specify intent and prevent + * dropping more data than intended. + * + * The purpose of these functions is to aid the debugging effort, + * especially in production. Rather than use freemsg()/freemsgchain(), + * it's preferable to use these functions when dropping a packet in + * the MAC layer. These functions should only be used during + * unexpected conditions. That is, any time a packet is dropped + * outside of the regular, successful datapath. Consolidating all + * drops on these functions allows the user to trace one location and + * determine why the packet was dropped based on the msg. It also + * allows the user to inspect the packet before it is freed. Finally, + * it allows the user to avoid tracing freemsg()/freemsgchain() thus + * keeping the hot path running as efficiently as possible. + * + * NOTE: At this time not all MAC drops are aggregated on these + * functions; but that is the plan. This comment should be erased once + * completed. + */ + +/*PRINTFLIKE2*/ +void +mac_drop_pkt(mblk_t *mp, const char *fmt, ...) +{ + va_list adx; + char msg[128]; + char *msgp = msg; + + ASSERT3P(mp->b_next, ==, NULL); + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); +} + +/*PRINTFLIKE2*/ +void +mac_drop_chain(mblk_t *chain, const char *fmt, ...) 
+{ + va_list adx; + char msg[128]; + char *msgp = msg; + + va_start(adx, fmt); + (void) vsnprintf(msgp, sizeof (msg), fmt, adx); + va_end(adx); + + /* + * We could use freemsgchain() for the actual freeing but + * since we are already walking the chain to fire the dtrace + * probe we might as well free the msg here too. + */ + for (mblk_t *mp = chain, *next; mp != NULL; ) { + next = mp->b_next; + DTRACE_PROBE2(mac__drop, mblk_t *, mp, char *, msgp); + freemsg(mp); + mp = next; + } +} /* * Copy an mblk, preserving its hardware checksum flags. @@ -55,15 +124,12 @@ static mblk_t * mac_copymsg_cksum(mblk_t *mp) { mblk_t *mp1; - uint32_t start, stuff, end, value, flags; mp1 = copymsg(mp); if (mp1 == NULL) return (NULL); - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); - (void) hcksum_assoc(mp1, NULL, NULL, start, stuff, end, value, - flags, KM_NOSLEEP); + mac_hcksum_clone(mp, mp1); return (mp1); } @@ -91,224 +157,1125 @@ mac_copymsgchain_cksum(mblk_t *mp) } /* - * Process the specified mblk chain for proper handling of hardware - * checksum offload. This routine is invoked for loopback traffic - * between MAC clients. - * The function handles a NULL mblk chain passed as argument. + * Perform software checksum on a single message, if needed. The + * emulation performed is determined by an intersection of the mblk's + * flags and the emul flags requested. The emul flags are documented + * in mac.h. */ -mblk_t * -mac_fix_cksum(mblk_t *mp_chain) +static mblk_t * +mac_sw_cksum(mblk_t *mp, mac_emul_t emul) { - mblk_t *mp, *prev = NULL, *new_chain = mp_chain, *mp1; + mblk_t *skipped_hdr = NULL; uint32_t flags, start, stuff, end, value; + uint16_t len; + uint32_t offset; + uint16_t etype; + struct ether_header *ehp; + ipha_t *ipha; + uint8_t proto; + const char *err = ""; - for (mp = mp_chain; mp != NULL; prev = mp, mp = mp->b_next) { - uint16_t len; - uint32_t offset; - struct ether_header *ehp; - uint16_t sap; + /* + * This function should only be called from mac_hw_emul() + * which handles mblk chains and the shared ref case. + */ + ASSERT3P(mp->b_next, ==, NULL); - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, - &flags); - if (flags == 0) - continue; + mac_hcksum_get(mp, &start, &stuff, &end, &value, NULL); + + /* + * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) because + * we don't want to mask-out the HW_LOCAL_MAC flag. + */ + flags = DB_CKSUMFLAGS(mp); + + /* Why call this if checksum emulation isn't needed? */ + ASSERT3U(flags & (HCK_FLAGS), !=, 0); + + /* + * Ethernet, and optionally VLAN header. mac_hw_emul() has + * already verified we have enough data to read the L2 header. + */ + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) { + struct ether_vlan_header *evhp; + + evhp = (struct ether_vlan_header *)mp->b_rptr; + etype = ntohs(evhp->ether_type); + offset = sizeof (struct ether_vlan_header); + } else { + etype = ntohs(ehp->ether_type); + offset = sizeof (struct ether_header); + } + + /* + * If this packet isn't IPv4, then leave it alone. We still + * need to add IPv6 support and we don't want to affect non-IP + * traffic like ARP. + */ + if (etype != ETHERTYPE_IP) + return (mp); + + ASSERT3U(MBLKL(mp), >=, offset); + + /* + * If the first mblk of this packet contains only the ethernet + * header, skip past it for now. Packets with their data + * contained in only a single mblk can then use the fastpaths + * tuned to that possibility. 
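+ *
+ * For example (an illustrative mblk layout, not a new
+ * requirement): a packet arranged as
+ *
+ *	[ ether header ] -> [ IP + TCP + payload ]
+ *
+ * has MBLKL(mp) == offset, so we remember the header mblk in
+ * skipped_hdr and continue on b_cont; a packet whose first
+ * mblk already holds ether + IP + TCP needs no adjustment.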
+ */ + if (MBLKL(mp) == offset) { + offset -= MBLKL(mp); + /* This is guaranteed by mac_hw_emul(). */ + ASSERT3P(mp->b_cont, !=, NULL); + skipped_hdr = mp; + mp = mp->b_cont; + } + + /* + * Both full and partial checksum rely on finding the IP + * header in the current mblk. Our native TCP stack honors + * this assumption but it's prudent to guard our future + * clients that might not honor this contract. + */ + ASSERT3U(MBLKL(mp), >=, offset + sizeof (ipha_t)); + if (MBLKL(mp) < (offset + sizeof (ipha_t))) { + err = "mblk doesn't contain IP header"; + goto bail; + } + + /* + * We are about to modify the header mblk; make sure we are + * modifying our own copy. The code that follows assumes that + * the IP/ULP headers exist in this mblk (and drops the + * message if they don't). + */ + if (DB_REF(mp) > 1) { + mblk_t *tmp = copyb(mp); + + if (tmp == NULL) { + err = "copyb failed"; + goto bail; + } + + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + skipped_hdr->b_cont = tmp; + } + + tmp->b_cont = mp->b_cont; + freeb(mp); + mp = tmp; + } + + ipha = (ipha_t *)(mp->b_rptr + offset); + + /* + * This code assumes a "simple" IP header (20 bytes, no + * options). IPv4 options are mostly a historic artifact. The + * one slight exception is Router Alert, but we don't expect + * such a packet to land here. + */ + proto = ipha->ipha_protocol; + ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); + if (ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION) { + err = "not simple IP header"; + goto bail; + } + + switch (proto) { + case IPPROTO_TCP: + ASSERT3U(MBLKL(mp), >=, + (offset + sizeof (ipha_t) + sizeof (tcph_t))); + if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (tcph_t))) { + err = "mblk doesn't contain TCP header"; + goto bail; + } + break; + + case IPPROTO_UDP: + ASSERT3U(MBLKL(mp), >=, + (offset + sizeof (ipha_t) + sizeof (udpha_t))); + if (MBLKL(mp) < (offset + sizeof (ipha_t) + sizeof (udpha_t))) { + err = "mblk doesn't contain UDP header"; + goto bail; + } + break; + + default: + err = "unexpected protocol"; + goto bail; + } + + if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + ipaddr_t src, dst; + uint32_t cksum; + uint16_t *up; + + /* Get a pointer to the ULP checksum. */ + switch (proto) { + case IPPROTO_TCP: + /* LINTED: improper alignment cast */ + up = IPH_TCPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + break; + + case IPPROTO_UDP: + /* LINTED: improper alignment cast */ + up = IPH_UDPH_CHECKSUMP(ipha, + IP_SIMPLE_HDR_LENGTH); + break; + } + + /* Pseudo-header checksum. */ + src = ipha->ipha_src; + dst = ipha->ipha_dst; + len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH; + + cksum = (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum += htons(len); + + /* + * The checksum value stored in the packet + * needs to be correct. Compute it here. + */ + *up = 0; + cksum += (((proto) == IPPROTO_UDP) ? + IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); + cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + + offset, cksum); + *(up) = (uint16_t)(cksum ? cksum : ~cksum); + + } + + /* We always update the ULP checksum flags. */ + if ((flags & HCK_FULLCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_FULLCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } /* - * Since the processing of checksum offload for loopback - * traffic requires modification of the packet contents, - * ensure sure that we are always modifying our own copy. 
+ * Out of paranoia, and for the sake of correctness, + * we won't calulate the IP header checksum if it's + * already populated. While unlikely, it's possible to + * write code that might end up calling mac_sw_cksum() + * twice on the same mblk (performing both LSO and + * checksum emualtion in a single mblk chain loop -- + * the LSO emulation inserts a new chain into the + * existing chain and then the loop iterates back over + * the new segments and emulates the checksum a second + * time). Normally this wouldn't be a problem, because + * the HCK_*_OK flags are supposed to indicate that we + * don't need to do peform the work. But + * HCK_IPV4_HDRCKSUM and HCK_IPV4_HDRCKSUM_OK have the + * same value; so we cannot use these flags to + * determine if the IP header checksum has already + * been calculated or not. Luckily, if IP requests + * HCK_IPV4_HDRCKSUM, then the IP header checksum will + * be zero. So this test works just as well as + * checking the flag. However, in the future, we + * should fix the HCK_* flags. */ - if (DB_REF(mp) > 1) { - mp1 = copymsg(mp); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; + if ((flags & HCK_IPV4_HDRCKSUM) && (emul & MAC_HWCKSUM_EMULS) && + ipha->ipha_hdr_checksum == 0) { + ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); + flags &= ~HCK_IPV4_HDRCKSUM; + flags |= HCK_IPV4_HDRCKSUM_OK; } + } + + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMUL)) { + uint16_t *up, partial, cksum; + uchar_t *ipp; /* ptr to beginning of IP header */ + + ipp = mp->b_rptr + offset; + /* LINTED: cast may result in improper alignment */ + up = (uint16_t *)((uchar_t *)ipp + stuff); + partial = *up; + *up = 0; + + ASSERT3S(end, >, start); + cksum = ~IP_CSUM_PARTIAL(mp, offset + start, partial); + *up = cksum != 0 ? cksum : ~cksum; + } + + /* We always update the ULP checksum flags. */ + if ((flags & HCK_PARTIALCKSUM) && (emul & MAC_HWCKSUM_EMULS)) { + flags &= ~HCK_PARTIALCKSUM; + flags |= HCK_FULLCKSUM_OK; + value = 0; + } + + mac_hcksum_set(mp, start, stuff, end, value, flags); + + /* Don't forget to reattach the header. */ + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); /* - * Ethernet, and optionally VLAN header. + * Duplicate the HCKSUM data into the header mblk. + * This mimics mac_add_vlan_tag which ensures that + * both the first mblk _and_ the first data bearing + * mblk possess the HCKSUM information. Consumers like + * IP will end up discarding the ether_header mblk, so + * for now, it is important that the data be available + * in both places. */ - /* LINTED: improper alignment cast */ - ehp = (struct ether_header *)mp->b_rptr; - if (ntohs(ehp->ether_type) == VLAN_TPID) { - struct ether_vlan_header *evhp; + mac_hcksum_clone(mp, skipped_hdr); + mp = skipped_hdr; + } - ASSERT(MBLKL(mp) >= sizeof (struct ether_vlan_header)); - /* LINTED: improper alignment cast */ - evhp = (struct ether_vlan_header *)mp->b_rptr; - sap = ntohs(evhp->ether_type); - offset = sizeof (struct ether_vlan_header); + return (mp); + +bail: + if (skipped_hdr != NULL) { + ASSERT3P(skipped_hdr->b_cont, ==, mp); + mp = skipped_hdr; + } + + mac_drop_pkt(mp, err); + return (NULL); +} + +/* + * Build a single data segment from an LSO packet. The mblk chain + * returned, seg_head, represents the data segment and is always + * exactly seg_len bytes long. 
The lso_mp and offset input/output + * parameters track our position in the LSO packet. This function + * exists solely as a helper to mac_sw_lso(). + * + * Case A + * + * The current lso_mp is larger than the requested seg_len. The + * beginning of seg_head may start at the beginning of lso_mp or + * offset into it. In either case, a single mblk is returned, and + * *offset is updated to reflect our new position in the current + * lso_mp. + * + * +----------------------------+ + * | in *lso_mp / out *lso_mp | + * +----------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = 0 out *offset = seg_len + * + * |------ seg_len ----| + * + * + * +------------------------------+ + * | in *lso_mp / out *lso_mp | + * +------------------------------+ + * ^ ^ + * | | + * | | + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ ^ + * | | + * in *offset = N out *offset = N + seg_len + * + * |------ seg_len ----| + * + * + * + * Case B + * + * The requested seg_len consumes exactly the rest of the lso_mp. + * I.e., the seg_head's b_wptr is equivalent to lso_mp's b_wptr. + * The seg_head may start at the beginning of the lso_mp or at some + * offset into it. In either case we return a single mblk, reset + * *offset to zero, and walk to the next lso_mp. + * + * +------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = 0 + * + * |------ seg_len ----| + * + * + * + * +----------------------------+ +------------------------+ + * | in *lso_mp |---------->| out *lso_mp | + * +----------------------------+ +------------------------+ + * ^ ^ ^ + * | | | + * | | out *offset = 0 + * | | + * +------------------------+ + * | seg_head | + * +------------------------+ + * ^ + * | + * in *offset = N + * + * |------ seg_len ----| + * + * + * Case C + * + * The requested seg_len is greater than the current lso_mp. In + * this case we must consume LSO mblks until we have enough data to + * satisfy either case (A) or (B) above. We will return multiple + * mblks linked via b_cont, offset will be set based on the cases + * above, and lso_mp will walk forward at least one mblk, but maybe + * more. + * + * N.B. This digram is not exhaustive. The seg_head may start on + * the beginning of an lso_mp. The seg_tail may end exactly on the + * boundary of an lso_mp. And there may be two (in this case the + * middle block wouldn't exist), three, or more mblks in the + * seg_head chain. This is meant as one example of what might + * happen. The main thing to remember is that the seg_tail mblk + * must be one of case (A) or (B) above. 
+ * + * +------------------+ +----------------+ +------------------+ + * | in *lso_mp |--->| *lso_mp |--->| out *lso_mp | + * +------------------+ +----------------+ +------------------+ + * ^ ^ ^ ^ ^ ^ + * | | | | | | + * | | | | | | + * | | | | | | + * | | | | | | + * +------------+ +----------------+ +------------+ + * | seg_head |--->| |--->| seg_tail | + * +------------+ +----------------+ +------------+ + * ^ ^ + * | | + * in *offset = N out *offset = MBLKL(seg_tail) + * + * |------------------- seg_len -------------------| + * + */ +static mblk_t * +build_data_seg(mblk_t **lso_mp, uint32_t *offset, uint32_t seg_len) +{ + mblk_t *seg_head, *seg_tail, *seg_mp; + + ASSERT3P(*lso_mp, !=, NULL); + ASSERT3U((*lso_mp)->b_rptr + *offset, <, (*lso_mp)->b_wptr); + + seg_mp = dupb(*lso_mp); + if (seg_mp == NULL) + return (NULL); + + seg_head = seg_mp; + seg_tail = seg_mp; + + /* Continue where we left off from in the lso_mp. */ + seg_mp->b_rptr += *offset; + +last_mblk: + /* Case (A) */ + if ((seg_mp->b_rptr + seg_len) < seg_mp->b_wptr) { + *offset += seg_len; + seg_mp->b_wptr = seg_mp->b_rptr + seg_len; + return (seg_head); + } + + /* Case (B) */ + if ((seg_mp->b_rptr + seg_len) == seg_mp->b_wptr) { + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + return (seg_head); + } + + /* Case (C) */ + ASSERT3U(seg_mp->b_rptr + seg_len, >, seg_mp->b_wptr); + + /* + * The current LSO mblk doesn't have enough data to satisfy + * seg_len -- continue peeling off LSO mblks to build the new + * segment message. If allocation fails we free the previously + * allocated segment mblks and return NULL. + */ + while ((seg_mp->b_rptr + seg_len) > seg_mp->b_wptr) { + ASSERT3U(MBLKL(seg_mp), <=, seg_len); + seg_len -= MBLKL(seg_mp); + *offset = 0; + *lso_mp = (*lso_mp)->b_cont; + seg_mp = dupb(*lso_mp); + + if (seg_mp == NULL) { + freemsgchain(seg_head); + return (NULL); + } + + seg_tail->b_cont = seg_mp; + seg_tail = seg_mp; + } + + /* + * We've walked enough LSO mblks that we can now satisfy the + * remaining seg_len. At this point we need to jump back to + * determine if we have arrived at case (A) or (B). + */ + + /* Just to be paranoid that we didn't underflow. */ + ASSERT3U(seg_len, <, IP_MAXPACKET); + ASSERT3U(seg_len, >, 0); + goto last_mblk; +} + +/* + * Perform software segmentation of a single LSO message. Take an LSO + * message as input and return head/tail pointers as output. This + * function should not be invoked directly but instead through + * mac_hw_emul(). + * + * The resulting chain is comprised of multiple (nsegs) MSS sized + * segments. Each segment will consist of two or more mblks joined by + * b_cont: a header and one or more data mblks. The header mblk is + * allocated anew for each message. The first segment's header is used + * as a template for the rest with adjustments made for things such as + * ID, sequence, length, TCP flags, etc. The data mblks reference into + * the existing LSO mblk (passed in as omp) by way of dupb(). Their + * b_rptr/b_wptr values are adjusted to reference only the fraction of + * the LSO message they are responsible for. At the successful + * completion of this function the original mblk (omp) is freed, + * leaving the newely created segment chain as the only remaining + * reference to the data. 
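+ *
+ * As a purely illustrative example (the numbers are assumed,
+ * not taken from this code): a 9000 byte TCP payload with an
+ * MSS of 1460 yields nsegs = ((9000 - 1) / 1460) + 1 = 7
+ * segments: six full 1460 byte segments followed by one final
+ * 240 byte segment.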
+ */ +static void +mac_sw_lso(mblk_t *omp, mac_emul_t emul, mblk_t **head, mblk_t **tail, + uint_t *count) +{ + uint32_t ocsum_flags, ocsum_start, ocsum_stuff; + uint32_t mss; + uint32_t oehlen, oiphlen, otcphlen, ohdrslen, opktlen, odatalen; + uint32_t oleft; + uint_t nsegs, seg; + int len; + + struct ether_vlan_header *oevh; + const ipha_t *oiph; + const tcph_t *otcph; + ipha_t *niph; + tcph_t *ntcph; + uint16_t ip_id; + uint32_t tcp_seq, tcp_sum, otcp_sum; + + uint32_t offset; + mblk_t *odatamp; + mblk_t *seg_chain, *prev_nhdrmp, *next_nhdrmp, *nhdrmp, *ndatamp; + mblk_t *tmptail; + + ASSERT3P(head, !=, NULL); + ASSERT3P(tail, !=, NULL); + ASSERT3P(count, !=, NULL); + ASSERT3U((DB_CKSUMFLAGS(omp) & HW_LSO), !=, 0); + + /* Assume we are dealing with a single LSO message. */ + ASSERT3P(omp->b_next, ==, NULL); + + /* + * XXX: This is a hack to deal with mac_add_vlan_tag(). + * + * When VLANs are in play, mac_add_vlan_tag() creates a new + * mblk with just the ether_vlan_header and tacks it onto the + * front of 'omp'. This breaks the assumptions made below; + * namely that the TCP/IP headers are in the first mblk. In + * this case, since we already have to pay the cost of LSO + * emulation, we simply pull up everything. While this might + * seem irksome, keep in mind this will only apply in a couple + * of scenarios: a) an LSO-capable VLAN client sending to a + * non-LSO-capable client over the "MAC/bridge loopback" + * datapath or b) an LSO-capable VLAN client is sending to a + * client that, for whatever reason, doesn't have DLS-bypass + * enabled. Finally, we have to check for both a tagged and + * untagged sized mblk depending on if the mblk came via + * mac_promisc_dispatch() or mac_rx_deliver(). + * + * In the future, two things should be done: + * + * 1. This function should make use of some yet to be + * implemented "mblk helpers". These helper functions would + * perform all the b_cont walking for us and guarantee safe + * access to the mblk data. + * + * 2. We should add some slop to the mblks so that + * mac_add_vlan_tag() can just edit the first mblk instead + * of allocating on the hot path. + */ + if (MBLKL(omp) == sizeof (struct ether_vlan_header) || + MBLKL(omp) == sizeof (struct ether_header)) { + mblk_t *tmp = msgpullup(omp, -1); + + if (tmp == NULL) { + mac_drop_pkt(omp, "failed to pull up"); + goto fail; + } + + mac_hcksum_clone(omp, tmp); + freemsg(omp); + omp = tmp; + } + + mss = DB_LSOMSS(omp); + ASSERT3U(msgsize(omp), <=, IP_MAXPACKET + + sizeof (struct ether_vlan_header)); + opktlen = msgsize(omp); + + /* + * First, get references to the IP and TCP headers and + * determine the total TCP length (header + data). + * + * Thanks to mac_hw_emul() we know that the first mblk must + * contain (at minimum) the full L2 header. However, this + * function assumes more than that. It assumes the L2/L3/L4 + * headers are all contained in the first mblk of a message + * (i.e., no b_cont walking for headers). While this is a + * current reality (our native TCP stack and viona both + * enforce this) things may become more nuanced in the future + * (e.g. when introducing encap support or adding new + * clients). For now we guard against this case by dropping + * the packet. 
+ */ + oevh = (struct ether_vlan_header *)omp->b_rptr; + if (oevh->ether_tpid == htons(ETHERTYPE_VLAN)) + oehlen = sizeof (struct ether_vlan_header); + else + oehlen = sizeof (struct ether_header); + + ASSERT3U(MBLKL(omp), >=, (oehlen + sizeof (ipha_t) + sizeof (tcph_t))); + if (MBLKL(omp) < (oehlen + sizeof (ipha_t) + sizeof (tcph_t))) { + mac_drop_pkt(omp, "mblk doesn't contain TCP/IP headers"); + goto fail; + } + + oiph = (ipha_t *)(omp->b_rptr + oehlen); + oiphlen = IPH_HDR_LENGTH(oiph); + otcph = (tcph_t *)(omp->b_rptr + oehlen + oiphlen); + otcphlen = TCP_HDR_LENGTH(otcph); + + /* + * Currently we only support LSO for TCP/IPv4. + */ + if (IPH_HDR_VERSION(oiph) != IPV4_VERSION) { + mac_drop_pkt(omp, "LSO unsupported IP version: %uhh", + IPH_HDR_VERSION(oiph)); + goto fail; + } + + if (oiph->ipha_protocol != IPPROTO_TCP) { + mac_drop_pkt(omp, "LSO unsupported protocol: %uhh", + oiph->ipha_protocol); + goto fail; + } + + if (otcph->th_flags[0] & (TH_SYN | TH_RST | TH_URG)) { + mac_drop_pkt(omp, "LSO packet has SYN|RST|URG set"); + goto fail; + } + + ohdrslen = oehlen + oiphlen + otcphlen; + if ((len = MBLKL(omp)) < ohdrslen) { + mac_drop_pkt(omp, "LSO packet too short: %d < %u", len, + ohdrslen); + goto fail; + } + + /* + * Either we have data in the first mblk or it's just the + * header. In either case, we need to set rptr to the start of + * the TCP data. + */ + if (len > ohdrslen) { + odatamp = omp; + offset = ohdrslen; + } else { + ASSERT3U(len, ==, ohdrslen); + odatamp = omp->b_cont; + offset = 0; + } + + /* Make sure we still have enough data. */ + ASSERT3U(msgsize(odatamp), >=, opktlen - ohdrslen); + + /* + * If a MAC negotiated LSO then it must negotioate both + * HCKSUM_IPHDRCKSUM and either HCKSUM_INET_FULL_V4 or + * HCKSUM_INET_PARTIAL; because both the IP and TCP headers + * change during LSO segmentation (only the 3 fields of the + * pseudo header checksum don't change: src, dst, proto). Thus + * we would expect these flags (HCK_IPV4_HDRCKSUM | + * HCK_PARTIALCKSUM | HCK_FULLCKSUM) to be set and for this + * function to emulate those checksums in software. However, + * that assumes a world where we only expose LSO if the + * underlying hardware exposes LSO. Moving forward the plan is + * to assume LSO in the upper layers and have MAC perform + * software LSO when the underlying provider doesn't support + * it. In such a world, if the provider doesn't support LSO + * but does support hardware checksum offload, then we could + * simply perform the segmentation and allow the hardware to + * calculate the checksums. To the hardware it's just another + * chain of non-LSO packets. + */ + ASSERT3S(DB_TYPE(omp), ==, M_DATA); + ocsum_flags = DB_CKSUMFLAGS(omp); + ASSERT3U(ocsum_flags & HCK_IPV4_HDRCKSUM, !=, 0); + ASSERT3U(ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM), !=, 0); + + /* + * If hardware only provides partial checksum then software + * must supply the pseudo-header checksum. In the case of LSO + * we leave the TCP length at zero to be filled in by + * hardware. This function must handle two scenarios. + * + * 1. Being called by a MAC client on the Rx path to segment + * an LSO packet and calculate the checksum. + * + * 2. Being called by a MAC provider to segment an LSO packet. + * In this case the LSO segmentation is performed in + * software (by this routine) but the MAC provider should + * still calculate the TCP/IP checksums in hardware. 
+ * + * To elaborate on the second case: we cannot have the + * scenario where IP sends LSO packets but the underlying HW + * doesn't support checksum offload -- because in that case + * TCP/IP would calculate the checksum in software (for the + * LSO packet) but then MAC would segment the packet and have + * to redo all the checksum work. So IP should never do LSO + * if HW doesn't support both IP and TCP checksum. + */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + ocsum_start = (uint32_t)DB_CKSUMSTART(omp); + ocsum_stuff = (uint32_t)DB_CKSUMSTUFF(omp); + } + + odatalen = opktlen - ohdrslen; + + /* + * Subtract one to account for the case where the data length + * is evenly divisble by the MSS. Add one to account for the + * fact that the division will always result in one less + * segment than needed. + */ + nsegs = ((odatalen - 1) / mss) + 1; + if (nsegs < 2) { + mac_drop_pkt(omp, "LSO not enough segs: %u", nsegs); + goto fail; + } + + DTRACE_PROBE6(sw__lso__start, mblk_t *, omp, void_ip_t *, oiph, + __dtrace_tcp_tcph_t *, otcph, uint_t, odatalen, uint_t, mss, uint_t, + nsegs); + + seg_chain = NULL; + tmptail = seg_chain; + oleft = odatalen; + + for (uint_t i = 0; i < nsegs; i++) { + boolean_t last_seg = ((i + 1) == nsegs); + uint32_t seg_len; + + /* + * If we fail to allocate, then drop the partially + * allocated chain as well as the LSO packet. Let the + * sender deal with the fallout. + */ + if ((nhdrmp = allocb(ohdrslen, 0)) == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "failed to alloc segment header"); + goto fail; + } + ASSERT3P(nhdrmp->b_cont, ==, NULL); + + if (seg_chain == NULL) { + seg_chain = nhdrmp; } else { - sap = ntohs(ehp->ether_type); - offset = sizeof (struct ether_header); + ASSERT3P(tmptail, !=, NULL); + tmptail->b_next = nhdrmp; } - if (MBLKL(mp) <= offset) { - offset -= MBLKL(mp); - if (mp->b_cont == NULL) { - /* corrupted packet, skip it */ - if (prev != NULL) - prev->b_next = mp->b_next; - else - new_chain = mp->b_next; - mp1 = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - mp = mp1; - continue; - } - mp = mp->b_cont; + tmptail = nhdrmp; + + /* + * Calculate this segment's lengh. It's either the MSS + * or whatever remains for the last segment. + */ + seg_len = last_seg ? oleft : mss; + ASSERT3U(seg_len, <=, mss); + ndatamp = build_data_seg(&odatamp, &offset, seg_len); + + if (ndatamp == NULL) { + freemsgchain(seg_chain); + mac_drop_pkt(omp, "LSO failed to segment data"); + goto fail; } - if (flags & (HCK_FULLCKSUM | HCK_IPV4_HDRCKSUM)) { - ipha_t *ipha = NULL; + /* Attach data mblk to header mblk. */ + nhdrmp->b_cont = ndatamp; + DB_CKSUMFLAGS(ndatamp) &= ~HW_LSO; + ASSERT3U(seg_len, <=, oleft); + oleft -= seg_len; + } - /* - * In order to compute the full and header - * checksums, we need to find and parse - * the IP and/or ULP headers. - */ + /* We should have consumed entire LSO msg. */ + ASSERT3S(oleft, ==, 0); + ASSERT3P(odatamp, ==, NULL); + + /* + * All seg data mblks are referenced by the header mblks, null + * out this pointer to catch any bad derefs. + */ + ndatamp = NULL; + + /* + * Set headers and checksum for first segment. 
+ */ + nhdrmp = seg_chain; + bcopy(omp->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + ASSERT3U(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ip_id = ntohs(niph->ipha_ident); + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + tcp_seq = BE32_TO_U32(ntcph->th_seq); + tcp_seq += mss; - sap = (sap < ETHERTYPE_802_MIN) ? 0 : sap; + /* + * The first segment shouldn't: + * + * o indicate end of data transmission (FIN), + * o indicate immediate handling of the data (PUSH). + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + /* + * If the underlying HW provides partial checksum, then make + * sure to correct the pseudo header checksum before calling + * mac_sw_cksum(). The native TCP stack doesn't include the + * length field in the pseudo header when LSO is in play -- so + * we need to calculate it here. + */ + if (ocsum_flags & HCK_PARTIALCKSUM) { + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + tcp_sum = BE16_TO_U16(ntcph->th_sum); + otcp_sum = tcp_sum; + tcp_sum += mss + otcphlen; + tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); + U16_TO_BE16(tcp_sum, ntcph->th_sum); + } + + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; + + /* + * We may have freed the nhdrmp argument during + * checksum emulation, make sure that seg_chain + * references a valid mblk. + */ + seg_chain = nhdrmp; + } + + ASSERT3P(nhdrmp, !=, NULL); + + seg = 1; + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, mss, + uint_t, seg); + seg++; + + /* There better be at least 2 segs. */ + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + + /* + * Now adjust the headers of the middle segments. For each + * header we need to adjust the following. + * + * o IP ID + * o IP length + * o TCP sequence + * o TCP flags + * o cksum flags + * o cksum values (if MAC_HWCKSUM_EMUL is set) + */ + for (; seg < nsegs; seg++) { + /* + * We use seg_chain as a reference to the first seg + * header mblk -- this first header is a template for + * the rest of the segments. This copy will include + * the now updated checksum values from the first + * header. We must reset these checksum values to + * their original to make sure we produce the correct + * value. + */ + bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + niph->ipha_ident = htons(++ip_id); + ASSERT3P(msgsize(nhdrmp->b_cont), ==, mss); + niph->ipha_length = htons(oiphlen + otcphlen + mss); + niph->ipha_hdr_checksum = 0; + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + U32_TO_BE32(tcp_seq, ntcph->th_seq); + tcp_seq += mss; + /* + * Just like the first segment, the middle segments + * shouldn't have these flags set. + */ + ntcph->th_flags[0] &= ~(TH_FIN | TH_PUSH); + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + + if (ocsum_flags & HCK_PARTIALCKSUM) { /* - * IP header. 
+ * First and middle segs have same + * pseudo-header checksum. */ - if (sap != ETHERTYPE_IP) - continue; - - ASSERT(MBLKL(mp) >= offset + sizeof (ipha_t)); - /* LINTED: improper alignment cast */ - ipha = (ipha_t *)(mp->b_rptr + offset); - - if (flags & HCK_FULLCKSUM) { - ipaddr_t src, dst; - uint32_t cksum; - uint16_t *up; - uint8_t proto; - - /* - * Pointer to checksum field in ULP header. - */ - proto = ipha->ipha_protocol; - ASSERT(ipha->ipha_version_and_hdr_length == - IP_SIMPLE_HDR_VERSION); - - switch (proto) { - case IPPROTO_TCP: - /* LINTED: improper alignment cast */ - up = IPH_TCPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - case IPPROTO_UDP: - /* LINTED: improper alignment cast */ - up = IPH_UDPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - break; - - default: - cmn_err(CE_WARN, "mac_fix_cksum: " - "unexpected protocol: %d", proto); - continue; - } - - /* - * Pseudo-header checksum. - */ - src = ipha->ipha_src; - dst = ipha->ipha_dst; - len = ntohs(ipha->ipha_length) - - IP_SIMPLE_HDR_LENGTH; - - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - cksum += htons(len); - - /* - * The checksum value stored in the packet needs - * to be correct. Compute it here. - */ - *up = 0; - cksum += (((proto) == IPPROTO_UDP) ? - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP); - cksum = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH + - offset, cksum); - *(up) = (uint16_t)(cksum ? cksum : ~cksum); - - /* - * Flag the packet so that it appears - * that the checksum has already been - * verified by the hardware. - */ - flags &= ~HCK_FULLCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; - } + U16_TO_BE16(tcp_sum, ntcph->th_sum); + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + } - if (flags & HCK_IPV4_HDRCKSUM) { - ASSERT(ipha != NULL); - ipha->ipha_hdr_checksum = - (uint16_t)ip_csum_hdr(ipha); - flags &= ~HCK_IPV4_HDRCKSUM; - flags |= HCK_IPV4_HDRCKSUM_OK; + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + next_nhdrmp = nhdrmp->b_next; + nhdrmp->b_next = NULL; + nhdrmp = mac_sw_cksum(nhdrmp, emul); + nhdrmp->b_next = next_nhdrmp; + next_nhdrmp = NULL; + /* We may have freed the original nhdrmp. */ + prev_nhdrmp->b_next = nhdrmp; + } - } + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), + uint_t, mss, uint_t, seg); + + ASSERT3P(nhdrmp->b_next, !=, NULL); + prev_nhdrmp = nhdrmp; + nhdrmp = nhdrmp->b_next; + } + + /* Make sure we are on the last segment. */ + ASSERT3U(seg, ==, nsegs); + ASSERT3P(nhdrmp->b_next, ==, NULL); + + /* + * Now we set the last segment header. The difference being + * that FIN/PSH/RST flags are allowed. 
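+ * The data length here is whatever remains of the payload,
+ * odatalen - ((nsegs - 1) * mss) bytes, rather than a full
+ * MSS.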
+ */ + bcopy(seg_chain->b_rptr, nhdrmp->b_rptr, ohdrslen); + nhdrmp->b_wptr = nhdrmp->b_rptr + ohdrslen; + niph = (ipha_t *)(nhdrmp->b_rptr + oehlen); + niph->ipha_ident = htons(++ip_id); + len = msgsize(nhdrmp->b_cont); + ASSERT3S(len, >, 0); + niph->ipha_length = htons(oiphlen + otcphlen + len); + niph->ipha_hdr_checksum = 0; + ntcph = (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen); + U32_TO_BE32(tcp_seq, ntcph->th_seq); + + DB_CKSUMFLAGS(nhdrmp) = (uint16_t)(ocsum_flags & ~HW_LSO); + if (ocsum_flags & HCK_PARTIALCKSUM) { + DB_CKSUMSTART(nhdrmp) = ocsum_start; + DB_CKSUMEND(nhdrmp) = ntohs(niph->ipha_length); + DB_CKSUMSTUFF(nhdrmp) = ocsum_stuff; + tcp_sum = otcp_sum; + tcp_sum += len + otcphlen; + tcp_sum = (tcp_sum >> 16) + (tcp_sum & 0xFFFF); + U16_TO_BE16(tcp_sum, ntcph->th_sum); + } + + if ((ocsum_flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) && + (emul & MAC_HWCKSUM_EMULS)) { + /* This should be the last mblk. */ + ASSERT3P(nhdrmp->b_next, ==, NULL); + nhdrmp = mac_sw_cksum(nhdrmp, emul); + prev_nhdrmp->b_next = nhdrmp; + } + + DTRACE_PROBE5(sw__lso__seg, mblk_t *, nhdrmp, void_ip_t *, + (ipha_t *)(nhdrmp->b_rptr + oehlen), __dtrace_tcp_tcph_t *, + (tcph_t *)(nhdrmp->b_rptr + oehlen + oiphlen), uint_t, len, + uint_t, seg); + + /* + * Free the reference to the original LSO message as it is + * being replaced by seg_cahin. + */ + freemsg(omp); + *head = seg_chain; + *tail = nhdrmp; + *count = nsegs; + return; + +fail: + *head = NULL; + *tail = NULL; + *count = 0; +} + +#define HCK_NEEDED (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | HCK_FULLCKSUM) + +/* + * Emulate various hardware offload features in software. Take a chain + * of packets as input and emulate the hardware features specified in + * 'emul'. The resulting chain's head pointer replaces the 'mp_chain' + * pointer given as input, and its tail pointer is written to + * '*otail'. The number of packets in the new chain is written to + * '*ocount'. The 'otail' and 'ocount' arguments are optional and thus + * may be NULL. The 'mp_chain' argument may point to a NULL chain; in + * which case 'mp_chain' will simply stay a NULL chain. + * + * While unlikely, it is technically possible that this function could + * receive a non-NULL chain as input and return a NULL chain as output + * ('*mp_chain' and '*otail' would be NULL and '*ocount' would be + * zero). This could happen if all the packets in the chain are + * dropped or if we fail to allocate new mblks. In this case, there is + * nothing for the caller to free. In any event, the caller shouldn't + * assume that '*mp_chain' is non-NULL on return. + * + * This function was written with two main use cases in mind. + * + * 1. A way for MAC clients to emulate hardware offloads when they + * can't directly handle LSO packets or packets without fully + * calculated checksums. + * + * 2. A way for MAC providers (drivers) to offer LSO even when the + * underlying HW can't or won't supply LSO offload. + * + * At the time of this writing no provider is making use of this + * function. However, the plan for the future is to always assume LSO + * is available and then add SW LSO emulation to all providers that + * don't support it in HW. 
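+ *
+ * A minimal usage sketch (a hypothetical caller, not code from
+ * this change): a client that cannot accept LSO packets, or
+ * packets without fully computed checksums, might do
+ *
+ *	uint_t cnt = 0;
+ *
+ *	mac_hw_emul(&mp_chain, NULL, &cnt,
+ *	    MAC_HWCKSUM_EMUL | MAC_LSO_EMUL);
+ *
+ *	if (mp_chain == NULL)
+ *		return;
+ *
+ * after which mp_chain contains only non-LSO messages with
+ * fully computed checksums (or is NULL if everything was
+ * dropped).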
+ */ +void +mac_hw_emul(mblk_t **mp_chain, mblk_t **otail, uint_t *ocount, mac_emul_t emul) +{ + mblk_t *head = NULL, *tail = NULL; + uint_t count = 0; + + ASSERT3S(~(MAC_HWCKSUM_EMULS | MAC_LSO_EMUL) & emul, ==, 0); + ASSERT3P(mp_chain, !=, NULL); + + for (mblk_t *mp = *mp_chain; mp != NULL; ) { + mblk_t *tmp, *next, *tmphead, *tmptail; + struct ether_header *ehp; + uint32_t flags; + uint_t len = MBLKL(mp), l2len; + + /* Perform LSO/cksum one message at a time. */ + next = mp->b_next; + mp->b_next = NULL; + + /* + * For our sanity the first mblk should contain at + * least the full L2 header. + */ + if (len < sizeof (struct ether_header)) { + mac_drop_pkt(mp, "packet too short (A): %u", len); + mp = next; + continue; } - if (flags & HCK_PARTIALCKSUM) { - uint16_t *up, partial, cksum; - uchar_t *ipp; /* ptr to beginning of IP header */ - - if (mp->b_cont != NULL) { - mblk_t *mp1; - - mp1 = msgpullup(mp, offset + end); - if (mp1 == NULL) - continue; - mp1->b_next = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - if (prev != NULL) - prev->b_next = mp1; - else - new_chain = mp1; - mp = mp1; - } + ehp = (struct ether_header *)mp->b_rptr; + if (ntohs(ehp->ether_type) == VLAN_TPID) + l2len = sizeof (struct ether_vlan_header); + else + l2len = sizeof (struct ether_header); - ipp = mp->b_rptr + offset; - /* LINTED: cast may result in improper alignment */ - up = (uint16_t *)((uchar_t *)ipp + stuff); - partial = *up; - *up = 0; + /* + * If the first mblk is solely the L2 header, then + * there better be more data. + */ + if (len < l2len || (len == l2len && mp->b_cont == NULL)) { + mac_drop_pkt(mp, "packet too short (C): %u", len); + mp = next; + continue; + } + + DTRACE_PROBE2(mac__emul, mblk_t *, mp, mac_emul_t, emul); + + /* + * We use DB_CKSUMFLAGS (instead of mac_hcksum_get()) + * because we don't want to mask-out the LSO flag. + */ + flags = DB_CKSUMFLAGS(mp); - cksum = IP_BCSUM_PARTIAL(mp->b_rptr + offset + start, - end - start, partial); - cksum = ~cksum; - *up = cksum ? cksum : ~cksum; + if ((flags & HW_LSO) && (emul & MAC_LSO_EMUL)) { + uint_t tmpcount = 0; /* - * Since we already computed the whole checksum, - * indicate to the stack that it has already - * been verified by the hardware. + * LSO fix-up handles checksum emulation + * inline (if requested). It also frees mp. */ - flags &= ~HCK_PARTIALCKSUM; - flags |= HCK_FULLCKSUM_OK; - value = 0; + mac_sw_lso(mp, emul, &tmphead, &tmptail, + &tmpcount); + count += tmpcount; + } else if ((flags & HCK_NEEDED) && (emul & MAC_HWCKSUM_EMULS)) { + tmp = mac_sw_cksum(mp, emul); + tmphead = tmp; + tmptail = tmp; + count++; + } else { + /* There is nothing to emulate. */ + tmp = mp; + tmphead = tmp; + tmptail = tmp; + count++; } - (void) hcksum_assoc(mp, NULL, NULL, start, stuff, end, - value, flags, KM_NOSLEEP); + /* + * The tmp mblk chain is either the start of the new + * chain or added to the tail of the new chain. + */ + if (head == NULL) { + head = tmphead; + tail = tmptail; + } else { + /* Attach the new mblk to the end of the new chain. 
*/ + tail->b_next = tmphead; + tail = tmptail; + } + + mp = next; } - return (new_chain); + *mp_chain = head; + + if (otail != NULL) + *otail = tail; + + if (ocount != NULL) + *ocount = count; } /* @@ -320,7 +1287,6 @@ mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) mblk_t *hmp; struct ether_vlan_header *evhp; struct ether_header *ehp; - uint32_t start, stuff, end, value, flags; ASSERT(pri != 0 || vid != 0); @@ -350,9 +1316,7 @@ mac_add_vlan_tag(mblk_t *mp, uint_t pri, uint16_t vid) * Free the original message if it's now empty. Link the * rest of messages to the header message. */ - hcksum_retrieve(mp, NULL, NULL, &start, &stuff, &end, &value, &flags); - (void) hcksum_assoc(hmp, NULL, NULL, start, stuff, end, value, flags, - KM_NOSLEEP); + mac_hcksum_clone(mp, hmp); if (MBLKL(mp) == 0) { hmp->b_cont = mp->b_cont; freeb(mp); @@ -456,16 +1420,9 @@ mac_strip_vlan_tag_chain(mblk_t *mp_chain) */ /* ARGSUSED */ void -mac_pkt_drop(void *arg, mac_resource_handle_t resource, mblk_t *mp, +mac_rx_def(void *arg, mac_resource_handle_t resource, mblk_t *mp, boolean_t loopback) { - mblk_t *mp1 = mp; - - while (mp1 != NULL) { - mp1->b_prev = NULL; - mp1->b_queue = NULL; - mp1 = mp1->b_next; - } freemsgchain(mp); } diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c index 950fab1272..fcea4a8f03 100644 --- a/usr/src/uts/common/io/mem.c +++ b/usr/src/uts/common/io/mem.c @@ -225,10 +225,19 @@ mmopen(dev_t *devp, int flag, int typ, struct cred *cred) case M_NULL: case M_ZERO: case M_FULL: + /* standard devices */ + break; + case M_MEM: case M_KMEM: case M_ALLKMEM: - /* standard devices */ + /* + * These devices should never be visible in a zone, but if they + * somehow do get created we refuse to allow the zone to use + * them. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); break; default: diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf index cfda434e23..6c585c6a42 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.conf +++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf @@ -13,3 +13,11 @@ # Fast-Path specific flag. Default is "yes". # mrsas-enable-fp="yes"; +flow_control="dmult" queue="qsort" tape="sctp"; + +# MSI specific flag. To enable MSI modify the flag value to "yes" +mrsas-enable-msi="yes"; + +# Fast-Path specific flag. To enable Fast-Path modify the flag value to "yes" +mrsas-enable-fp="yes"; + diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE new file mode 100644 index 0000000000..187088ff34 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2014, Thales UK Limited + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..cde8b65b37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +NFAST CRYPTO ACCELERATOR DRIVER diff --git a/usr/src/uts/common/io/nfp/autoversion.h b/usr/src/uts/common/io/nfp/autoversion.h new file mode 100644 index 0000000000..b9021942b2 --- /dev/null +++ b/usr/src/uts/common/io/nfp/autoversion.h @@ -0,0 +1,21 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* AUTOGENERATED - DO NOT EDIT */ +#ifndef AUTOVERSION_H +#define AUTOVERSION_H + +#define VERSION_RELEASEMAJOR 2 +#define VERSION_RELEASEMINOR 26 +#define VERSION_RELEASEPATCH 40 +#define VERSION_NO "2.26.40cam999" +#define VERSION_COMPNAME "nfdrv" + +#endif diff --git a/usr/src/uts/common/io/nfp/drvlist.c b/usr/src/uts/common/io/nfp/drvlist.c new file mode 100644 index 0000000000..a04b1fd5b0 --- /dev/null +++ b/usr/src/uts/common/io/nfp/drvlist.c @@ -0,0 +1,19 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include "nfp_common.h" +#include "nfp_cmd.h" + +const nfpcmd_dev *nfp_drvlist[] = { + &i21285_cmddev, + &i21555_cmddev, + NULL +}; + diff --git a/usr/src/uts/common/io/nfp/hostif.c b/usr/src/uts/common/io/nfp/hostif.c new file mode 100644 index 0000000000..684be703ea --- /dev/null +++ b/usr/src/uts/common/io/nfp/hostif.c @@ -0,0 +1,1192 @@ +/* + +hostif.c: nFast PCI driver for Solaris 2.5, 2.6, 2.7 and 2.8 + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +??/??/2001 jsh added support for solaris 2.8 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv +12/02/2002 jsh added high level interrupt support + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" + +#include "nfp.h" + +/* mapped memory attributes, no-swap endianess (done in higher level) */ +static struct ddi_device_acc_attr nosw_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC +}; + +/* dma attributes */ +static ddi_dma_attr_t dma_attrs = { + DMA_ATTR_V0, /* version number */ + (uint64_t)0x0, /* low address */ + (uint64_t)0xffffffff, /* high address */ + (uint64_t)0xffffff, /* DMA counter max */ + (uint64_t)0x1, /* alignment */ + 0x0c, /* burst sizes */ + 0x1, 
/* minimum transfer size */ + (uint64_t)0x3ffffff, /* maximum transfer size */ + (uint64_t)0x7fff, /* maximum segment size */ + 1, /* no scatter/gather lists */ + 1, /* granularity */ + 0 /* DMA flags */ +}; + +/* + * Debug message control + * Debug Levels: + * 0 = no messages + * 1 = Errors + * 2 = Subroutine calls & control flow + * 3 = I/O Data (verbose!) + * Can be set with adb or in the /etc/system file with + * "set nfp:nfp_debug=<value>" + */ + +int nfp_debug= 1; + +static void *state_head; /* opaque handle top of state structs */ + +static int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp); +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp); +static int nfp_release_dev( dev_info_t *dip ); + +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_strategy(struct buf *bp); + +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp); +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static void nfp_wrtimeout (void *pdev); +static void nfp_rdtimeout (void *pdev); + +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result); +static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); + +static void nfp_read_complete_final(nfp_dev *pdev, int ok); +static void nfp_write_complete_final(nfp_dev *pdev, int ok); + +/* nfp file ops --------------------------------------------------- */ + +static struct cb_ops nfp_cb_ops = { + nfp_open, + nfp_close, + nodev, /* no nfp_strategy */ + nodev, /* no print routine */ + nodev, /* no dump routine */ + nfp_read, + nfp_write, + nfp_ioctl, + nodev, /* no devmap routine */ + nodev, /* no mmap routine */ + nodev, /* no segmap routine */ + nfp_chpoll, + ddi_prop_op, + 0, /* not a STREAMS driver, no cb_str routine */ + D_NEW | D_MP | EXTRA_CB_FLAGS, /* must be safe for multi-thread/multi-processor */ + CB_REV, + nodev, /* aread */ + nodev /* awrite */ +}; + +static struct dev_ops nfp_ops = { + DEVO_REV, /* DEVO_REV indicated by manual */ + 0, /* device reference count */ + nfp_getinfo, + nulldev, /* identify */ + nulldev, /* probe */ + nfp_attach, + nfp_detach, + nodev, /* device reset routine */ + &nfp_cb_ops, + (struct bus_ops *)0, /* bus operations */ +}; + +extern struct mod_ops mod_driverops; +static struct modldrv modldrv = { + &mod_driverops, + NFP_DRVNAME, + &nfp_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* MODREV_1 indicated by manual */ + (void *)&modldrv, + NULL, /* termination of list of linkage structures */ +}; + +/* interface resource allocation */ + +int nfp_alloc_pci_push( nfp_dev *pdev ) { + /* allocate resources needed for PCI Push, + * if not already allocated. 
+ * return True if successful + */ + nfp_err ret; + uint_t cookie_count; + size_t real_length; + + if(!pdev->read_buf) { + /* allocate read buffer */ + pdev->read_buf = kmem_zalloc( NFP_READBUF_SIZE, KM_NOSLEEP ); + } + if(!pdev->read_buf) { + nfp_log( NFP_DBG1, "nfp_attach: kmem_zalloc read buffer failed"); + pdev->read_buf = NULL; + return 0; + } + + if(!pdev->rd_dma_ok) { + /* allocate dma handle for read buffer */ + ret = ddi_dma_alloc_handle( pdev->dip, + &dma_attrs, + DDI_DMA_DONTWAIT, + NULL, + &pdev->read_dma_handle ); + if( ret != DDI_SUCCESS ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_alloc_handle failed (%d)", + ret ); + return 0; + } + + /* Allocate the memory for dma transfers */ + ret = ddi_dma_mem_alloc(pdev->read_dma_handle, NFP_READBUF_SIZE, &nosw_attr, + DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL, + (caddr_t*)&pdev->read_buf, &real_length, &pdev->acchandle); + if (ret != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_alloc_pci_push: ddi_dma_mem_alloc failed (%d)", ret); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + + ret = ddi_dma_addr_bind_handle( pdev->read_dma_handle, + NULL, /* kernel address space */ + (caddr_t)pdev->read_buf, real_length, + DDI_DMA_READ | DDI_DMA_CONSISTENT, /* dma flags */ + DDI_DMA_DONTWAIT, NULL, + &pdev->read_dma_cookie, &cookie_count ); + if( ret != DDI_DMA_MAPPED ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_addr_bind_handle failed (%d)", + ret); + ddi_dma_mem_free(&pdev->acchandle); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + if( cookie_count > 1 ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: error:" + " ddi_dma_addr_bind_handle wants %d transfers", + cookie_count); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + pdev->rd_dma_ok = 1; + } + return pdev->rd_dma_ok; +} + +void nfp_free_pci_push( nfp_dev *pdev ) { + /* free resources allocated to PCI Push */ + if( pdev->rd_dma_ok ) { + (void) ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + pdev->rd_dma_ok = 0; + } + if( pdev->read_buf ) { + kmem_free( pdev->read_buf, NFP_READBUF_SIZE ); + pdev->read_buf = NULL; + } +} + +/* include definition of nfp_set_ifvers() */ +#define nfp_ifvers NFDEV_IF_PCI_PUSH +#include "nfp_ifvers.c" +#undef nfp_ifvers + +/*--------------------*/ +/* nfp_isr */ +/*--------------------*/ + +static u_int nfp_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + nfp_err ne; + int handled; + + nfp_log( NFP_DBG3, "nfp_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + + /* The isr needs to be mutex'ed - an SMP can call us while we're still + * running! 
+ */ + mutex_enter(&pdev->low_mutex); + ne= pdev->cmddev->isr( pdev->common.cmdctx, &handled ); + mutex_exit(&pdev->low_mutex); + + if( !ne && handled ) + return DDI_INTR_CLAIMED; + if (ne) + nfp_log( NFP_DBG1, "nfp_isr: failed"); + else + nfp_log( NFP_DBG3, "nfp_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + +static u_int nfp_soft_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + int rd, wr; + + nfp_log( NFP_DBG3, "nfp_soft_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_soft_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + rd= wr= 0; + + mutex_enter(&pdev->high_mutex); + if(pdev->high_read) { + pdev->high_read= 0; + mutex_exit(&pdev->high_mutex); + rd= 1; + } + if(pdev->high_write) { + pdev->high_write= 0; + wr= 1; + } + mutex_exit(&pdev->high_mutex); + + if(rd) { + nfp_log( NFP_DBG3, "nfp_soft_isr: read done"); + nfp_read_complete_final(pdev, pdev->rd_ok); + } + if(wr) { + nfp_log( NFP_DBG3, "nfp_soft_isr: write done"); + nfp_write_complete_final(pdev, pdev->wr_ok); + } + if( rd || wr ) + return DDI_INTR_CLAIMED; + + nfp_log( NFP_DBG2, "nfp_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + + +/*-------------------------*/ +/* nfp_read */ +/*-------------------------*/ + +void nfp_read_complete(nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_read_complete: entering"); + + if(pdev->high_intr) { + nfp_log(NFP_DBG2, "nfp_read_complete: high_intr"); + mutex_enter(&pdev->high_mutex); + nfp_log(NFP_DBG3, "nfp_read_complete: high_mutex entered"); + if(pdev->high_read) + nfp_log(NFP_DBG1, "nfp_read_complete: high_read allread set!"); + pdev->high_read= 1; + pdev->rd_ok= ok; + nfp_log(NFP_DBG3, "nfp_read_complete: exiting high_mutex"); + mutex_exit(&pdev->high_mutex); + ddi_trigger_softintr(pdev->soft_int_id); + } else + nfp_read_complete_final( pdev, ok ); + nfp_log( NFP_DBG2,"nfp_read_complete: exiting"); +} + +static void nfp_read_complete_final(nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_read_complete_final: entering"); + if(pdev->rdtimeout) + (void) untimeout(pdev->rdtimeout); + if(!pdev->rd_outstanding) { + nfp_log( NFP_DBG1,"nfp_read_complete_final: !pdev->rd_outstanding"); + } + nfp_log( NFP_DBG2,"nfp_read_complete_final: pdev->rd_outstanding=0, ok %d", ok); + mutex_enter(&pdev->isr_mutex); + pdev->rd_outstanding= 0; + pdev->rd_ready= 1; + pdev->rd_ok= ok; + cv_broadcast(&pdev->rd_cv); + mutex_exit(&pdev->isr_mutex); + pollwakeup (&pdev->pollhead, POLLRDNORM); + nfp_log( NFP_DBG2,"nfp_read_complete_final: exiting"); +} + +static void nfp_rdtimeout( void *pdev_in ) +{ + nfp_dev *pdev= (nfp_dev *)pdev_in; + + nfp_log( NFP_DBG1, "nfp_rdtimeout: read timed out"); + + if (!pdev) { + nfp_log( NFP_DBG1, "nfp_rdtimeout: NULL pdev." ); + return; + } + pdev->rdtimeout= 0; + nfp_read_complete_final(pdev, 0); +} + +/* ARGSUSED */ +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp) { + int ret; + nfp_log( NFP_DBG2, "nfp_read: entered" ); + if (ddi_get_soft_state(state_head, getminor(dev)) != NULL) { + nfp_log( NFP_DBG1, "nfp_read: unable to get nfp_dev"); + return (ENODEV); + } + nfp_log( NFP_DBG2, "nfp_read: about to physio." 
);
+ ret = physio(nfp_strategy, (struct buf *)0, dev, B_READ, minphys, uiop );
+ if(ret)
+ nfp_log( NFP_DBG1, "nfp_read: physio returned %x.", ret );
+ return ret;
+}
+
+/*-------------------------*/
+/* nfp_write */
+/*-------------------------*/
+
+void nfp_write_complete( nfp_dev *pdev, int ok) {
+ nfp_log( NFP_DBG2,"nfp_write_complete: entering");
+
+ if(pdev->high_intr) {
+ mutex_enter(&pdev->high_mutex);
+ if(pdev->high_write)
+ nfp_log(NFP_DBG1, "nfp_write_complete: high_write already set!");
+ pdev->high_write= 1;
+ pdev->wr_ok= ok;
+ mutex_exit(&pdev->high_mutex);
+ ddi_trigger_softintr(pdev->soft_int_id);
+ } else
+ nfp_write_complete_final( pdev, ok );
+ nfp_log( NFP_DBG2,"nfp_write_complete: exiting");
+}
+
+static void nfp_write_complete_final( nfp_dev *pdev, int ok) {
+ struct buf *local_wr_bp;
+ nfp_log( NFP_DBG2,"nfp_write_complete_final: entering");
+ if(pdev->wrtimeout)
+ (void) untimeout(pdev->wrtimeout);
+
+ if (!pdev->wr_bp) {
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: write: wr_bp == NULL." );
+ return;
+ }
+
+ bp_mapout(pdev->wr_bp);
+ pdev->wr_bp->b_resid = ok ? 0 : pdev->wr_bp->b_bcount;
+ /* Make sure we set wr_ready before calling biodone to avoid a race */
+ pdev->wr_ready = 1;
+ bioerror(pdev->wr_bp, ok ? 0 : ENXIO);
+ local_wr_bp = pdev->wr_bp;
+ pdev->wr_bp = 0;
+ biodone(local_wr_bp);
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: write completed");
+ pollwakeup (&pdev->pollhead, POLLWRNORM);
+
+ nfp_log( NFP_DBG2, "nfp_write_complete_final: leaving");
+}
+
+static void nfp_wrtimeout( void *pdev_in )
+{
+ nfp_dev *pdev= (nfp_dev *)pdev_in;
+
+ nfp_log( NFP_DBG1, "nfp_wrtimeout: write timed out");
+
+ if (!pdev) {
+ nfp_log( NFP_DBG1, "nfp_wrtimeout: NULL pdev." );
+ return;
+ }
+ pdev->wrtimeout= 0;
+ nfp_write_complete_final(pdev, 0);
+}
+
+/* ARGSUSED */
+static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp) {
+ int ret;
+ nfp_log( NFP_DBG2, "nfp_write: entered." );
+ if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) {
+ nfp_log( NFP_DBG1, "nfp_write: unable to get nfp_dev.");
+ return (ENODEV);
+ }
+ nfp_log( NFP_DBG2, "nfp_write: about to physio." );
+ ret = physio(nfp_strategy, (struct buf *)0, dev, B_WRITE, minphys, uiop );
+ if(ret)
+ nfp_log( NFP_DBG1, "nfp_write: physio returned %x.", ret );
+ return ret;
+}
+
+/*-------------------------*/
+/* nfp_strategy */
+/*-------------------------*/
+
+#define NFP_STRAT_ERR(thebp,err,txt) \
+ nfp_log( NFP_DBG1, "nfp_strategy: " txt ".\n"); \
+ (thebp)->b_resid = (thebp)->b_bcount; \
+ bioerror ((thebp), err); \
+ biodone ((thebp));
+
+static int nfp_strategy(struct buf *bp) {
+ register struct nfp_dev *pdev;
+ nfp_err ne;
+
+ nfp_log( NFP_DBG2, "nfp_strategy: entered."
); + if (!(pdev = ddi_get_soft_state(state_head, getminor(bp->b_edev)))) { + NFP_STRAT_ERR (bp, ENXIO, "unable to get nfp_dev"); + return (0); + } + + if (bp->b_flags & B_READ) { + int count; + /* read */ + if (!pdev->rd_ready) { + NFP_STRAT_ERR (bp,ENXIO,"read called when not ready"); + return (0); + } + pdev->rd_ready=0; + pdev->rd_pending = 0; + if( !pdev->rd_ok) { + NFP_STRAT_ERR (bp,ENXIO,"read failed"); + return (0); + } + /* copy data from module */ + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + nfp_log( NFP_DBG3, "nfp_strategy: copying kernel read buffer"); + if( ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL) != DDI_SUCCESS ) + { + NFP_STRAT_ERR(bp,ENXIO,"ddi_dma_sync(read_dma_handle) failed"); + return (0); + } + /* LINTED: alignment */ + count= *(unsigned int *)(pdev->read_buf+4); + count= FROM_LE32_MEM(&count); + nfp_log( NFP_DBG3, "nfp_strategy: read count %d", count); + if(count<0 || count>bp->b_bcount) { + NFP_STRAT_ERR(bp,ENXIO,"bad read byte count from device"); + nfp_log( NFP_DBG1, "nfp_strategy: bad read byte count (%d) from device", count); + return (0); + } + bp_mapin (bp); + bcopy( pdev->read_buf + 8, bp->b_un.b_addr, count ); + bp_mapout (bp); + } else { + bp_mapin (bp); + ne= pdev->cmddev->read_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx, &count ); + bp_mapout (bp); + if( ne != NFP_SUCCESS) { + NFP_STRAT_ERR (bp,nfp_oserr(ne),"read_block failed"); + return (0); + } + } + bioerror(bp, 0); + bp->b_resid = 0; + biodone (bp); + } else { + /* write */ + if (!pdev->wr_ready) { + NFP_STRAT_ERR (bp,ENXIO,"write called when not ready"); + return (0); + } + if (pdev->wr_bp) { + NFP_STRAT_ERR (bp,ENXIO,"wr_bp != NULL"); + return (0); + } + pdev->wrtimeout= timeout(nfp_wrtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + pdev->wr_bp = bp; + pdev->wr_ready = 0; + bp_mapin (bp); + ne= pdev->cmddev->write_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx); + if( ne != NFP_SUCCESS ) { + bp_mapout (bp); + (void) untimeout(pdev->wrtimeout); + pdev->wr_bp = 0; + pdev->wr_ready = 1; + NFP_STRAT_ERR (bp,nfp_oserr(ne),"write failed"); + return (0); + } + } + nfp_log( NFP_DBG2, "nfp_strategy: leaving"); + + return (0); +} + + +/*--------------------*/ +/* poll / select */ +/*--------------------*/ + +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) { + nfp_dev *pdev; + short revents; + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_chpoll: unable to get nfp_dev"); + *reventsp=0; + return (0); + } + nfp_log( NFP_DBG2, "nfp_chpoll: entered %x", events); + + revents=0; + if (events&POLLWRNORM) { + if (pdev->wr_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: write ready"); + revents|=POLLWRNORM; + } + } + + if (events&POLLRDNORM) { + if (pdev->rd_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: read ready"); + revents|=POLLRDNORM; + } + } + + if (!revents && !anyyet) { + *phpp=&pdev->pollhead; + } + *reventsp=revents; + + nfp_log( NFP_DBG2, "nfp_chpoll: leaving"); + return (0); +} + + +/*--------------------*/ +/* ioctl */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp) { + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_ioctl: entered." 
); + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_ioctl: unable to get nfp dev."); + return (ENXIO); + } + + switch (cmd) { + case NFDEV_IOCTL_ENQUIRY: + { + long *outp; + int outlen; + nfdev_enquiry_str enq_data; + + enq_data.busno = (unsigned int)-1; + enq_data.slotno = (unsigned char)-1; + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &outlen) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "ddi_getlongprop('reg') ok." ); + if( outlen > 0 ) { + enq_data.busno = ((*outp)>>16) & 0xff; + enq_data.slotno = ((*outp)>>11) & 0x1f; + nfp_log( NFP_DBG2, "busno %d, slotno %d.", + enq_data.busno, enq_data.slotno ); + } + } else + nfp_log( NFP_DBG1, "ddi_getlongprop('reg') failed." ); + + if( ddi_copyout( (char *)&enq_data, (void *)arg, sizeof(enq_data), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." ); + return EFAULT; + } + } + break; + + case NFDEV_IOCTL_ENSUREREADING: + { + unsigned int addr, len; + nfp_err ret; + if( ddi_copyin( (void *)arg, (char *)&len, sizeof(unsigned int), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + /* signal a read to the module */ + nfp_log( NFP_DBG2, "nfp_ioctl: signalling read request to module, len = %x.", len ); + if (len>8192) { + nfp_log( NFP_DBG1, "nfp_ioctl: len >8192 = %x.", len ); + return EINVAL; + } + if (pdev->rd_outstanding==1) { + nfp_log( NFP_DBG1, "nfp_ioctl: not about to call read with read outstanding."); + return EIO; + } + + addr= 0; + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + if( len > NFP_READBUF_SIZE ) { + nfp_log( NFP_DBG1, "nfp_ioctl: len > NFP_READBUF_SIZE = %x.", len ); + return EINVAL; + } + addr= pdev->read_dma_cookie.dmac_address; + } + + pdev->rd_outstanding = 1; + nfp_log( NFP_DBG2,"nfp_ioctl: pdev->rd_outstanding=1"); + + /* setup timeout timer */ + pdev->rdtimeout= timeout(nfp_rdtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + + nfp_log( NFP_DBG2, "nfp_ioctl: read request"); + ret = pdev->cmddev->ensure_reading(addr, len, pdev->common.cmdctx); + if ( ret != NFP_SUCCESS ) { + (void) untimeout(pdev->rdtimeout); + pdev->rdtimeout = 0; + pdev->rd_outstanding = 0; + nfp_log( NFP_DBG1, "nfp_ioctl : cmddev->ensure_reading failed "); + return nfp_oserr( ret ); + } + } + break; + + case NFDEV_IOCTL_PCI_IFVERS: + { + int vers; + + nfp_log( NFP_DBG2, "nfp_ioctl: NFDEV_IOCTL_PCI_IFVERS"); + + if( ddi_copyin( (void *)arg, (char *)&vers, sizeof(vers), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + + if( pdev->rd_outstanding ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d as read outstanding", vers); + return EIO; + } + + nfp_set_ifvers(pdev, vers); + if( pdev->ifvers != vers ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d", vers); + return EIO; + } + } + break; + + case NFDEV_IOCTL_STATS: + { + if( ddi_copyout( (char *)&(pdev->common.stats), + (void *)arg, + sizeof(nfdev_stats_str), + mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." ); + return EFAULT; + } + } + break; + + default: + nfp_log( NFP_DBG1, "nfp_ioctl: unknown ioctl." ); + return EINVAL; + } + + return 0; +} + +/*-------------------------*/ +/* nfp_open */ +/*-------------------------*/ + +/* ARGSUSED */ +int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + nfp_err ret; + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "entered nfp_open." 
); + + pdev = (nfp_dev *)ddi_get_soft_state(state_head, getminor(*dev)); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_open: unable to get nfp dev."); + return (ENODEV); + } + + if( otyp != OTYP_CHR ) { + nfp_log( NFP_DBG1, "nfp_open: not opened as character device"); + return (EINVAL); + } + + mutex_enter(&pdev->busy_mutex); + + if (pdev->busy) { + mutex_exit(&pdev->busy_mutex); + nfp_log( NFP_DBG1, "nfp_open: device busy"); + return EBUSY; + } + pdev->busy= 1; + mutex_exit(&pdev->busy_mutex); + + /* use oldest possible interface until told otherwise */ + pdev->ifvers= NFDEV_IF_STANDARD; + nfp_log( NFP_DBG3, "nfp_open: setting ifvers %d", pdev->ifvers); + pdev->rd_ready= 0; /* drop any old data */ + + ret = pdev->cmddev->open(pdev->common.cmdctx); + if( ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, "nfp_open : cmddev->open failed "); + return nfp_oserr( ret ); + } + + nfp_log( NFP_DBG2, "nfp_open: done"); + + return 0; +} + +/*--------------------*/ +/* nfp_close */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp) { + nfp_dev *pdev; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_close: entered"); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor(dev)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + mutex_enter(&pdev->isr_mutex); + if(pdev->rd_outstanding) { + int lbolt, err; + nfp_get_lbolt(&lbolt, err); + if(!err) + (void) cv_timedwait(&pdev->rd_cv, &pdev->isr_mutex, lbolt + (NFP_TIMEOUT_SEC * drv_usectohz(1000000)) ); + } + mutex_exit(&pdev->isr_mutex); + ret = pdev->cmddev->close(pdev->common.cmdctx); + if (ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, " nfp_close : cmddev->close failed"); + return nfp_oserr( ret ); + } + + mutex_enter(&pdev->busy_mutex); + pdev->busy= 0; + mutex_exit(&pdev->busy_mutex); + + return 0; +} + +/**************************************************************************** + + nfp driver config + + ****************************************************************************/ + +/*-------------------------*/ +/* nfp_getinfo */ +/*-------------------------*/ + +/* ARGSUSED */ +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { + int error; + nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_getinfo: entered" ); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor((dev_t)arg)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if (pdev == NULL) { + *result = NULL; + error = DDI_FAILURE; + } else { + /* + * don't need to use a MUTEX even though we are + * accessing our instance structure; dev->dip + * never changes. + */ + *result = pdev->dip; + error = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)getminor((dev_t)arg); + error = DDI_SUCCESS; + break; + default: + *result = NULL; + error = DDI_FAILURE; + } + + nfp_log( NFP_DBG2, "nfp_getinfo: leaving." 
); + return (error); +} + +/*-------------------------*/ +/* nfp_release */ +/*-------------------------*/ + +static int nfp_release_dev( dev_info_t *dip ) { + nfp_dev *pdev; + int instance, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_release_dev: entering" ); + + instance = ddi_get_instance(dip); + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if (pdev) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing device" ); + + nfp_free_pci_push(pdev); + + if( pdev->cmddev ) { + nfp_log( NFP_DBG3, "nfp_release_dev: destroying cmd dev" ); + ret = pdev->cmddev->destroy(pdev->common.cmdctx); + if (ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, " nfp_release_dev : cmddev->destroy failed "); + return nfp_oserr( ret ); + } + } + + if(pdev->high_iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing high and soft irq" ); + ddi_remove_softintr(pdev->soft_int_id); + ddi_remove_intr(pdev->dip, 0, pdev->high_iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + mutex_destroy( &pdev->high_mutex ); + } else if(pdev->iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing irq" ); + ddi_remove_intr(pdev->dip, 0, pdev->iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + } + if(pdev->low_iblock_cookie) { + ddi_remove_intr(pdev->dip, 0, pdev->low_iblock_cookie); + mutex_destroy( &pdev->low_mutex); + } + + for(i=0;i<6;i++) { + if( pdev->common.extra[i] ) { + nfp_log( NFP_DBG3, "nfp_release_dev: unmapping BAR %d", i ); + ddi_regs_map_free ((ddi_acc_handle_t *)&pdev->common.extra[i]); + } + } + + ddi_remove_minor_node(dip, NULL); + + if (pdev->conf_handle) + pci_config_teardown( &pdev->conf_handle ); + + ddi_soft_state_free(state_head, instance); + } + nfp_log( NFP_DBG2, "nfp_release: finished" ); + + return DDI_SUCCESS; +} + + +/*-------------------------*/ +/* nfp_attach */ +/*-------------------------*/ + +static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { + int instance; + nfp_dev *pdev = NULL; + int intres; + uint16_t device, vendor, sub_device, sub_vendor; + long *outp; + nfpcmd_dev const *cmddev; + int index, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_attach: entered." ); + + if (cmd != DDI_ATTACH) { + nfp_log( NFP_DBG1, "nfp_attach: bad command." ); + goto bailout; + } + + instance = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(state_head, instance) != 0) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_soft_state_zalloc() failed." ); + goto bailout; + } + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_attach: cannot find dev."); + return ENODEV; + } + pdev->dip = dip; + + /* map in pci config registers */ + if (pci_config_setup(dip, &pdev->conf_handle)) { + nfp_log( NFP_DBG1, "nfp_attach: pci_config_setup() failed." ); + goto bailout; + } + + /* find out what we have got */ + vendor= PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_VENID ); + device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_DEVID ); + sub_vendor = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBVENID ); + sub_device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBSYSID ); + + index= 0; + while( (cmddev = nfp_drvlist[index++]) != NULL ) { + if( cmddev->vendorid == vendor && + cmddev->deviceid == device && + cmddev->sub_vendorid == sub_vendor && + cmddev->sub_deviceid == sub_device ) + break; + } + if( !cmddev ) { + nfp_log( NFP_DBG1, "nfp_attach: unknonw device." 
); + goto bailout; + } + + /* map BARs */ + for( i=0; i<6; i++ ) { + if( cmddev->bar_sizes[i] ) { + off_t size; + if( ddi_dev_regsize(dip, i+1, &size) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_dev_regsize() failed for BAR %d", i ); + goto bailout; + } + if( size < (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) ) { + nfp_log( NFP_DBG1, "nfp_attach: BAR %d too small %x (%x)", i, size, (cmddev->bar_sizes[i] & ~0xF) ); + goto bailout; + } + if (ddi_regs_map_setup(dip, i+1, (caddr_t *)&pdev->common.bar[i], + 0, cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK, &nosw_attr, (ddi_acc_handle_t *)&pdev->common.extra[i] )) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_regs_map_setup() failed for BAR %d", i ); + goto bailout; + } + nfp_log( NFP_DBG3, "nfp_attach: BAR[%d] mapped to %x (%x)", i, pdev->common.bar[i], size ); + } + } + + pdev->read_buf = NULL; + pdev->rd_dma_ok = 0; + + /* attach to minor node */ + if (ddi_create_minor_node(dip, "nfp", S_IFCHR, instance, (char *)cmddev->name, 0) == DDI_FAILURE) { + ddi_remove_minor_node(dip, NULL); + nfp_log( NFP_DBG1, "nfp_attach: ddi_create_minor_node() failed." ); + goto bailout; + } + + pdev->wr_ready = 1; + pdev->rd_ready = 0; + pdev->rd_pending = 0; + pdev->rd_outstanding = 0; + pdev->busy=0; + pdev->cmddev= cmddev; + + ret = pdev->cmddev->create(&pdev->common); + if( ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: failed to create command device"); + goto bailout; + } + pdev->common.dev= pdev; + + if (ddi_intr_hilevel(dip, 0) != 0){ + nfp_log( NFP_DBG2, "nfp_attach: high-level interrupt"); + if( ddi_get_iblock_cookie(dip, 0, &pdev->high_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(high) failed." ); + goto bailout; + } + if( ddi_get_iblock_cookie(dip, 0, &pdev->low_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(low) failed." ); + goto bailout; + } + mutex_init(&pdev->high_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->high_iblock_cookie); + mutex_init(&pdev->low_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->low_iblock_cookie); + if (ddi_add_intr(dip, 0, NULL, + NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr(high) failed." ); + goto bailout; + } + if( ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_HIGH, + &pdev->iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(soft) failed." ); + goto bailout; + } + mutex_init(&pdev->isr_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->iblock_cookie); + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &pdev->soft_int_id, + &pdev->iblock_cookie, NULL, + nfp_soft_isr, (caddr_t)pdev) != DDI_SUCCESS) + goto bailout; + pdev->high_intr= 1; + } else { + nfp_log( NFP_DBG2, "nfp_attach: low-level interrupt"); + + if (ddi_get_iblock_cookie (dip, 0, &pdev->iblock_cookie)) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie() failed." ); + goto bailout; + } + + mutex_init(&pdev->isr_mutex, "nfp isr mutex", MUTEX_DRIVER, (void *)pdev->iblock_cookie); + + if (ddi_add_intr(dip, 0, NULL, + (ddi_idevice_cookie_t *)NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr() failed." ); + goto bailout; + } + } + mutex_init(&pdev->busy_mutex, "nfp busy mutex", MUTEX_DRIVER, NULL ); + cv_init(&pdev->rd_cv, "nfp read condvar", CV_DRIVER, NULL ); + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &intres) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "nfp_attach: ddi_getlongprop('reg') ok." 
); + if( intres > 0 ) { + nfp_log( NFP_DBG1, "nfp_attach: found PCI nfast bus %x slot %x.", + ((*outp)>>16) & 0xff, ((*outp)>>11) & 0x1f ); + } + } + + nfp_log( NFP_DBG2, "nfp_attach: attach succeeded." ); + return DDI_SUCCESS; + +bailout: + (void) nfp_release_dev( dip ); + + return DDI_FAILURE; +} + +/*-------------------------*/ +/* nfp_detach */ +/*-------------------------*/ + +/* + * When our driver is unloaded, nfp_detach cleans up and frees the resources + * we allocated in nfp_attach. + */ +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + (void) nfp_release_dev(dip); + + return (DDI_SUCCESS); +} + +/*-------------------------*/ +/* _init */ +/*-------------------------*/ + +int _init(void) { + register int error; + + nfp_log( NFP_DBG2, "_init: entered" ); + + if ((error = ddi_soft_state_init(&state_head, sizeof (struct nfp_dev), 1)) != 0) { + nfp_log( NFP_DBG1, "_init: soft_state_init() failed" ); + return (error); + } + + if ((error = mod_install(&modlinkage)) != 0) { + nfp_log( NFP_DBG1, "_init: mod_install() failed" ); + ddi_soft_state_fini(&state_head); + } + + nfp_log( NFP_DBG2, "_init: leaving" ); + return (error); +} + +/*-------------------------*/ +/* _info */ +/*-------------------------*/ + +int _info(struct modinfo *modinfop) { + nfp_log( NFP_DBG2, "_info: entered" ); + + return (mod_info(&modlinkage, modinfop)); +} + +/*-------------------------*/ +/* _fini */ +/*-------------------------*/ + +int _fini(void) { + int status; + + nfp_log( NFP_DBG2, "_fini: entered" ); + + if ((status = mod_remove(&modlinkage)) != 0) { + nfp_log( NFP_DBG2, "_fini: mod_remove() failed." ); + return (status); + } + + ddi_soft_state_fini(&state_head); + + nfp_log( NFP_DBG2, "_fini: leaving" ); + + return (status); +} + diff --git a/usr/src/uts/common/io/nfp/i21285.c b/usr/src/uts/common/io/nfp/i21285.c new file mode 100644 index 0000000000..f51a09188d --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.c @@ -0,0 +1,310 @@ +/* + +i21285.c: nCipher PCI HSM intel/digital 21285 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "i21285.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* create ------------------------------------------------------- */ + +static nfp_err i21285_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + nfp_log( NFP_DBG2, "i21285_create: enable doorbell"); + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, DOORBELL_ENABLE | POSTLIST_ENABLE); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21285_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, 
DOORBELL_DISABLE | POSTLIST_DISABLE ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_open( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21285_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + unsigned int doorbell; + unsigned int tmp32; + + nfp_log( NFP_DBG3, "i21285_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_isr: NULL pdev"); + return NFP_ENODEV; + } + + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 
1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + nfp_log( NFP_DBG1, "i21285_isr: unexpected interrupt %x", doorbell ); + TO_LE32_IO( &tmp32, 0xffff & doorbell ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + } + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + } + return 0; +} + +/* write ------------------------------------------------------- */ + +static nfp_err i21285_write( const char *block, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_write: NULL pdev"); + return NFP_ENODEV; + } + + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ MEMBAR ]= %x\n", cdev->bar[ MEMBAR ]); + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ IOBAR ]= %x\n", cdev->bar[ IOBAR ]); + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_write: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_write: length not written"); + return NFP_EIO; + } + + TO_LE32_IO( &tmp32, NFAST_INT_HOST_WRITE_REQUEST); + + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log( NFP_DBG2, "i21285_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21285_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21285_read: entered, len %d", len); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_read: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_read: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if(ne) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21285_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if( ne ) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_to_user_from_dev failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21285_read: done"); + *rcount= count; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21285_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure reading -------------------------------------------------- */ + +static nfp_err i21285_ensure_reading( unsigned int addr, int len, void *ctx ) { + 
nfp_cdev *cdev; + unsigned int hdr[2]; + unsigned int tmp32; + nfp_err ne; + + nfp_log( NFP_DBG2, "i21285_ensure_reading: entered"); + + if(addr) { + nfp_log( NFP_DBG2, "i21285_ensure_reading: bad addr"); + return -NFP_EINVAL; + } + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: null BAR[%d]", MEMBAR ); + return NFP_ENXIO; + } + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + TO_LE32_MEM( &hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM( &hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: len not written"); + return NFP_EIO; + }; + TO_LE32_IO( &tmp32, NFAST_INT_HOST_READ_REQUEST ); + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + return NFP_SUCCESS; +} + +/* command device structure ------------------------------------- */ + + +const nfpcmd_dev i21285_cmddev = { + "nCipher Gen 1 PCI", + PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285, + PCI_VENDOR_ID_NCIPHER, PCI_DEVICE_ID_NFAST_GEN1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21285_create, + i21285_destroy, + i21285_open, + i21285_close, + i21285_isr, + i21285_write, + i21285_read, + i21285_chupdate, + i21285_ensure_reading, + 0, /* no debug */ +}; + diff --git a/usr/src/uts/common/io/nfp/i21285.h b/usr/src/uts/common/io/nfp/i21285.h new file mode 100644 index 0000000000..4ea1d853ec --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.h @@ -0,0 +1,43 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_I21285_H +#define NFP_I21285_H + +#ifndef PCI_VENDOR_ID_DEC +#define PCI_VENDOR_ID_DEC 0x1011 +#endif +#ifndef PCI_DEVICE_ID_DEC_21285 +#define PCI_DEVICE_ID_DEC_21285 0x1065 +#endif +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_DEVICE_ID_NFAST_GEN1 +#define PCI_DEVICE_ID_NFAST_GEN1 0x0100 +#endif + +#define I21285_OFFSET_DOORBELL 0x60 +#define I21285_OFFSET_INTERRUPT_MASK 0x34 + +#define DOORBELL_ENABLE 0x0 +#define DOORBELL_DISABLE 0x4 + +#define POSTLIST_ENABLE 0x0 +#define POSTLIST_DISABLE 0x8 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x80 +#define MEMSIZE 0x100000 + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555.c b/usr/src/uts/common/io/nfp/i21555.c new file mode 100644 index 0000000000..82024dc800 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.c @@ -0,0 +1,423 @@ +/* + +i21555.c: nCipher PCI HSM intel 21555 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include 
"nfp_osif.h" +#include "i21555.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* started ------------------------------------------------------ + * + * Check that device is ready to talk, by checking that + * the i21555 has master enabled on its secondary interface + */ + +static nfp_err i21555_started( nfp_cdev *pdev ) { + unsigned int tmp32; +#ifdef CONFIGSPACE_DEBUG + unsigned int reg32[64]; + int i; +#endif + nfp_err ne; + + nfp_log( NFP_DBG2, "i21555_started: entered"); + +#ifdef CONFIGSPACE_DEBUG + /* Suck up all the registers */ + for (i=0; i < 64; i++) { + ne = nfp_config_inl( pdev, i*4, ®32[i] ); + } + + for (i=0; i < 16; i++) { + int j = i * 4; + nfp_log( NFP_DBG3, "i21555 config reg %2x: %08x %08x %08x %08x", j*4, + reg32[j], reg32[j+1], reg32[j+2], reg32[j+3]); + } +#endif + + ne = nfp_config_inl( pdev, I21555_CFG_SEC_CMD_STATUS, &tmp32 ); + if (ne) { + /* succeed if PCI config reads are not implemented */ + if (ne == NFP_EUNKNOWN) + return NFP_SUCCESS; + nfp_log( NFP_DBG1, "i21555_started: nfp_config_inl failed"); + return ne; + } + + tmp32= FROM_LE32_IO(&tmp32) & 0xffff; + + if ( tmp32 & CFG_CMD_MASTER ) { + nfp_log( NFP_DBG3, "i21555_started: Yes %x", tmp32); + return NFP_SUCCESS; + } else { + nfp_log( NFP_DBG1, "i21555_started: device not started yet %x", tmp32); + return NFP_ESTARTING; + } +} + +/* create ------------------------------------------------------- */ + +static nfp_err i21555_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + nfp_log( NFP_DBG2, "i21555_create: enable doorbell"); + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_ENABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21555_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_DISABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_open( void * ctx ) { + + nfp_log( NFP_DBG2, "i21555_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21555_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21555_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + nfp_err ne; + unsigned short doorbell; + unsigned short tmp16; + + nfp_log( NFP_DBG3, "i21555_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_isr: NULL pdev"); + return NFP_ENODEV; + } + + pdev->stats.isr++; + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_isr: 
null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + /* This interrupt may not be from our module, so check that it actually is + * us before handling it. + */ + ne = i21555_started( pdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_isr: i21555_started failed"); + } + return ne; + } + + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + pdev->stats.isr_write++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + pdev->stats.isr_read++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE16_IO(&tmp16,doorbell); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + nfp_log( NFP_DBG1, "i21555_isr: unexpected interrupt %x", doorbell ); + } + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + } + nfp_log( NFP_DBG3, "i21555_isr: exiting"); + return 0; +} + +/* write ------------------------------------------------------- */ + +static nfp_err i21555_write( const char *block, int len, void *ctx) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned short tmp16; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_write: NULL cdev"); + return NFP_ENODEV; + } + + cdev->stats.write_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_write: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne = i21555_started( cdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_write: i21555_started failed"); + } + return ne; + } + + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + nfp_log( NFP_DBG3, "i21555_write: block len %d", len ); + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_write: length not written"); + return 
NFP_EIO; + } + TO_LE16_IO(&tmp16, NFAST_INT_HOST_WRITE_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.write_fail--; + cdev->stats.write_block++; + cdev->stats.write_byte += len; + + nfp_log( NFP_DBG2, "i21555_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21555_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21555_read: entered"); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_read: NULL pdev"); + return NFP_ENODEV; + } + + cdev->stats.read_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_read: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21555_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_to_user failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21555_read: done"); + *rcount= count; + cdev->stats.read_fail--; + cdev->stats.read_block++; + cdev->stats.read_byte += len; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21555_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure reading -------------------------------------------------- */ + +static nfp_err i21555_ensure_reading( unsigned int addr, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[3]; + unsigned short tmp16; + unsigned int tmp32; + nfp_err ne; + int hdr_len; + + nfp_log( NFP_DBG2, "i21555_ensure_reading: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } + + cdev->stats.ensure_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne = i21555_started( cdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: i21555_started failed"); + } + return ne; + } + + nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + if(addr) { + nfp_log( NFP_DBG3, "i21555_ensure_reading: new format, addr %x", addr); + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL_PCI_PUSH); + TO_LE32_MEM(&hdr[1], len); + TO_LE32_MEM(&hdr[2], addr); + hdr_len= 12; + } else { + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + hdr_len= 8; + } + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, hdr_len); + if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: len not written"); + return NFP_EIO; + } + TO_LE16_IO( &tmp16, 
NFAST_INT_HOST_READ_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.ensure_fail--; + cdev->stats.ensure++; + + return NFP_SUCCESS; +} + +/* command device structure ------------------------------------- */ + +const nfpcmd_dev i21555_cmddev = { + "nCipher Gen 2 PCI", + PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_21555, + PCI_VENDOR_ID_NCIPHER, PCI_SUBSYSTEM_ID_NFAST_REV1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE_JOBS, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21555_create, + i21555_destroy, + i21555_open, + i21555_close, + i21555_isr, + i21555_write, + i21555_read, + i21555_chupdate, + i21555_ensure_reading, + i21555_debug, +}; diff --git a/usr/src/uts/common/io/nfp/i21555.h b/usr/src/uts/common/io/nfp/i21555.h new file mode 100644 index 0000000000..d8f3965938 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.h @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef I21555_H +#define I21555_H + +#ifndef PCI_VENDOR_ID_INTEL +#define PCI_VENDOR_ID_INTEL 0x8086 +#endif + +#ifndef PCI_DEVICE_ID_INTEL_21555 +#define PCI_DEVICE_ID_INTEL_21555 0xb555 +#endif + +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_SUBSYSTEM_ID_NFAST_REV1 +#define PCI_SUBSYSTEM_ID_NFAST_REV1 0x0100 +#endif + +#define I21555_OFFSET_DOORBELL_PRI_SET 0x9C +#define I21555_OFFSET_DOORBELL_SEC_SET 0x9E +#define I21555_OFFSET_DOORBELL_PRI_CLEAR 0x98 + +#define I21555_OFFSET_DOORBELL_PRI_SET_MASK 0xA4 +#define I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK 0xA0 + +#define I21555_DOORBELL_PRI_ENABLE 0x0000 +#define I21555_DOORBELL_PRI_DISABLE 0xFFFF + +#define I21555_CFG_SEC_CMD_STATUS 0x44 + +#define CFG_CMD_MASTER 0x0004 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x100 + +extern nfp_err i21555_debug( int cmd, void *ctx ); + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555d.c b/usr/src/uts/common/io/nfp/i21555d.c new file mode 100644 index 0000000000..183ace8275 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555d.c @@ -0,0 +1,28 @@ +/* + +i21555d.c: nCipher PCI HSM intel 21555 debug ioctl + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +15/05/2002 jsh Original, does nothing + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "i21555.h" + +/* ARGSUSED */ +nfp_err i21555_debug( int cmd, void *ctx) { + nfp_log( NFP_DBG1, "i21555_debug: entered"); + + return NFP_EUNKNOWN; +} diff --git a/usr/src/uts/common/io/nfp/nfdev-common.h b/usr/src/uts/common/io/nfp/nfdev-common.h new file mode 100644 index 0000000000..8a97bf2c63 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-common.h @@ -0,0 +1,141 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ +/** \file nfdev-common.h + * + * \brief nFast device driver (not generic SCSI) ioctl struct definition file + * include NFDEV-$(system) for ioctl number definitions + * + * 1998.07.13 jsh Started + * + * + */ + +#ifndef NFDEV_COMMON_H +#define NFDEV_COMMON_H + +/** + * Result of the ENQUIRY ioctl. 
+ */
+typedef struct nfdev_enquiry_str {
+ unsigned int busno; /**< Which bus is the PCI device on. */
+ unsigned char slotno; /**< Which slot is the PCI device in. */
+ unsigned char reserved[3]; /**< for consistent struct alignment */
+} nfdev_enquiry_str;
+
+/**
+ * Result of the STATS ioctl.
+ */
+typedef struct nfdev_stats_str {
+ unsigned long isr; /**< Count interrupts. */
+ unsigned long isr_read; /**< Count read interrupts. */
+ unsigned long isr_write; /**< Count write interrupts. */
+ unsigned long write_fail; /**< Count write failures. */
+ unsigned long write_block; /**< Count blocks written. */
+ unsigned long write_byte; /**< Count bytes written. */
+ unsigned long read_fail; /**< Count read failures. */
+ unsigned long read_block; /**< Count blocks read. */
+ unsigned long read_byte; /**< Count bytes read. */
+ unsigned long ensure_fail; /**< Count read request failures. */
+ unsigned long ensure; /**< Count read requests. */
+} nfdev_stats_str;
+
+/**
+ * Input to the CONTROL ioctl.
+ */
+typedef struct nfdev_control_str {
+ unsigned control; /**< Control flags. */
+} nfdev_control_str;
+
+/** Control bit indicating host supports MOI control */
+#define NFDEV_CONTROL_HOST_MOI 0x0001
+
+/** Index of control bits indicating desired mode
+ *
+ * Desired mode follows the M_ModuleMode enumeration.
+ */
+#define NFDEV_CONTROL_MODE_SHIFT 1
+
+/** Detect a backwards-compatible control value
+ *
+ * Returns true if the requested control value "makes no difference", i.e.
+ * the failure of an attempt to set it is therefore uninteresting.
+ */
+#define NFDEV_CONTROL_HARMLESS(c) ((c) <= 1)
+
+/**
+ * Result of the STATUS ioctl.
+ */
+typedef struct nfdev_status_str {
+ unsigned status; /**< Status flags. */
+ char error[8]; /**< Error string. */
+} nfdev_status_str;
+
+/** Monitor firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_MONITOR_MOI 0x0001
+
+/** Application firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_APPLICATION_MOI 0x0002
+
+/** Application firmware running and supports error reporting */
+#define NFDEV_STATUS_APPLICATION_RUNNING 0x0004
+
+/** HSM failed
+ *
+ * Consult error[] for additional information.
+ */
+#define NFDEV_STATUS_FAILED 0x0008
+
+/** Standard PCI interface. */
+#define NFDEV_IF_STANDARD 0x01
+
+/** PCI interface with results pushed from device
+ * via DMA.
+ */
+#define NFDEV_IF_PCI_PUSH 0x02
+
+/* platform independent base ioctl numbers */
+
+/** Enquiry ioctl.
+ * \return nfdev_enquiry_str describing the attached device. */
+#define NFDEV_IOCTL_NUM_ENQUIRY 0x01
+/** Channel Update ioctl.
+ * \deprecated */
+#define NFDEV_IOCTL_NUM_CHUPDATE 0x02
+/** Ensure Reading ioctl.
+ * Signal a read request to the device.
+ * \param (unsigned int) Length of data to be read.
+ */
+#define NFDEV_IOCTL_NUM_ENSUREREADING 0x03
+/** Device Count ioctl.
+ * Not implemented on all platforms.
+ * \return (int) the number of attached devices. */
+#define NFDEV_IOCTL_NUM_DEVCOUNT 0x04
+/** Internal Debug ioctl.
+ * Not implemented in release drivers. */
+#define NFDEV_IOCTL_NUM_DEBUG 0x05
+/** PCI Interface Version ioctl.
+ * \param (int) Maximum PCI interface version
+ * supported by the user of the device. */
+#define NFDEV_IOCTL_NUM_PCI_IFVERS 0x06
+/** Statistics ioctl.
+ * \return nfdev_stats_str containing the attached device's transfer statistics.
*/ +#define NFDEV_IOCTL_NUM_STATS 0x07 + +/** Module control ioctl + * \param (nfdev_control_str) Value to write to HSM control register + */ +#define NFDEV_IOCTL_NUM_CONTROL 0x08 + +/** Module state ioctl + * \return (nfdev_status_str) Values read from HSM status/error registers + */ +#define NFDEV_IOCTL_NUM_STATUS 0x09 + +#endif diff --git a/usr/src/uts/common/io/nfp/nfdev-solaris.h b/usr/src/uts/common/io/nfp/nfdev-solaris.h new file mode 100644 index 0000000000..923b902e46 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-solaris.h @@ -0,0 +1,37 @@ +/* + +nfdev-solaris.h: nFast solaris specific device ioctl interface. + +(C) Copyright nCipher Corporation Ltd 1998-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +14/07/1998 jsh Original + +*/ + +#ifndef NFDEV_SOLARIS_H +#define NFDEV_SOLARIS_H + +#include "nfdev-common.h" + +#define NFDEV_IOCTL_TYPE ('n'<<8) + +#define NFDEV_IOCTL_ENQUIRY ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENQUIRY ) +#define NFDEV_IOCTL_ENSUREREADING ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENSUREREADING ) +#define NFDEV_IOCTL_DEVCOUNT ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEVCOUNT ) +#define NFDEV_IOCTL_DEBUG ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEBUG ) +#define NFDEV_IOCTL_PCI_IFVERS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_PCI_IFVERS ) +#define NFDEV_IOCTL_STATS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_STATS ) + +#endif /* NFDEV_SOLARIS_H */ diff --git a/usr/src/uts/common/io/nfp/nfp.h b/usr/src/uts/common/io/nfp/nfp.h new file mode 100644 index 0000000000..9704f04fbc --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp.h @@ -0,0 +1,113 @@ +/* + +nfp.h: nFast PCI driver for Solaris 2.5, 2.6 and 2.7 + +(C) Copyright nCipher Corporation Ltd 2001-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv + +*/ + +#ifndef NFP_H +#define NFP_H + +#ifndef _KERNEL +#error Hello? 
this is a driver, please compile with -D_KERNEL +#endif + +#if ( CH_KERNELVER < 260 ) +typedef int ioctlptr_t; +typedef unsigned short uint16_t; +#define DDI_GET32 ddi_getl +#define DDI_PUT32 ddi_putl +#define DDI_GET16 ddi_getw +#define DDI_PUT16 ddi_putw +#define DDI_REP_GET8 ddi_rep_getb +#define DDI_REP_PUT8 ddi_rep_putb +#define DDI_REP_GET32 ddi_rep_getl +#define DDI_REP_PUT32 ddi_rep_putl +#define PCI_CONFIG_GET16 pci_config_getw +#else /* ( CH_KERNELVER >= 260 ) */ +typedef intptr_t ioctlptr_t; +#define DDI_GET32 ddi_get32 +#define DDI_PUT32 ddi_put32 +#define DDI_GET16 ddi_get16 +#define DDI_PUT16 ddi_put16 +#define DDI_REP_GET8 ddi_rep_get8 +#define DDI_REP_PUT8 ddi_rep_put8 +#define DDI_REP_GET32 ddi_rep_get32 +#define DDI_REP_PUT32 ddi_rep_put32 +#define PCI_CONFIG_GET16 pci_config_get16 +#endif + +#if ( CH_KERNELVER < 270 ) +typedef int nfp_timeout_t; +#define EXTRA_CB_FLAGS 0 +#define VSXPRINTF(s, n, format, ap) vsprintf (s, format, ap) +#else /* ( CH_KERNELVER >= 270 ) */ +typedef timeout_id_t nfp_timeout_t; +#define EXTRA_CB_FLAGS D_64BIT +#define VSXPRINTF(s, n, format, ap) vsnprintf(s, n, format, ap) +#endif + +typedef struct nfp_dev { + int rd_ok; + int wr_ok; + + int ifvers; + + /* for PCI push read interface */ + unsigned char *read_buf; + ddi_dma_handle_t read_dma_handle; + ddi_dma_cookie_t read_dma_cookie; + + ddi_acc_handle_t acchandle; + + int rd_dma_ok; + + nfp_timeout_t wrtimeout; + nfp_timeout_t rdtimeout; + + struct buf *wr_bp; + int wr_ready; + int rd_ready; + int rd_pending; + int rd_outstanding; + kcondvar_t rd_cv; + + struct pollhead pollhead; + dev_info_t *dip; + + ddi_iblock_cookie_t high_iblock_cookie; /* for mutex */ + ddi_iblock_cookie_t low_iblock_cookie; /* for mutex */ + kmutex_t high_mutex; + kmutex_t low_mutex; + int high_intr; + ddi_softintr_t soft_int_id; + int high_read; + int high_write; + + ddi_iblock_cookie_t iblock_cookie; /* for mutex */ + kmutex_t isr_mutex; + + kmutex_t busy_mutex; + int busy; + + ddi_acc_handle_t conf_handle; + + nfp_cdev common; + const nfpcmd_dev *cmddev; +} nfp_dev; + +extern struct nfp_dev *nfp_dev_list[]; + +#endif /* NFP_H */ diff --git a/usr/src/uts/common/io/nfp/nfp_cmd.h b/usr/src/uts/common/io/nfp/nfp_cmd.h new file mode 100644 index 0000000000..db8af0b2f9 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_cmd.h @@ -0,0 +1,68 @@ +/* + +nfp_cmd.h: nCipher PCI HSM command driver decalrations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFPCMD_H +#define NFPCMD_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* read and write called with userspace buffer */ + +typedef struct nfpcmd_dev { + const char *name; + unsigned short vendorid, deviceid, + sub_vendorid, sub_deviceid; + unsigned int bar_sizes[6]; /* includes IO bit */ + unsigned int flags; + nfp_err (*create)(struct nfp_cdev *pdev); + nfp_err (*destroy)(void * ctx); + nfp_err (*open)(void * ctx); + nfp_err (*close)(void * ctx); + nfp_err (*isr)(void *ctx, int *handled); + nfp_err (*write_block)( const char *ublock, int len, void *ctx ); + nfp_err (*read_block)( char *ublock, int len, void *ctx, int *rcount); + nfp_err (*channel_update)( char *data, int len, void *ctx); + nfp_err (*ensure_reading)( unsigned int addr, int len, void *ctx ); + nfp_err (*debug)( int cmd, void *ctx); +} nfpcmd_dev; + +#define NFP_CMD_FLG_NEED_IOBUF 0x1 + +/* list of all supported 
drivers ---------------------------------------- */ + +extern const nfpcmd_dev *nfp_drvlist[]; + +extern const nfpcmd_dev i21285_cmddev; +extern const nfpcmd_dev i21555_cmddev; +extern const nfpcmd_dev bcm5820_cmddev; + +#ifndef PCI_BASE_ADDRESS_SPACE_IO +#define PCI_BASE_ADDRESS_SPACE_IO 0x1 +#endif + +#define NFP_MAXDEV 16 + + +#define NFP_MEMBAR_MASK ~0xf +#define NFP_IOBAR_MASK ~0x3 +/* + This masks off the bottom bits of the PCI_CSR_BAR which signify that the + BAR is an IO BAR rather than a MEM BAR +*/ + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_common.h b/usr/src/uts/common/io/nfp/nfp_common.h new file mode 100644 index 0000000000..d1d2100fea --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_common.h @@ -0,0 +1,68 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_COMMON_H +#define NFP_COMMON_H + +#include <sys/types.h> +#include <sys/conf.h> + +typedef uint32_t UINT32; +typedef uint8_t BYTE; + +#define DEFINE_NFPCI_PACKED_STRUCTS +#include "nfpci.h" +#include "nfdev-solaris.h" + +typedef int oserr_t; + +#if CH_BIGENDIAN + +/* Big Endian Sparc */ + +#define SWP32(x) \ +( (((unsigned int)(x)>>24)&0xff) | (((unsigned int)(x)>>8)&0xff00) | (((unsigned int)(x)<<8)&0xff0000) | (((unsigned int)(x)<<24)&0xff000000) ) + +#define SWP16(x) ( (((x)>>8)&0xff) | (((x)<<8)&0xff00) ) + +#define FROM_LE32_IO(x) SWP32(*x) +#define TO_LE32_IO(x,y) *x=SWP32(y) + +#define FROM_LE32_MEM(x) SWP32(*x) +#define TO_LE32_MEM(x,y) *x=SWP32(y) + +#define FROM_LE16_IO(x) SWP16(*x) +#define TO_LE16_IO(x,y) *x=SWP16(y) + +#else + +/* Little Endian x86 */ + +#define FROM_LE32_IO(x) (*x) +#define TO_LE32_IO(x,y) (*x=y) + +#define FROM_LE32_MEM(x) (*x) +#define TO_LE32_MEM(x,y) (*x=y) + +#define FROM_LE16_IO(x) (*x) +#define TO_LE16_IO(x,y) (*x=y) + +#endif /* !CH_BIGENDIAN */ + +#include <sys/types.h> + +#if CH_KERNELVER == 260 +#define nfp_get_lbolt( lbolt, err ) err= drv_getparm( LBOLT, lbolt ) +#else +#define nfp_get_lbolt( lbolt, err ) { *lbolt= ddi_get_lbolt(); err= 0; } +#endif + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_error.h b/usr/src/uts/common/io/nfp/nfp_error.h new file mode 100644 index 0000000000..d64cb78fd4 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_error.h @@ -0,0 +1,48 @@ +/* + +nfp_error.h: nCipher PCI HSM error handling + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +05/12/2001 jsh Original + +*/ + +#ifndef NFP_ERROR_H +#define NFP_ERROR_H + +#include "nfp_common.h" + +#define NFP_SUCCESS 0x0 +#define NFP_EFAULT 0x1 +#define NFP_ENOMEM 0x2 +#define NFP_EINVAL 0x3 +#define NFP_EIO 0x4 +#define NFP_ENXIO 0x5 +#define NFP_ENODEV 0x6 +#define NFP_EINTR 0x7 +#define NFP_ESTARTING 0x8 +#define NFP_EAGAIN 0x9 +#define NFP_EUNKNOWN 0x100 + +typedef int nfp_err; + +extern oserr_t nfp_oserr( nfp_err nerr ); +extern nfp_err nfp_error( oserr_t oerr ); + +#define nfr( x) \ + return nfp_error((x)) + +#define nfer(x, fn, msg) \ + { oserr_t err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return nfp_error(err); } } + +#define er(x, fn, msg ) \ +{ nfp_err err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return err; } } + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_hostif.h b/usr/src/uts/common/io/nfp/nfp_hostif.h new file mode 100644 index 
0000000000..3e7d8187e5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_hostif.h @@ -0,0 +1,54 @@ +/* + +nfp_hostif.h: nCipher PCI HSM host interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_HOSTIF_H +#define NFP_HOSTIF_H + +#include "nfdev-common.h" + +struct nfp_dev; + +/* common device structure */ + +typedef struct nfp_cdev { + unsigned char *bar[6]; + void *extra[6]; + + int busno; + int slotno; + + void *cmdctx; + + char *iobuf; + + struct nfp_dev* dev; + + struct nfdev_stats_str stats; + +} nfp_cdev; + +/* callbacks from command drivers -------------------------------------- */ + +void nfp_read_complete( struct nfp_dev *pdev, int ok); +void nfp_write_complete( struct nfp_dev *pdev, int ok); + +#define NFP_READ_MAX (8 * 1024) +#define NFP_READBUF_SIZE (NFP_READ_MAX + 8) +#define NFP_TIMEOUT_SEC 10 + +#define NFP_DRVNAME "nCipher nFast PCI driver" + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_ifvers.c b/usr/src/uts/common/io/nfp/nfp_ifvers.c new file mode 100644 index 0000000000..807b4f24c5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_ifvers.c @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* + * nfp_ifervs.c - common pci interface versioning + * + * uses: + * + * int pdev->ifvers + * device interface version + * + * int nfp_ifvers + * interface version limit + * + * int nfp_alloc_pci_push( nfp_dev *pdev ) + * allocates resources needed for PCI Push, + * if not already allocated, and return True if successful + * + * void nfp_free_pci_push( nfp_dev *pdev ) { + * frees any resources allocated to PCI Push + */ + +void nfp_set_ifvers( nfp_dev *pdev, int vers ) { + if( nfp_ifvers != 0 && vers > nfp_ifvers ) { + nfp_log( NFP_DBG2, + "nfp_set_ifvers: can't set ifvers %d" + " as nfp_ifvers wants max ifvers %d", + vers, nfp_ifvers); + return; + } + if( vers >= NFDEV_IF_PCI_PUSH ) { + if(!nfp_alloc_pci_push(pdev)) { + nfp_log( NFP_DBG1, + "nfp_set_ifvers: can't set ifvers %d" + " as resources not available", + vers); + return; + } + } else { + nfp_free_pci_push(pdev); + } + pdev->ifvers= vers; + nfp_log( NFP_DBG3, "nfp_set_ifvers: setting ifvers %d", vers); +} diff --git a/usr/src/uts/common/io/nfp/nfp_osif.h b/usr/src/uts/common/io/nfp/nfp_osif.h new file mode 100644 index 0000000000..17ffe469ce --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_osif.h @@ -0,0 +1,105 @@ +/* + +nfp_osif.h: nCipher PCI HSM OS interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_OSIF_H +#define NFP_OSIF_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* general typedefs ----------------------------------------------- */ + +typedef volatile unsigned int reg32; +typedef volatile unsigned short reg16; +typedef volatile unsigned char reg8; + +/* sempaphores, mutexs and events --------------------------------- */ + +#if 0 +extern nfp_err nfp_sema_init( nfp_sema *sema, int initial); +extern void nfp_sema_destroy( nfp_sema *sema ); +extern void nfp_sema_post( nfp_sema *sema ); +extern void nfp_sema_wait( 
nfp_sema *sema ); +extern int nfp_sema_wait_sig( nfp_sema *sema ); + +extern nfp_err nfp_mutex_init( nfp_mutex *mutex ); +extern void nfp_mutex_destroy( nfp_mutex *mutex ); +extern void nfp_mutex_enter( nfp_mutex *mutex ); +extern void nfp_mutex_exit( nfp_mutex *mutex ); + +extern nfp_err nfp_event_init( nfp_event *event ); +extern void nfp_event_destroy( nfp_event *event ); +extern void nfp_event_set( nfp_event *event ); +extern void nfp_event_clear( nfp_event *event ); +extern void nfp_event_wait( nfp_event *event ); +extern void nfp_event_wait_sig( nfp_event *event ); + +#endif + +/* timeouts ------------------------------------------------------ */ + +extern void nfp_sleep( int ms ); + +/* memory handling ----------------------------------------------- */ + +#define KMALLOC_DMA 0 +#define KMALLOC_CACHED 1 + +extern void *nfp_kmalloc( int size, int flags ); +extern void *nfp_krealloc( void *ptr, int size, int flags ); +extern void nfp_kfree( void * ); + +/* config space access ------------------------------------------------ */ + +/* return Little Endian 32 bit config register */ +extern nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ); + +/* io space access ------------------------------------------------ */ + +extern unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ); +extern unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ); +extern void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ); +extern void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ); + +/* user and device memory space access ---------------------------- */ + +/* NB these 2 functions are not guarenteed to be re-entrant for a given device */ +extern nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len); +extern nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len); + +extern nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len ); +extern nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len ); + +extern nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len ); +extern nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len); + +/* debug ------------------------------------------------------------ */ + +#define NFP_DBG1 1 +#define NFP_DBGE NFP_DBG1 +#define NFP_DBG2 2 +#define NFP_DBG3 3 +#define NFP_DBG4 4 + +#ifdef STRANGE_VARARGS +extern void nfp_log(); +#else +extern void nfp_log( int severity, const char *format, ...); +#endif + +extern int nfp_debug; + +#endif diff --git a/usr/src/uts/common/io/nfp/nfpci.h b/usr/src/uts/common/io/nfp/nfpci.h new file mode 100644 index 0000000000..793f5995e6 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfpci.h @@ -0,0 +1,171 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* +* +* NFPCI.H - nFast PCI interface definition file +* +* +* +* 1998.06.09 IH Started +* +* The interface presented by nFast PCI devices consists of: +* +* A region of shared RAM used for data transfer & control information +* A doorbell interrupt register, so both sides can give each other interrupts +* A number of DMA channels for transferring data +*/ + +#ifndef NFPCI_H +#define NFPCI_H + +/* Sizes of some regions */ +#define NFPCI_RAM_MINSIZE 0x00100000 +/* This is the minimum size of shared RAM. 
In future it may be possible to + negotiate larger sizes of shared RAM or auto-detect how big it is */ +#define NFPCI_RAM_MINSIZE_JOBS 0x00020000 /* standard jobs only */ +#define NFPCI_RAM_MINSIZE_KERN 0x00040000 /* standard and kernel jobs */ + +/* Offsets within shared memory space. + The following main regions are: + jobs input area + jobs output area + kernel jobs input area + kernel output area +*/ + +#define NFPCI_OFFSET_JOBS 0x00000000 +#define NFPCI_OFFSET_JOBS_WR 0x00000000 +#define NFPCI_OFFSET_JOBS_RD 0x00010000 +#define NFPCI_OFFSET_KERN 0x00020000 +#define NFPCI_OFFSET_KERN_WR 0x00020000 +#define NFPCI_OFFSET_KERN_RD 0x00030000 + +/* Interrupts, defined by bit position in doorbell register */ + +/* Interrupts from device to host */ +#define NFAST_INT_DEVICE_WRITE_OK 0x00000001 +#define NFAST_INT_DEVICE_WRITE_FAILED 0x00000002 +#define NFAST_INT_DEVICE_READ_OK 0x00000004 +#define NFAST_INT_DEVICE_READ_FAILED 0x00000008 +#define NFAST_INT_DEVICE_KERN_WRITE_OK 0x00000010 +#define NFAST_INT_DEVICE_KERN_WRITE_FAILED 0x00000020 +#define NFAST_INT_DEVICE_KERN_READ_OK 0x00000040 +#define NFAST_INT_DEVICE_KERN_READ_FAILED 0x00000080 + +/* Interrupts from host to device */ +#define NFAST_INT_HOST_WRITE_REQUEST 0x00010000 +#define NFAST_INT_HOST_READ_REQUEST 0x00020000 +#define NFAST_INT_HOST_DEBUG 0x00040000 +#define NFAST_INT_HOST_KERN_WRITE_REQUEST 0x00080000 +#define NFAST_INT_HOST_KERN_READ_REQUEST 0x00100000 + +/* Ordinary job submission ------------------------ */ + +/* The NFPCI_OFFSET_JOBS_WR and NFPCI_OFFSET_JOBS_RD regions are defined + by the following (byte) address offsets... */ + +#define NFPCI_OFFSET_CONTROL 0x0 +#define NFPCI_OFFSET_LENGTH 0x4 +#define NFPCI_OFFSET_DATA 0x8 +#define NFPCI_OFFSET_PUSH_ADDR 0x8 + +#define NFPCI_JOBS_WR_CONTROL (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_WR_LENGTH (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_WR_DATA (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_JOBS_WR_LEN (0x0000FFF8) + +#define NFPCI_JOBS_RD_CONTROL (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_RD_LENGTH (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_RD_DATA (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_JOBS_RD_PUSH_ADDR (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_JOBS_RD_LEN (0x000FFF8) + +/* Kernel inferface job submission ---------------- */ + +#define NFPCI_KERN_WR_CONTROL (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_WR_LENGTH (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_WR_DATA (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_KERN_WR_LEN (0x0000FFF8) + +#define NFPCI_KERN_RD_CONTROL (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_RD_LENGTH (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_RD_DATA (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_KERN_RD_ADDR (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_KERN_RD_LEN (0x000FFF8) + +#ifdef DEFINE_NFPCI_PACKED_STRUCTS +typedef struct +{ + UINT32 controlword; + UINT32 length; /* length of data to follow */ + union { + BYTE data[1]; + UINT32 addr; + } uu; +} + NFPCI_JOBS_BLOCK; +#endif + + +#define NFPCI_JOB_CONTROL 0x00000001 +#define NFPCI_JOB_CONTROL_PCI_PUSH 0x00000002 +/* + The 'Control' word is analogous to the SCSI read/write 
address; + 1 = standard push/pull IO + 2 = push/push IO + + To submit a block of job data, the host: + - sets the (32-bit, little-endian) word at NFPCI_JOBS_WR_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_WR_LENGTH to the length of the data + - copies the data to NFPCI_JOBS_WR_DATA + - sets interrupt NFAST_INT_HOST_WRITE_REQUEST in the doorbell register + - awaits the NFAST_INT_DEVICE_WRITE_OK (or _FAILED) interrupts back + + To read a block of jobs back, the host: + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from NFPCI_JOBS_RD_DATA; the module will set the word at + NFPCI_JOBS_RD_LENGTH to its actual length. + + Optionally the host can request the PCI read data to be pushed to host PCI mapped ram: + - allocates a contiguous PCI addressable buffer for a NFPCI_JOBS_BLOCK of max + size NFPCI_MAX_JOBS_RD_LEN (or NFPCI_MAX_KERN_RD_LEN) + 8 + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL_PCI_PUSH + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets the word at NFPCI_JOBS_RD_PUSH_ADDR to be the host PCI address of + the buffer + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from the buffer at NFPCI_OFFSET_DATA in the buffer. The + module will set NFPCI_OFFSET_LENGTH to the actual length. +*/ + +#define NFPCI_SCRATCH_CONTROL 0 + +#define NFPCI_SCRATCH_CONTROL_HOST_MOI (1<<0) +#define NFPCI_SCRATCH_CONTROL_MODE_SHIFT 1 +#define NFPCI_SCRATCH_CONTROL_MODE_MASK (3<<NFPCI_SCRATCH_CONTROL_MODE_SHIFT) + +#define NFPCI_SCRATCH_STATUS 1 + +#define NFPCI_SCRATCH_STATUS_MONITOR_MOI (1<<0) +#define NFPCI_SCRATCH_STATUS_APPLICATION_MOI (1<<1) +#define NFPCI_SCRATCH_STATUS_APPLICATION_RUNNING (1<<2) +#define NFPCI_SCRATCH_STATUS_ERROR (1<<3) + +#define NFPCI_SCRATCH_ERROR_LO 2 +#define NFPCI_SCRATCH_ERROR_HI 3 + +#endif diff --git a/usr/src/uts/common/io/nfp/osif.c b/usr/src/uts/common/io/nfp/osif.c new file mode 100644 index 0000000000..fba62f9a37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/osif.c @@ -0,0 +1,184 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" +#include "nfp.h" +#include "autoversion.h" + +/* config space access ---------------------------------- */ + +nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ) { + unsigned int tmp32; + if ( !pdev || !pdev->dev || !pdev->dev->conf_handle ) + return NFP_ENODEV; + +/* pci_config_get32() does byte swapping, so put back to LE */ + tmp32 = pci_config_get32( pdev->dev->conf_handle, offset ); + TO_LE32_IO(res, tmp32); + + return NFP_SUCCESS; +} + +/* user space memory access ---------------------------------- */ + +nfp_err 
nfp_copy_from_user( char *kbuf, const char *ubuf, int len) { + bcopy(ubuf, kbuf, len); + return 0; +} + +nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len) { + bcopy(kbuf, ubuf, len); + return 0; +} + +nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying from kernel mem */ + return nfp_copy_to_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying to kernel mem */ + return nfp_copy_from_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_GET8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR); + else + /* LINTED: alignment */ + DDI_REP_GET32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR); + return NFP_SUCCESS; +} + +nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_PUT8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR ); + else + /* LINTED: alignment */ + DDI_REP_PUT32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR ); + return NFP_SUCCESS; +} + +/* pci io space access --------------------------------------- */ + +unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inl: addr %x", (uintptr_t) pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET32( pdev->extra[bar], (uint32_t *)(pdev->bar[bar] + offset) ); +} + +unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inw: addr %x", (uintptr_t) pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset) ); +} + +void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT32( pdev->extra[bar], (uint32_t *)(pdev->bar[ bar ] + offset), data ); +} + +void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset), data ); +} + +/* logging ---------------------------------------------------- */ + +void nfp_log( int level, const char *fmt, ...) +{ + auto char buf[256]; + va_list ap; + + switch (level) { + case NFP_DBG4: if (nfp_debug < 4) break; + /*FALLTHROUGH*/ + case NFP_DBG3: if (nfp_debug < 3) break; + /*FALLTHROUGH*/ + case NFP_DBG2: if (nfp_debug < 2) break; + /*FALLTHROUGH*/ + case NFP_DBG1: if (nfp_debug < 1) break; + /*FALLTHROUGH*/ + default: + va_start(ap, fmt); + (void) vsnprintf(buf, 256, fmt, ap); + va_end(ap); + cmn_err(CE_CONT, "!" 
VERSION_COMPNAME " " VERSION_NO ": %s\n", buf); + break; + } +} + +struct errstr { + int oserr; + nfp_err nferr; +}; + + +static struct errstr errtab[] = { + { EFAULT, NFP_EFAULT }, + { ENOMEM, NFP_ENOMEM }, + { EINVAL, NFP_EINVAL }, + { EIO, NFP_EIO }, + { ENXIO, NFP_ENXIO }, + { ENODEV, NFP_ENODEV }, + { EINVAL, NFP_EUNKNOWN }, + { 0, 0 } +}; + +nfp_err nfp_error( int oserr ) +{ + struct errstr *perr; + if(!oserr) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->oserr == oserr) + return perr->nferr; + perr++; + } + return NFP_EUNKNOWN; +} + +int nfp_oserr( nfp_err nferr ) +{ + struct errstr *perr; + if(nferr == NFP_SUCCESS) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->nferr == nferr) + return perr->oserr; + perr++; + } + return EIO; +} diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c new file mode 100644 index 0000000000..2ad3f4f591 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -0,0 +1,2184 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay Devices + * + * Overlay devices provide a means for creating overlay networks, a means of + * multiplexing multiple logical, isolated, and discrete layer two and layer + * three networks on top of one physical network. + * + * In general, these overlay devices encapsulate the logic to answer two + * different questions: + * + * 1) How should I transform a packet to put it on the wire? + * 2) Where should I send a transformed packet? + * + * Each overlay device is presented to the user as a GLDv3 device. While the + * link itself cannot have an IP interface created on top of it, it allows for + * additional GLDv3 devices, such as a VNIC, to be created on top of it which + * can be plumbed up with IP interfaces. + * + * + * -------------------- + * General Architecture + * -------------------- + * + * The logical overlay device that a user sees in dladm(1M) is a combination of + * two different components that work together. The first component is this + * kernel module, which is responsible for answering question one -- how should + * I transform a packet to put it on the wire. + * + * The second component is what we call the virtual ARP daemon, or varpd. It is + * a userland component that is responsible for answering the second question -- + * Where should I send a transformed packet. Instances of the kernel overlay + * GLDv3 device ask varpd the question of where should a packet go. + * + * The split was done for a few reasons. Importantly, we wanted to keep the act + * of generating encapsulated packets in the kernel so as to ensure that the + * general data path was fast and also kept simple. On the flip side, while the + * question of where should something go may be simple, it may often be + * complicated and need to interface with several different external or + * distributed systems. In those cases, it's simpler to allow for the full + * flexibility of userland to be brought to bear to solve that problem and in + * general, the path isn't very common. 
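To make the division concrete, the sketch below condenses the transmit path that overlay_m_tx() implements later in this file: question two is answered by the target cache (and, on a miss, varpd), and question one by the encapsulation plugin's encap entry point. This is an illustrative condensation only, not the driver's actual function; the real code loops over a message chain, holds odd_lock, and accounts for in-progress I/O.

/*
 * Condensed sketch of the two questions on the transmit side; see
 * overlay_m_tx() below for the real logic (locking, chains, accounting).
 */
static int
overlay_tx_sketch(overlay_dev_t *odd, mblk_t *mp)
{
	struct sockaddr_storage storage;
	socklen_t slen;
	ovep_encap_info_t einfo;
	struct msghdr hdr;
	mblk_t *ep = NULL;

	bzero(&hdr, sizeof (hdr));
	bzero(&einfo, sizeof (einfo));
	einfo.ovdi_id = odd->odd_vid;

	/* Question two: where should this packet go? */
	if (overlay_target_lookup(odd, mp, (struct sockaddr *)&storage,
	    &slen) != OVERLAY_TARGET_OK)
		return (-1);	/* queued for varpd or dropped; see overlay_m_tx() */

	/* Question one: how should it be transformed for the wire? */
	if (odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, &einfo,
	    &ep) != 0 || ep == NULL) {
		freemsg(mp);
		return (-1);
	}

	/* Ship the encapsulated chain out the shared mux's ksocket. */
	hdr.msg_name = &storage;
	hdr.msg_namelen = slen;
	return (overlay_mux_tx(odd->odd_mux, &hdr, ep));
}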
+ * + * The following is what makes up the logical overlay device that a user would + * create with dladm(1M). + * + * Kernel Userland + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * . +--------+ +--------+ +--------+ . . . + * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . . + * . +--------+ +--------+ +--------+ . . . + * . | | | . . . + * . | | | . . . + * . +------------+-----------+ . . . + * . | . . /dev/overlay . + * . +--------------+ . . . +------------+ . + * . | | . . . | | . + * . | Overlay |======*=================| Virtual | . + * . | GLDv3 Device |========================| ARP Daemon | . + * . | | . . | | . + * . +--------------+ . . +------------+ . + * . | . . | . + * . | . . | . + * . +----------------+ . . +--------+ . + * . | Overlay | . . | varpd | . + * . | Encapsulation | . . | Lookup | . + * . | Plugin | . . | Plugin | . + * . +----------------+ . . +--------+ . + * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * + * + * This image shows the two different components and where they live. + * Importantly, it also shows that both the kernel overlay device and the + * userland varpd both support plugins. The plugins actually implement the + * things that users care about and the APIs have been designed to try to + * minimize the amount of things that a module writer needs to worry about it. + * + * IDENTIFIERS + * + * Every overlay device is defined by a unique identifier which is the overlay + * identifier. Its purpose is similar to that of a VLAN identifier, it's a + * unique number that is used to differentiate between different entries on the + * wire. + * + * ENCAPSULATION + * + * An overlay encapsulation plugin is a kernel miscellaneous module whose + * purpose is to contain knowledge about how to transform packets to put them + * onto the wire and to take them off. An example of an encapsulation plugin is + * vxlan. It's also how support for things like nvgre or geneve would be brought + * into the system. + * + * Each encapsulation plugins defines a series of operation vectors and + * properties. For the full details on everything they should provide, please + * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible + * for telling the system what information is required to send a packet. For + * example, vxlan is defined to send everything over a UDP packet and therefore + * requires a port and an IP address, while nvgre on the other hand is its own + * IP type and therefore just requires an IP address. In addition, it also + * provides information about the kind of socket that should be created. This is + * used by the kernel multiplexor, more of that in the Kernel Components + * section. + * + * LOOKUPS + * + * The kernel communicates requests for lookups over the character device + * /dev/overlay. varpd is responsible for listening for requests on that device + * and answering them. The character device is specific to the target path and + * varpd. + * + * Much as the kernel overlay module handles the bulk of the scaffolding but + * leaves the important work to the encapsulation plugin, varpd provides a + * similar role and leaves the full brunt of lookups to a userland dynamic + * shared object which implements the logic of lookups. + * + * Each lookup plugin defines a series of operation vectors and properties. For + * the full details on everything that they should provide, please read + * lib/varpd/libvarpd/libvarpd_provider.h. 
Essentially, they are given a MAC + * address and asked to give an address on the physical network that it should + * be sent to. In addition, they handle questions related to how to handle + * things like broadcast and multicast traffic, etc. + * + * ---------- + * Properties + * ---------- + * + * A device from a dladm perspective has a unique set of properties that are + * combined from three different sources: + * + * 1) Generic properties that every overlay device has + * 2) Properties that are specific to the encapsulation plugin + * 3) Properties that are specific to the lookup plugin + * + * All of these are exposed in a single set of properties in dladm. Note that + * these are not necessarily traditional link properties. However, if something + * is both a traditional GLDv3 link property, say the MTU of a device, and a + * specific property here, than the driver ensures that all existing GLDv3 + * specific means of manipulating it are used and wraps up its private property + * interfaces to ensure that works. + * + * Properties in the second and third category are prefixed with the name of + * their module. For example, the vxlan encapsulation module has a property + * called the 'listen_ip'. This property would show up in dladm as + * 'vxlan/listen_ip'. This allows different plugins to both use similar names + * for similar properties and to also have independent name spaces so that + * overlapping names do not conflict with anything else. + * + * While the kernel combines both sets one and two into a single coherent view, + * it does not do anything with respect to the properties that are owned by the + * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in + * charge of bridging these two worlds into one magical experience for the user. + * It carries the burden of knowing about both overlay specific and varpd + * specific properties. Importantly, we want to maintain this distinction. We + * don't want to treat the kernel as an arbitrary key/value store for varpd and + * we want the kernel to own its own data and not have to ask userland for + * information that it owns. + * + * Every property in the system has the following attributes: + * + * o A name + * o A type + * o A size + * o Permissions + * o Default value + * o Valid value ranges + * o A value + * + * Everything except for the value is obtained by callers through the propinfo + * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX, + * currently 256 bytes. + * + * The following are the supported types of properties: + * + * OVERLAY_PROP_T_INT + * + * A signed integer, its length is 8 bytes, corresponding to a + * int64_t. + * + * OVERLAY_PROP_T_UINT + * + * An unsigned integer, its length is 8 bytes, corresponding to a + * uint64_t. + * + * OVERLAY_PROP_T_IP + * + * A struct in6_addr, it has a fixed size. + * + * OVERLAY_PROP_T_STRING + * + * A null-terminated character string encoded in either ASCII or + * UTF-8. Note that the size of the string includes the null + * terminator. + * + * The next thing that we apply to a property is its permission. The permissions + * are put together by the bitwise or of the following flags and values. + * + * OVERLAY_PROP_PERM_REQ + * + * This indicates a required property. A property that is required + * must be set by a consumer before the device can be created. If a + * required property has a default property, this constraint is + * loosened because the default property defines the value. 
+ * + * OVERLAY_PROP_PERM_READ + * + * This indicates that a property can be read. All properties will + * have this value set. + * + * OVERLAY_PROP_PERM_WRITE + * + * This indicates that a property can be written to and thus + * updated by userland. Properties that are only intended to + * display information will not have OVERLAY_PROP_PERM_WRITE set. + * + * In addition, a few additional values are defined as a convenience to + * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of + * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second, + * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ, + * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a + * property should generally be a constant across its lifetime. + * + * A property may optionally have a default value. If it does have a default + * value, and that property is not set to be a different value, then the default + * value is inherited automatically. It also means that if the default value is + * acceptable, there is no need to set the value for a required property. For + * example, the vxlan module has the vxlan/listen_port property which is + * required, but has a default value of 4789 (the IANA assigned port). Because + * of that default value, there is no need for it to be set. + * + * Finally, a property may declare a list of valid values. These valid values + * are used for display purposes; they are not enforced by the broader system, + * but merely allow a means for the information to be communicated to the user + * through dladm(1M). Like a default value, this is optional. + * + * The general scaffolding does not do very much with respect to the getting and + * setting of properties. That is really owned by the individual plugins + * themselves. + * + * ----------------------------- + * Destinations and Plugin Types + * ----------------------------- + * + * Both encapsulation and lookup plugins define the kinds of destinations that + * they know how to support. There are three different pieces of information + * that can currently be used to address a destination, all of which are + * summarized in the type overlay_point_t. Any combination of these is + * supported. + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * An Ethernet MAC address is required. + * + * OVERLAY_PLUGIN_D_IP + * + * An IP address is required. All IP addresses used by the overlay + * system are transmitted as IPv6 addresses. IPv4 addresses can be + * represented by using IPv4-mapped IPv6 addresses. + * + * OVERLAY_PLUGIN_D_PORT + * + * A TCP/UDP port is required. + * + * A kernel encapsulation plugin declares which of these it requires; it's + * a static set. On the other hand, a userland lookup plugin can be built to + * support all of these or any combination thereof. It gets passed the required + * destination type, based on the kernel encapsulation method, and then it makes + * the determination as to whether or not it supports it. For example, the + * direct plugin can support either an IP or both an IP and a port; it simply + * doesn't display the direct/dest_port property in the cases where a port is + * not required to support this. + * + * The user lookup plugins have two different modes of operation which + * determine how they interact with the broader system and how lookups are + * performed. These types are: + * + * OVERLAY_TARGET_POINT + * + * A point to point plugin has a single static definition for where + * to send all traffic.
Every packet in the system always gets sent + * to the exact same destination which is programmed into the + * kernel when the general device is activated. + * + * OVERLAY_TARGET_DYNAMIC + * + * A dynamic plugin does not have a single static definition. + * Instead, for each destination, the kernel makes an asynchronous + * request to varpd to determine where the packet should be routed, + * and if a specific destination is found, then that destination is + * cached in the overlay device's target cache. + * + * This distinction, while important for the general overlay device's operation, + * is not important to the encapsulation plugins. They don't need to know about + * any of these pieces. It's just a concern for varpd, the userland plugin, and + * the general overlay scaffolding. + * + * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not + * maintain a target cache, and instead just keeps track of the destination and + * always sends encapsulated packets to that address. When the target type is of + * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such + * destinations. These destinations are kept around in an instance of a + * reference hash that is specific to the given overlay device. Entries in the + * cache can be invalidated and replaced by varpd and its lookup plugins. + * + * ---------------------------------- + * Kernel Components and Architecture + * ---------------------------------- + * + * There are multiple pieces inside the kernel that work together, there is the + * general overlay_dev_t structure, which is the logical GLDv3 device, but it + * itself has references to things like an instance of an encapsulation plugin, + * a pointer to a mux and a target cache. It can roughly be summarized in the + * following image: + * + * +------------------+ + * | global | + * | overlay list | + * | overlay_dev_list | + * +------------------+ + * | + * | +-----------------------+ +---------------+ + * +->| GLDv3 Device |----------->| GLDv3 Device | -> ... 
+ * | overlay_dev_t | | overlay_dev_t | + * | | +---------------+ + * | | + * | mac_handle_t -----+---> GLDv3 handle to MAC + * | datalink_id_t -----+---> Datalink ID used by DLS + * | overlay_dev_flag_t ---+---> Device state + * | uint_t -----+---> Curent device MTU + * | uint_t -----+---> In-progress RX operations + * | uint_t -----+---> In-progress TX operations + * | char[] -----+---> FMA degraded message + * | void * -----+---> plugin private data + * | overlay_target_t * ---+---------------------+ + * | overlay_plugin_t * ---+---------+ | + * +-----------------------+ | | + * ^ | | + * +--------------------+ | | | + * | Kernel Socket | | | | + * | Multiplexor | | | | + * | overlay_mux_t | | | | + * | | | | | + * | avl_tree_t -+--+ | | + * | uint_t -+--> socket family | | + * | uint_t -+--> socket type | | + * | uint_t -+--> socket protocol | | + * | ksocket_t -+--> I/O socket | | + * | struct sockaddr * -+--> ksocket address | | + * | overlay_plugin_t --+--------+ | | + * +--------------------+ | | | + * | | | + * +-------------------------+ | | | + * | Encap Plugin |<--+-----------+ | + * | overlay_plugin_t | | + * | | | + * | char * ---+--> plugin name | + * | overlay_plugin_ops_t * -+--> plugin downcalls | + * | char ** (props) ---+--> property list | + * | uint_t ---+--> id length | + * | overlay_plugin_flags_t -+--> plugin flags | + * | overlay_plugin_dest_t --+--> destination type v + * +-------------------------+ +-------------------------+ + * | Target Cache | + * | overlay_target_t | + * | | + * cache mode <--+- overlay_target_mode_t | + * dest type <--+- overlay_plugin_dest_t | + * cache flags <--+- overlay_target_flag_t | + * varpd id <--+- uint64_t | + * outstanding varpd reqs. <--+- uint_t | + * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t | + * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t | + * | +-------------------------+ + * +-----------------------+ + * | + * v + * +-------------------------------+ +------------------------+ + * | Target Entry |-->| Target Entry |--> ... + * | overlay_target_entry_t | | overlay_target_entry_t | + * | | +------------------------+ + * | | + * | overlay_target_entry_flags_t -+--> Entry flags + * | uint8_t[ETHERADDRL] ---+--> Target MAC address + * | overlay_target_point_t ---+--> Target underlay address + * | mblk_t * ---+--> outstanding mblk head + * | mblk_t * ---+--> outstanding mblk tail + * | size_t ---+--> outstanding mblk size + * +-------------------------------+ + * + * The primary entries that we care about are the overlay_dev_t, which + * correspond to each overlay device that is created with dladm(1M). Globally, + * these devices are maintained in a simple list_t which is protected with a + * lock. Hence, these include important information such as the mac_handle_t + * and a datalink_id_t which is used to interact with the broader MAC and DLS + * ecosystem. We also maintain additional information such as the current state, + * outstanding operations, the mtu, and importantly, the plugin's private data. + * This is the instance of an encapsulation plugin that gets created as part of + * creating an overlay device. Another aspect of this is that the overlay_dev_t + * also includes information with respect to FMA. For more information, see the + * FMA section. + * + * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin + * is the encapsulation plugin. This allows the device to make downcalls into it + * based on doing things like getting and setting properties. 
Otherwise, the + * plugin itself is a fairly straightforward entity. They are maintained in an + * (not pictured above) list. The plugins themselves mostly maintain things like + * the static list of properties, what kind of destination they require, and the + * operations vector. A given module may contain more if necessary. + * + * The next piece of the puzzle is the mux, or a multiplexor. The mux itself + * maintains a ksocket and it is through the mux that we send and receive + * message blocks. The mux represents a socket type and address, as well as a + * plugin. Multiple overlay_dev_t devices may then share the same mux. For + * example, consider the case where you have different instances of vxlan all on + * the same underlay network. These would all logically share the same IP + * address and port that packets are sent and received on; however, what differs + * is the decapsulation ID. + * + * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike + * a socket, we enable a direct callback on the ksocket. This means that + * whenever a message block chain is received, rather than it being queued and + * later handed back to us from a taskq context, the data comes directly into + * the callback function overlay_mux_recv(). + * + * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx + * function) to transmit. It receives encapsulated packets, decapsulates them to + * determine the overlay identifier, looks up the given device that matches that + * identifier, and then causes the broader MAC world to receive the packet with + * a call to mac_rx(). + * + * Today, we don't do too much that's special with the ksocket; however, as + * hardware is gaining understanding of these encapsulation protocols, we'll + * probably want to think of better ways to get those capabilities passed down + * and potentially better ways to program receive filters so they get directly + * to us. Though, that's all fantasy future land. + * + * The next part of the puzzle is the target cache. The purpose of the target + * cache is to cache where we should send a packet on the underlay network, + * given its mac address. The target cache operates in two modes depending on + * whether the lookup module was declared to be OVERLAY_TARGET_POINT or + * OVERLAY_TARGET_DYNAMIC. + * + * In the case where the target cache has been programmed to be + * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t + * which has the destination that we send everything to, no matter the + * destination mac address. + * + * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things + * are much more interesting and as a result, more complicated. We primarily + * store lists of overlay_target_entry_t's which are stored in both an avl tree + * and a refhash_t. The primary lookup path uses the refhash_t and the avl tree + * is only used for a few of the target ioctls used to dump data such that we + * can get a consistent iteration order for things like dladm show-overlay -t. + * The key that we use for the reference hashtable is based on the mac address + * in the cache and currently we just do a simple CRC32 to transform it into a + * hash. + * + * Each entry maintains a set of flags to indicate the current status of the + * request.
The flags may indicate one of three states: that current cache entry + * is valid, that the current cache entry has been directed to drop all output, + * and that the current cache entry is invalid and may be being looked up. In + * the case where it's valid, we just take the destination address and run with + * it. + * + * If it's invalid and a lookup has not been made, then we start the process + * that prepares a query that will make its way up to varpd. The cache entry + * entry maintains a message block chain of outstanding message blocks and a + * size. These lists are populated only when we don't know the answer as to + * where should these be sent. The size entry is used to cap the amount of + * outstanding data that we don't know the answer to. If we exceed a cap on the + * amount of outstanding data (currently 1 Mb), then we'll drop any additional + * packets. Once we get an answer indicating a valid destination, we transmit + * any outstanding data to that place. For the full story on how we look that up + * will be discussed in the section on the Target Cache Lifecycle. + * + * ------------------------ + * FMA and Degraded Devices + * ------------------------ + * + * Every kernel overlay device keeps track of its FMA state. Today in FMA we + * cannot represent partitions between resources nor can we represent that a + * given minor node of a psuedo device has failed -- if we degrade the overlay + * device, then the entire dev_info_t is degraded. However, we still want to be + * able to indicate to administrators that things may go wrong. + * + * To this end, we've added a notion of a degraded state to every overlay + * device. This state is primarily dictated by userland and it can happen for + * various reasons. Generally, because a userland lookup plugin has been + * partitioned, or something has gone wrong such that there is no longer any + * userland lookup module for a device, then we'll mark it degraded. + * + * As long as any of our minor instances is degraded, then we'll fire off the + * FMA event to note that. Once the last degraded instance is no longer + * degraded, then we'll end up telling FMA that we're all clean. + * + * To help administrators get a better sense of which of the various minor + * devices is wrong, we store the odd_fmamsg[] character array. This character + * array can be fetched with doing a dladm show-overlay -f. + * + * Note, that it's important that we do not update the link status of the + * devices. We want to remain up as much as possible. By changing the link in a + * degraded state, this may end up making things worse. We may still actually + * have information in the target cache and if we mark the link down, that'll + * result in not being able to use it. The reason being that this'll mark all + * the downstream VNICs down which will go to IP and from there we end up + * dealing with sadness. + * + * ----------------------- + * Target Cache Life Cycle + * ----------------------- + * + * This section only applies when we have a lookup plugin of + * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type + * OVERLAY_TARGET_POINT. + * + * While we got into the target cache in the general architecture section, it's + * worth going into more details as to how this actually works and showing some + * examples and state machines. Recall that a target cache entry basically has + * the following state transition diagram: + * + * Initial state + * . . . . . . first access . . . varpd lookup enqueued + * . . . + * . . . + * +-------+ . 
+----------+ . + * | No |------*---->| Invalid |-------*----+ + * | Entry | | Entry | | + * +-------+ +----------+ | + * varpd ^ ^ varpd | + * invalidate | | drop | + * . . . * * . . v + * +-------+ | | +---------+ + * | Entry |--->-----+ +----<----| Entry | + * | Valid |<----------*---------<----| Pending |->-+ varpd + * +-------+ . +---------+ * . . drop, but + * . varpd ^ | other queued + * . success | | entries + * +-----+ + * + * When the table is first created, it is empty. As we attempt to lookup entries + * and we find there is no entry at all, we'll create a new table entry for it. + * At that point the entry is technically in an invalid state, that means that + * we have no valid data from varpd. In that case, we'll go ahead and queue the + * packet into the entry's pending chain, and queue a varpd lookup, setting the + * OVERLAY_ENTRY_F_PENDING flag in the progress. + * + * If additional mblk_t's come in for this entry, we end up appending them to + * the tail of the chain, if and only if, we don't exceed the threshold for the + * amount of space they can take up. An entry remains pending until we get a + * varpd reply. If varpd replies with a valid results, we move to the valid + * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one + * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate. + * + * Once an entry is valid, it stays valid until user land tells us to invalidate + * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOE and + * OVERLAY_TARG_CACHE_SET respectively. + * + * If the lookup fails with a call to drop the packet, then the next state is + * determined by the state of the queue. If the set of outstanding entries is + * empty, then we just transition back to the invalid state. If instead, the + * set of outstanding entries is not empty, then we'll queue another entry and + * stay in the same state, repeating this until the number of requests is + * drained. + * + * The following images describes the flow of a given lookup and where the + * overlay_target_entry_t is at any given time. + * + * +-------------------+ + * | Invalid Entry | An entry starts off as an invalid entry + * | de:ad:be:ef:00:00 | and only exists in the target cache. + * +-------------------+ + * + * ~~~~ + * + * +---------------------+ + * | Global list_t | A mblk_t comes in for an entry. We + * | overlay_target_list | append it to the overlay_target_list. + * +---------------------+ + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry |--->... + * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +--------------------------+ + * | /dev/overlay minor state | User land said that it would look up an + * | overlay_target_hdl_t | entry for us. We remove it from the + * +--------------------------+ global list and add it to the handle's + * | outstanding list. + * | + * v + * +-------------------+ +-------------------+ + * | Pending Entry |----->| Pending Entry | + * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 | + * +-------------------+ +-------------------+ + * + * ~~~~ + * + * +-------------------+ + * | Valid Entry | varpd returned an answer with + * | de:ad:be:ef:00:00 | OVERLAY_IOC_RESPOND and the target cache + * | 10.169.23.42:4789 | entry is now populated with a + * +-------------------+ destination and marked as valid + * + * + * The lookup mechanism is performed via a series of operations on the character + * psuedo-device /dev/overlay. 
The only thing that uses this device is the + * userland daemon varpd. /dev/overlay is a cloneable device, each open of it + * granting a new minor number which maintains its own state. We maintain this + * state so that way if an outstanding lookup was queued to something that + * crashed or closed its handle without responding, we can know about this and + * thus handle it appropriately. + * + * When a lookup is first created it's added to our global list of outstanding + * lookups. To service requests, userland is required to perform an ioctl to ask + * for a request. We will block it in the kernel a set amount of time waiting + * for a request. When we give a request to a given minor instance of the + * device, we remove it from the global list and append the request to the + * device's list of outstanding entries, for the reasons we discussed above. + * When a lookup comes in, we give user land a smaller amount of information + * specific to that packet, the overlay_targ_lookup_t. It includes a request id + * to identify this, and then the overlay id, the varpd id, the header and + * packet size, the source and destination mac address, the SAP, and any + * potential VLAN header. + * + * At that point, it stays in that outstanding list until one of two ioctls are + * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time, + * userland may also perform other operations. For example, it may use + * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth + * analysis of what to do beyond what we gave it initially. This is useful for + * providing proxy arp and the like. Finally, there are two other ioctls that + * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the + * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which + * causes us to encapsulate and send out the packet they've given us. + * + * + * Finally, through the target cache, several ioctls are provided to allow for + * interrogation and management of the cache. They allow for individual entries + * to be retrieved, set, or have the entire table flushed. For the full set of + * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h. + * + * ------------------ + * Sample Packet Flow + * ------------------ + * + * There's a lot of pieces here, hopefully an example of how this all fits + * together will help clarify and elucidate what's going on. We're going to + * first track an outgoing packet, eg. one that is sent from an IP interface on + * a VNIC on top of an overlay device, and then we'll look at what it means to + * respond to that. + * + * + * +----------------+ +--------------+ +------------------+ + * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches | + * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx | + * +----------------+ | VNIC device | | overlay_m_tx() | + * +--------------+ +------------------+ + * | + * . lookup . cache | + * . drop . miss v + * +---------+ . +--------+ . +------------------+ + * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk | + * | mblk_t | | lookup | | in the target | + * +---------+ | queued | | cache | + * ^ +--------+ +------------------+ + * on send | | | cache + * error . . * *. . lookup * . . 
hit + * | | success v + * | | +------------------+ + * +-----------------+ +--------------->| call plugin | + * | Send out | | ovpo_encap() to | + * | overlay_mux_t's |<----------------------------------| get encap mblk_t | + * | ksocket | +------------------+ + * +-----------------+ + * + * The receive end point looks a little different and looks more like: + * + * +------------------+ +----------------+ +-----------+ + * | mblk_t comes off |---->| enter netstack |--->| delivered |---+ + * | the physical | | IP stack | | to | * . . direct + * | device | +----------------+ | ksocket | | callback + * +------------------+ +-----------+ | + * . overlay id | + * . not found v + * +-----------+ . +-----------------+ +--------------------+ + * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() | + * | mblk_t | | ovpo_decap() to | +--------------------+ + * +-----------+ | decap mblk_t | + * +-----------------+ + * | + * * . . overlay id + * v found + * +--------+ +----------------+ + * | adjust |----->| call mac_rx | + * | mblk_t | | on original | + * +--------+ | decaped packet | + * +----------------+ + * + * ------------------ + * Netstack Awareness + * ------------------ + * + * In the above image we note that this enters a netstack. Today the only + * netstack that can be is the global zone as the overlay driver itself is not + * exactly netstack aware. What this really means is that varpd cannot run in a + * non-global zone and an overlay device cannot belong to a non-global zone. + * Non-global zones can still have a VNIC assigned to them that's been created + * over the overlay device the same way they would if it had been created over + * an etherstub or a physical device. + * + * The majority of the work to make it netstack aware is straightforward and the + * biggest thing is to create a netstack module that allows us to hook into + * netstack (and thus zone) creation and destruction. From there, we need to + * amend the target cache lookup routines that we discussed earlier to not have + * a global outstanding list and a global list of handles, but rather, one per + * netstack. + * + * For the mux, we'll need to open the ksocket in the context of the zone, we + * can likely do this with a properly composed credential, but we'll need to do + * some more work on that path. Finally, we'll want to make sure the dld ioctls + * are aware of the zoneid of the caller and we use that appropriately and store + * it in the overlay_dev_t. + * + * ----------- + * GLDv3 Notes + * ----------- + * + * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more + * relevant and other parts are much less relevant for us. For example, the + * GLDv3 is used to toggle the device being put into and out of promiscuous + * mode, to program MAC addresses for unicast and multicast hardware filters. + * Today, an overlay device doesn't have a notion of promiscuous mode nor does + * it have a notion of unicast and multicast addresses programmed into the + * device. Instead, for the purposes of the hardware filter, we don't do + * anything and just always accept new addresses being added and removed. + * + * If the GLDv3 start function has not been called, then we will not use this + * device for I/O purposes. Any calls to transmit or receive should be dropped, + * though the GLDv3 guarantees us that transmit will not be called without + * calling start. Similarly, once stop is called, then no packets can be dealt + * with. 
+ * + * Today we don't support the stat interfaces, though there's no good reason + * that we shouldn't assemble some of the stats based on what we have in the + * future. + * + * When it comes to link properties, many of the traditional link properties do + * not apply and many others MAC handles for us. For example, we don't need to + * implement anything for overlay_m_getprop() to deal with returning the MTU, as + * MAC never calls into us for that. As such, there isn't much of anything to + * support in terms of properties. + * + * Today, we don't support any notion of hardware capabilities. However, if + * future NIC hardware or other changes to the system cause it to make sense for + * us to emulate logical groups, then we should do that. However, we still do + * implement a capab function so that we can identify ourselves as an overlay + * device to the broader MAC framework. This is done mostly so that a device + * created on top of us can have fanout rings as we don't try to lie about a + * speed for our device. + * + * The other question is what should be done for a device's MTU and margin. We + * set our minimum supported MTU to be the minimum value that an IP network may + * be set to 576 -- which mimics what an etherstub does. On the flip side, we + * have our upper bound set to 8900. This value comes from the fact that a lot + * of jumbo networks use their maximum as 9000. As such, we want to reserve 100 + * bytes, which isn't exactly the most accurate number, but it'll be good enough + * for now. Because of that, our default MTU off of these devices is 1400, as + * the default MTU for everything is usually 1500 or whatever the underlying + * device is at; however, this is a bit simpler than asking the netstack what + * are all the IP interfaces at. It also calls into question how PMTU and PMTU + * discovery should work here. The challenge, especially for + * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's + * not clear that if you have a single bad entry that the overall MTU should be + * lowered. Instead, we should figure out a better way of determining these + * kinds of PMTU errors and appropriately alerting the administrator via FMA. + * + * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether + * or not the underlying encapsulation device supports VLAN tags. If it does, + * then we'll set the margin to allow for it, otherwise, we will not. 
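Those MTU bounds and the margin ultimately have to be conveyed to MAC when the device registers. The registration path is not part of this excerpt, so the following is only a rough sketch of how the documented values would map onto the standard mac_register_t fields; the helper name and the encap_does_vlan flag are hypothetical, and the real driver may wire this up differently.

/*
 * Rough sketch only (hypothetical helper, not the driver's actual code):
 * expressing the documented MTU range and VLAN margin at mac_register() time.
 */
static int
overlay_mac_register_sketch(overlay_dev_t *odd, boolean_t encap_does_vlan)
{
	mac_register_t *mac;
	int err;

	if ((mac = mac_alloc(MAC_VERSION)) == NULL)
		return (EINVAL);

	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	mac->m_driver = odd;
	mac->m_dip = overlay_dip;
	mac->m_src_addr = overlay_macaddr;
	mac->m_callbacks = &overlay_m_callbacks;
	mac->m_min_sdu = OVERLAY_MTU_MIN;	/* 576, mirroring an etherstub */
	mac->m_max_sdu = OVERLAY_MTU_MAX;	/* 8900, leaves encap headroom */
	mac->m_margin = encap_does_vlan ? VLAN_TAGSZ : 0;

	err = mac_register(mac, &odd->odd_mh);
	mac_free(mac);
	return (err);
}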
+ */ + +#include <sys/conf.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/policy.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/ddifm.h> + +#include <sys/dls.h> +#include <sys/dld_ioc.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/mac_ether.h> +#include <sys/vlan.h> + +#include <sys/overlay_impl.h> + +dev_info_t *overlay_dip; +static kmutex_t overlay_dev_lock; +static list_t overlay_dev_list; +static uint8_t overlay_macaddr[ETHERADDRL] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + +typedef enum overlay_dev_prop { + OVERLAY_DEV_P_MTU = 0, + OVERLAY_DEV_P_VNETID, + OVERLAY_DEV_P_ENCAP, + OVERLAY_DEV_P_VARPDID +} overlay_dev_prop_t; + +#define OVERLAY_DEV_NPROPS 4 +static const char *overlay_dev_props[] = { + "mtu", + "vnetid", + "encap", + "varpd/id" +}; + +#define OVERLAY_MTU_MIN 576 +#define OVERLAY_MTU_DEF 1400 +#define OVERLAY_MTU_MAX 8900 + +overlay_dev_t * +overlay_hold_by_dlid(datalink_id_t id) +{ + overlay_dev_t *o; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (id == o->odd_linkid) { + mutex_enter(&o->odd_lock); + o->odd_ref++; + mutex_exit(&o->odd_lock); + mutex_exit(&overlay_dev_lock); + return (o); + } + } + + mutex_exit(&overlay_dev_lock); + return (NULL); +} + +void +overlay_hold_rele(overlay_dev_t *odd) +{ + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_ref > 0); + odd->odd_ref--; + mutex_exit(&odd->odd_lock); +} + +void +overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) + odd->odd_rxcount++; + if (flag & OVERLAY_F_IN_TX) + odd->odd_txcount++; + odd->odd_flags |= flag; +} + +void +overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + boolean_t signal = B_FALSE; + + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) { + ASSERT(odd->odd_rxcount > 0); + odd->odd_rxcount--; + if (odd->odd_rxcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_RX; + } + } + if (flag & OVERLAY_F_IN_TX) { + ASSERT(odd->odd_txcount > 0); + odd->odd_txcount--; + if (odd->odd_txcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_TX; + } + } + + if (signal == B_TRUE) + cv_broadcast(&odd->odd_iowait); +} + +static void +overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT((flag & ~OVERLAY_F_IOMASK) == 0); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + while (odd->odd_flags & flag) { + cv_wait(&odd->odd_iowait, &odd->odd_lock); + } +} + +void +overlay_dev_iter(overlay_dev_iter_f func, void *arg) +{ + overlay_dev_t *odd; + + mutex_enter(&overlay_dev_lock); + for (odd = list_head(&overlay_dev_list); odd != NULL; + odd = list_next(&overlay_dev_list, odd)) { + if (func(odd, arg) != 0) { + mutex_exit(&overlay_dev_lock); + return; + } + } + mutex_exit(&overlay_dev_lock); +} + +/* ARGSUSED */ +static int +overlay_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + return (ENOTSUP); +} + +static int +overlay_m_start(void *arg) +{ + overlay_dev_t *odd = arg; + overlay_mux_t *mux; + int ret, domain, family, prot; + struct sockaddr_storage storage; + socklen_t slen; + + 
mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) { + mutex_exit(&odd->odd_lock); + return (EAGAIN); + } + mutex_exit(&odd->odd_lock); + + ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain, + &family, &prot, (struct sockaddr *)&storage, &slen); + if (ret != 0) + return (ret); + + mux = overlay_mux_open(odd->odd_plugin, domain, family, prot, + (struct sockaddr *)&storage, slen, &ret); + if (mux == NULL) + return (ret); + + overlay_mux_add_dev(mux, odd); + odd->odd_mux = mux; + mutex_enter(&odd->odd_lock); + ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX)); + odd->odd_flags |= OVERLAY_F_IN_MUX; + mutex_exit(&odd->odd_lock); + + return (0); +} + +static void +overlay_m_stop(void *arg) +{ + overlay_dev_t *odd = arg; + + /* + * The MAC Perimeter is held here, so we don't have to worry about + * synchornizing this with respect to metadata operations. + */ + mutex_enter(&odd->odd_lock); + VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX); + VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP)); + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + overlay_mux_close(odd->odd_mux); + odd->odd_mux = NULL; + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_IN_MUX; + odd->odd_flags &= ~OVERLAY_F_MDDROP; + VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0); + mutex_exit(&odd->odd_lock); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_promisc(void *arg, boolean_t on) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement. + */ +/* ARGSUSED */ +static int +overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp) +{ + return (0); +} + +/* + * For more info on this, see the big theory statement. 
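+ *
+ * As with the promiscuous and multicast entry points above, unicast address
+ * updates are simply accepted and ignored: there is no hardware here whose
+ * filters would need to be reprogrammed, and MAC takes care of classifying
+ * traffic for any clients built on top of us.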
+ */ +/* ARGSUSED */ +static int +overlay_m_unicast(void *arg, const uint8_t *macaddr) +{ + return (0); +} + +mblk_t * +overlay_m_tx(void *arg, mblk_t *mp_chain) +{ + overlay_dev_t *odd = arg; + mblk_t *mp, *ep; + int ret; + ovep_encap_info_t einfo; + struct msghdr hdr; + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + freemsgchain(mp_chain); + return (NULL); + } + overlay_io_start(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + + bzero(&hdr, sizeof (struct msghdr)); + + bzero(&einfo, sizeof (ovep_encap_info_t)); + einfo.ovdi_id = odd->odd_vid; + mp = mp_chain; + while (mp != NULL) { + socklen_t slen; + struct sockaddr_storage storage; + + mp_chain = mp->b_next; + mp->b_next = NULL; + ep = NULL; + + ret = overlay_target_lookup(odd, mp, + (struct sockaddr *)&storage, &slen); + if (ret != OVERLAY_TARGET_OK) { + if (ret == OVERLAY_TARGET_DROP) + freemsg(mp); + mp = mp_chain; + continue; + } + + hdr.msg_name = &storage; + hdr.msg_namelen = slen; + + ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, + &einfo, &ep); + if (ret != 0 || ep == NULL) { + freemsg(mp); + goto out; + } + + ASSERT(ep->b_cont == mp || ep == mp); + ret = overlay_mux_tx(odd->odd_mux, &hdr, ep); + if (ret != 0) + goto out; + + mp = mp_chain; + } + +out: + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + return (mp_chain); +} + +/* ARGSUSED */ +static void +overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp) +{ + miocnak(q, mp, 0, ENOTSUP); +} + +/* ARGSUSED */ +static boolean_t +overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + /* + * Tell MAC we're an overlay. + */ + if (cap == MAC_CAPAB_OVERLAY) + return (B_TRUE); + return (B_FALSE); +} + +/* ARGSUSED */ +static int +overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + uint32_t mtu, old; + int err; + overlay_dev_t *odd = arg; + + if (pr_num != MAC_PROP_MTU) + return (ENOTSUP); + + bcopy(pr_val, &mtu, sizeof (mtu)); + if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX) + return (EINVAL); + + mutex_enter(&odd->odd_lock); + old = odd->odd_mtu; + odd->odd_mtu = mtu; + err = mac_maxsdu_update(odd->odd_mh, mtu); + if (err != 0) + odd->odd_mtu = old; + mutex_exit(&odd->odd_lock); + + return (err); +} + +/* ARGSUSED */ +static int +overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static void +overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + if (pr_num != MAC_PROP_MTU) + return; + + mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF); + mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX); +} + +static mac_callbacks_t overlay_m_callbacks = { + .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | + MC_PROPINFO), + .mc_getstat = overlay_m_stat, + .mc_start = overlay_m_start, + .mc_stop = overlay_m_stop, + .mc_setpromisc = overlay_m_promisc, + .mc_multicst = overlay_m_multicast, + .mc_unicst = overlay_m_unicast, + .mc_tx = overlay_m_tx, + .mc_ioctl = overlay_m_ioctl, + .mc_getcapab = overlay_m_getcapab, + .mc_getprop = overlay_m_getprop, + .mc_setprop = overlay_m_setprop, + .mc_propinfo = overlay_m_propinfo +}; + +static boolean_t +overlay_valid_name(const char *name, size_t buflen) +{ + size_t actlen; + int err, i; + + for (i = 0; i < 
buflen; i++) { + if (name[i] == '\0') + break; + } + + if (i == 0 || i == buflen) + return (B_FALSE); + actlen = i; + if (strchr(name, '/') != NULL) + return (B_FALSE); + if (u8_validate((char *)name, actlen, NULL, + U8_VALIDATE_ENTIRE, &err) < 0) + return (B_FALSE); + return (B_TRUE); +} + +/* ARGSUSED */ +static int +overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int err; + uint64_t maxid; + overlay_dev_t *odd, *o; + mac_register_t *mac; + overlay_ioc_create_t *oicp = karg; + + if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE) + return (EINVAL); + + odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP); + odd->odd_linkid = oicp->oic_linkid; + odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap); + if (odd->odd_plugin == NULL) { + kmem_free(odd, sizeof (overlay_dev_t)); + return (ENOENT); + } + err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd, + &odd->odd_pvoid); + if (err != 0) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + + /* + * Make sure that our virtual network id is valid for the given plugin + * that we're working with. + */ + ASSERT(odd->odd_plugin->ovp_id_size <= 8); + maxid = UINT64_MAX; + if (odd->odd_plugin->ovp_id_size != 8) + maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL; + if (oicp->oic_vnetid > maxid) { + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + odd->odd_vid = oicp->oic_vnetid; + + mac = mac_alloc(MAC_VERSION); + if (mac == NULL) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EINVAL); + } + + mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; + mac->m_driver = odd; + mac->m_dip = overlay_dip; + mac->m_dst_addr = NULL; + mac->m_callbacks = &overlay_m_callbacks; + mac->m_pdata = NULL; + mac->m_pdata_size = 0; + + mac->m_priv_props = NULL; + + /* Let mac handle this itself. */ + mac->m_instance = (uint_t)-1; + + /* + * There is no real source address that should be used here, but saying + * that we're not ethernet is going to cause its own problems. At the + * end of the say, this is fine. + */ + mac->m_src_addr = overlay_macaddr; + + /* + * Start with the default MTU as the max SDU. If the MTU is changed, the + * SDU will be changed to reflect that. + */ + mac->m_min_sdu = 1; + mac->m_max_sdu = OVERLAY_MTU_DEF; + mac->m_multicast_sdu = 0; + + /* + * The underlying device doesn't matter, instead this comes from the + * encapsulation protocol and whether or not they allow VLAN tags. + */ + if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) { + mac->m_margin = VLAN_TAGSZ; + } else { + mac->m_margin = 0; + } + + /* + * Today, we have no MAC virtualization, it may make sense in the future + * to go ahead and emulate some subset of this, but it doesn't today. 
+ */ + mac->m_v12n = MAC_VIRT_NONE; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (o->odd_linkid == oicp->oic_linkid) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + + if (o->odd_vid == oicp->oic_vnetid && + o->odd_plugin == odd->odd_plugin) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + } + + err = mac_register(mac, &odd->odd_mh); + mac_free(mac); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + err = dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + (void) mac_unregister(odd->odd_mh); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL); + odd->odd_ref = 0; + odd->odd_flags = 0; + list_insert_tail(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int i, ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_activate_t *oiap = karg; + overlay_ioc_propinfo_t *infop; + overlay_ioc_prop_t *oip; + overlay_prop_handle_t phdl; + + odd = overlay_hold_by_dlid(oiap->oia_linkid); + if (odd == NULL) + return (ENOENT); + + infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP); + oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP); + phdl = (overlay_prop_handle_t)infop; + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EEXIST); + } + mutex_exit(&odd->odd_lock); + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + const char *pname = odd->odd_plugin->ovp_props[i]; + bzero(infop, sizeof (overlay_ioc_propinfo_t)); + overlay_prop_init(phdl); + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + + if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0) + continue; + bzero(oip, sizeof (overlay_ioc_prop_t)); + oip->oip_size = sizeof (oip->oip_value); + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, + pname, oip->oip_value, &oip->oip_size); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + if (oip->oip_size == 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EINVAL); + } + } + + mutex_enter(&odd->odd_lock); + 
if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ENXIO); + } + + ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0); + odd->odd_flags |= OVERLAY_F_ACTIVATED; + + /* + * Now that we've activated ourselves, we should indicate to the world + * that we're up. Note that we may not be able to perform lookups at + * this time, but our notion of being 'up' isn't dependent on that + * ability. + */ + mac_link_update(odd->odd_mh, LINK_STATE_UP); + mutex_exit(&odd->odd_lock); + + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + overlay_ioc_delete_t *oidp = karg; + overlay_dev_t *odd; + datalink_id_t tid; + int ret; + + odd = overlay_hold_by_dlid(oidp->oid_linkid); + if (odd == NULL) { + return (ENOENT); + } + + mutex_enter(&odd->odd_lock); + /* If we're not the only hold, we're busy */ + if (odd->odd_ref != 1) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + if (odd->odd_flags & OVERLAY_F_IN_MUX) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (EBUSY); + } + + /* + * To remove this, we need to first remove it from dls and then remove + * it from mac. The act of removing it from mac will check if there are + * devices on top of this, eg. vnics. If there are, then that will fail + * and we'll have to go through and recreate the dls entry. Only after + * mac_unregister has succeeded, then we'll go through and actually free + * everything and drop the dev lock. 
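+	 * On success we never explicitly release the single hold taken above;
+	 * the device is simply torn down and freed outright.
+	 */
+	mutex_exit(&odd->odd_lock);
+
+	/*
+	 * odd_lock must not be held across the calls below:
+	 * dls_devnet_destroy() and mac_disable() may block, and
+	 * overlay_hold_rele() on the error paths re-acquires it.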
+ */ + ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE); + if (ret != 0) { + overlay_hold_rele(odd); + return (ret); + } + + ASSERT(oidp->oid_linkid == tid); + ret = mac_disable(odd->odd_mh); + if (ret != 0) { + (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + overlay_hold_rele(odd); + return (ret); + } + + overlay_target_quiesce(odd->odd_target); + + mutex_enter(&overlay_dev_lock); + list_remove(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + cv_destroy(&odd->odd_iowait); + mutex_destroy(&odd->odd_lock); + overlay_target_free(odd); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + overlay_ioc_nprops_t *on = karg; + + odd = overlay_hold_by_dlid(on->oipn_linkid); + if (odd == NULL) + return (ENOENT); + on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS; + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg) +{ + overlay_prop_handle_t phdl = arg; + overlay_prop_set_range_str(phdl, opp->ovp_name); + return (0); +} + +static int +overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id) +{ + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], name) == 0) { + *id = i; + return (0); + } + } + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) { + *id = i + OVERLAY_DEV_NPROPS; + return (0); + } + } + + return (ENOENT); +} + +static void +overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl) +{ + uint32_t def; + mac_propval_range_t range; + uint_t perm; + + ASSERT(MAC_PERIM_HELD(odd->odd_mh)); + + bzero(&range, sizeof (mac_propval_range_t)); + range.mpr_count = 1; + if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def, + sizeof (def), &range, &perm) != 0) + return; + + if (perm == MAC_PROP_PERM_READ) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + else if (perm == MAC_PROP_PERM_WRITE) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE); + else if (perm == MAC_PROP_PERM_RW) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min, + range.mpr_range_uint32[0].mpur_max); +} + +/* ARGSUSED */ +static int +overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + int ret; + mac_perim_handle_t mph; + uint_t propid = UINT_MAX; + overlay_ioc_propinfo_t *oip = karg; + overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; + + odd = overlay_hold_by_dlid(oip->oipi_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_prop_init(phdl); + mac_perim_enter_by_mh(odd->odd_mh, &mph); + + /* + * If the id is -1, then the property that we're looking for is named in + * oipi_name and we should fill in its id. Otherwise, we've been given + * an id and we need to turn that into a name for our plugin's sake. The + * id is our own fabrication for property discovery. + */ + if (oip->oipi_id == -1) { + /* + * Determine if it's a known generic property or it belongs to a + * module by checking against the list of known names. 
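+		 * Property ids [0, OVERLAY_DEV_NPROPS) name the generic
+		 * device properties in overlay_dev_props[]; anything at or
+		 * above that is an index into the plugin's own property
+		 * array, offset by OVERLAY_DEV_NPROPS.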
+ */ + oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name, + &propid)) != 0) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + oip->oipi_id = propid; + if (propid >= OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + oip->oipi_name, phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + + } + } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS; + + if (id >= odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + odd->odd_plugin->ovp_props[id], phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oipi_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oipi_id >= 0); + propid = oip->oipi_id; + (void) strlcpy(oip->oipi_name, overlay_dev_props[propid], + sizeof (oip->oipi_name)); + } + + switch (propid) { + case OVERLAY_DEV_P_MTU: + overlay_i_propinfo_mtu(odd, phdl); + break; + case OVERLAY_DEV_P_VNETID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + case OVERLAY_DEV_P_ENCAP: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING); + overlay_prop_set_nodefault(phdl); + overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl); + break; + case OVERLAY_DEV_P_VARPDID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + default: + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ENOENT); + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + int ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_prop_t *oip = karg; + uint_t propid, mtu; + + odd = overlay_hold_by_dlid(oip->oip_linkid); + if (odd == NULL) + return (ENOENT); + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + oip->oip_size = OVERLAY_PROP_SIZEMAX; + oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if (oip->oip_id == -1) { + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) + break; + if (i == OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_getprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, &oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + } + + propid = i; + } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; + + if (id > odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, + odd->odd_plugin->ovp_props[id], oip->oip_value, + &oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oip_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oip_id >= 0); + propid = oip->oip_id; + } + + ret = 0; + switch (propid) { + case OVERLAY_DEV_P_MTU: + /* + * The MTU is always set and retrieved through MAC, to allow for + * MAC to do whatever it wants, 
as really that property belongs + * to MAC. This is important for things where vnics have hold on + * the MTU. + */ + mac_sdu_get(odd->odd_mh, NULL, &mtu); + bcopy(&mtu, oip->oip_value, sizeof (uint_t)); + oip->oip_size = sizeof (uint_t); + break; + case OVERLAY_DEV_P_VNETID: + /* + * While it's read-only while inside of a mux, we're not in a + * context that can guarantee that. Therefore we always grab the + * overlay_dev_t's odd_lock. + */ + mutex_enter(&odd->odd_lock); + bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t)); + mutex_exit(&odd->odd_lock); + oip->oip_size = sizeof (uint64_t); + break; + case OVERLAY_DEV_P_ENCAP: + oip->oip_size = strlcpy((char *)oip->oip_value, + odd->odd_plugin->ovp_name, oip->oip_size); + break; + case OVERLAY_DEV_P_VARPDID: + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + const uint64_t val = odd->odd_target->ott_id; + bcopy(&val, oip->oip_value, sizeof (uint64_t)); + oip->oip_size = sizeof (uint64_t); + } else { + oip->oip_size = 0; + } + mutex_exit(&odd->odd_lock); + break; + default: + ret = ENOENT; + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); +} + +static void +overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid) +{ + mutex_enter(&odd->odd_lock); + + /* Simple case, not active */ + if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) { + odd->odd_vid = vnetid; + mutex_exit(&odd->odd_lock); + return; + } + + /* + * In the hard case, we need to set the drop flag, quiesce I/O and then + * we can go ahead and do everything. + */ + odd->odd_flags |= OVERLAY_F_MDDROP; + overlay_io_wait(odd, OVERLAY_F_IOMASK); + mutex_exit(&odd->odd_lock); + + overlay_mux_remove_dev(odd->odd_mux, odd); + mutex_enter(&odd->odd_lock); + odd->odd_vid = vnetid; + mutex_exit(&odd->odd_lock); + overlay_mux_add_dev(odd->odd_mux, odd); + + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX); + odd->odd_flags &= ~OVERLAY_F_IN_MUX; + mutex_exit(&odd->odd_lock); +} + +/* ARGSUSED */ +static int +overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + int ret; + overlay_dev_t *odd; + overlay_ioc_prop_t *oip = karg; + uint_t propid = UINT_MAX; + mac_perim_handle_t mph; + uint64_t maxid, *vidp; + + if (oip->oip_size > OVERLAY_PROP_SIZEMAX) + return (EINVAL); + + odd = overlay_hold_by_dlid(oip->oip_linkid); + if (odd == NULL) + return (ENOENT); + + oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mac_perim_exit(mph); + mutex_exit(&odd->odd_lock); + return (ENOTSUP); + } + mutex_exit(&odd->odd_lock); + if (oip->oip_id == -1) { + int i; + + for (i = 0; i < OVERLAY_DEV_NPROPS; i++) { + if (strcmp(overlay_dev_props[i], oip->oip_name) == 0) + break; + if (i == OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_setprop( + odd->odd_pvoid, oip->oip_name, + oip->oip_value, oip->oip_size); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + } + + propid = i; + } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS; + + if (id > odd->odd_plugin->ovp_nprops) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid, + odd->odd_plugin->ovp_props[id], oip->oip_value, + oip->oip_size); + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); + } else if (oip->oip_id < -1) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + return 
(EINVAL); + } else { + ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oip_id >= 0); + propid = oip->oip_id; + } + + ret = 0; + switch (propid) { + case OVERLAY_DEV_P_MTU: + ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu", + oip->oip_value, oip->oip_size); + break; + case OVERLAY_DEV_P_VNETID: + if (oip->oip_size != sizeof (uint64_t)) { + ret = EINVAL; + break; + } + vidp = (uint64_t *)oip->oip_value; + ASSERT(odd->odd_plugin->ovp_id_size <= 8); + maxid = UINT64_MAX; + if (odd->odd_plugin->ovp_id_size != 8) + maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - + 1ULL; + if (*vidp >= maxid) { + ret = EINVAL; + break; + } + overlay_setprop_vnetid(odd, *vidp); + break; + case OVERLAY_DEV_P_ENCAP: + case OVERLAY_DEV_P_VARPDID: + ret = EPERM; + break; + default: + ret = ENOENT; + } + + mac_perim_exit(mph); + overlay_hold_rele(odd); + return (ret); +} + +/* ARGSUSED */ +static int +overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + overlay_ioc_status_t *os = karg; + + odd = overlay_hold_by_dlid(os->ois_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) { + os->ois_status = OVERLAY_I_DEGRADED; + if (odd->odd_fmamsg != NULL) { + (void) strlcpy(os->ois_message, odd->odd_fmamsg, + OVERLAY_STATUS_BUFLEN); + } else { + os->ois_message[0] = '\0'; + } + + } else { + os->ois_status = OVERLAY_I_OK; + os->ois_message[0] = '\0'; + } + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + + return (0); +} + +static dld_ioc_info_t overlay_ioc_list[] = { + { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t), + overlay_i_create, secpolicy_dl_config }, + { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t), + overlay_i_activate, secpolicy_dl_config }, + { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t), + overlay_i_delete, secpolicy_dl_config }, + { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo, + secpolicy_dl_config }, + { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_prop_t), overlay_i_getprop, + secpolicy_dl_config }, + { OVERLAY_IOC_SETPROP, DLDCOPYIN, + sizeof (overlay_ioc_prop_t), overlay_i_setprop, + secpolicy_dl_config }, + { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_nprops_t), overlay_i_nprops, + secpolicy_dl_config }, + { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_status_t), overlay_i_status, + NULL } +}; + +static int +overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int fmcap = DDI_FM_EREPORT_CAPABLE; + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (overlay_dip != NULL || ddi_get_instance(dip) != 0) + return (DDI_FAILURE); + + ddi_fm_init(dip, &fmcap, NULL); + + if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list, + DLDIOCCNT(overlay_ioc_list)) != 0) { + ddi_remove_minor_node(dip, OVERLAY_CTL); + return (DDI_FAILURE); + } + + overlay_dip = dip; + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *resp = (void *)overlay_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *resp = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + + return 
(error); +} + +static int +overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&overlay_dev_lock); + if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) { + mutex_exit(&overlay_dev_lock); + return (EBUSY); + } + mutex_exit(&overlay_dev_lock); + + + dld_ioc_unregister(OVERLAY_IOC); + ddi_remove_minor_node(dip, OVERLAY_CTL); + ddi_fm_fini(dip); + overlay_dip = NULL; + return (DDI_SUCCESS); +} + +static struct cb_ops overlay_cbops = { + overlay_target_open, /* cb_open */ + overlay_target_close, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + overlay_target_ioctl, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_stream */ + D_MP, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev, /* cb_awrite */ +}; + +static struct dev_ops overlay_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + overlay_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + overlay_attach, /* devo_attach */ + overlay_detach, /* devo_detach */ + nulldev, /* devo_reset */ + &overlay_cbops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + NULL, /* devo_power */ + ddi_quiesce_not_supported /* devo_quiesce */ +}; + +static struct modldrv overlay_modldrv = { + &mod_driverops, + "Overlay Network Driver", + &overlay_dev_ops +}; + +static struct modlinkage overlay_linkage = { + MODREV_1, + &overlay_modldrv +}; + +static int +overlay_init(void) +{ + mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&overlay_dev_list, sizeof (overlay_dev_t), + offsetof(overlay_dev_t, odd_link)); + overlay_mux_init(); + overlay_plugin_init(); + overlay_target_init(); + + return (DDI_SUCCESS); +} + +static void +overlay_fini(void) +{ + overlay_target_fini(); + overlay_plugin_fini(); + overlay_mux_fini(); + mutex_destroy(&overlay_dev_lock); + list_destroy(&overlay_dev_list); +} + +int +_init(void) +{ + int err; + + if ((err = overlay_init()) != DDI_SUCCESS) + return (err); + + mac_init_ops(NULL, "overlay"); + err = mod_install(&overlay_linkage); + if (err != DDI_SUCCESS) { + overlay_fini(); + return (err); + } + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&overlay_linkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + err = mod_remove(&overlay_linkage); + if (err != 0) + return (err); + + overlay_fini(); + return (0); +} diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf new file mode 100644 index 0000000000..4b62fafd94 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015, Joyent, Inc. 
+# + +name="overlay" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile new file mode 100644 index 0000000000..800d72dc2b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.mapfile @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # Encapsualation Plugin interfaces + overlay_plugin_alloc; + overlay_plugin_free; + overlay_plugin_register; + overlay_plugin_unregister; + local: + *; +}; diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c new file mode 100644 index 0000000000..0701d08e8b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_fm.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay device FMA operations. 
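+ *
+ * Each overlay device can be individually marked degraded, with a message
+ * that overlay_i_status() reports back to userland, and later restored. A
+ * global count of degraded devices drives a single service impact report
+ * against the driver's dev_info: DDI_SERVICE_DEGRADED when the first device
+ * degrades and DDI_SERVICE_RESTORED when the last one recovers.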
+ * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/ddifm.h> +#include <sys/overlay_impl.h> + +kmutex_t overlay_fm_lock; +uint_t overlay_fm_count; + +void +overlay_fm_init(void) +{ + overlay_fm_count = 0; + mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_fm_fini(void) +{ + VERIFY(overlay_fm_count == 0); + mutex_destroy(&overlay_fm_lock); +} + +void +overlay_fm_degrade(overlay_dev_t *odd, const char *msg) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + + if (msg != NULL) + (void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN); + + if (odd->odd_flags & OVERLAY_F_DEGRADED) + goto out; + + odd->odd_flags |= OVERLAY_F_DEGRADED; + overlay_fm_count++; + if (overlay_fm_count == 1) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_DEGRADED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} + +void +overlay_fm_restore(overlay_dev_t *odd) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_DEGRADED)) + goto out; + + odd->odd_fmamsg[0] = '\0'; + odd->odd_flags &= ~OVERLAY_F_DEGRADED; + overlay_fm_count--; + if (overlay_fm_count == 0) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_RESTORED); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c new file mode 100644 index 0000000000..58e9f2665d --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ -0,0 +1,368 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Overlay device ksocket multiplexer. + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/ksynch.h> +#include <sys/ksocket.h> +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/pattr.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> + +#include <sys/overlay_impl.h> + +#include <sys/sdt.h> + +#define OVERLAY_FREEMSG(mp, reason) \ + DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason) + +static list_t overlay_mux_list; +static kmutex_t overlay_mux_lock; + +void +overlay_mux_init(void) +{ + list_create(&overlay_mux_list, sizeof (overlay_mux_t), + offsetof(overlay_mux_t, omux_lnode)); + mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_mux_fini(void) +{ + mutex_destroy(&overlay_mux_lock); + list_destroy(&overlay_mux_list); +} + +static int +overlay_mux_comparator(const void *a, const void *b) +{ + const overlay_dev_t *odl, *odr; + odl = a; + odr = b; + if (odl->odd_vid > odr->odd_vid) + return (1); + else if (odl->odd_vid < odr->odd_vid) + return (-1); + else + return (0); +} + +/* + * This is the central receive data path. We need to decode the packet, if we + * can, and then deliver it to the appropriate overlay. 
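+ *
+ * Concretely, for each message in the chain we strip the T_unitdata_ind
+ * (M_PROTO) block that the socket attached, clear any hardware checksum
+ * flags (we have no context for the inner packet), ask the plugin to decap
+ * it, use the resulting virtual network id to find the overlay device in
+ * the mux's AVL tree, and then hand the packet to MAC via mac_rx() while
+ * holding the device's Rx I/O count so that teardown can wait for us.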
+ */ +/* ARGSUSED */ +static boolean_t +overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob, + void *arg) +{ + mblk_t *mp, *nmp, *fmp; + overlay_mux_t *mux = arg; + + /* + * We may have a received a chain of messages. Each messsage in the + * chain will likely have a T_unitdata_ind attached to it as an M_PROTO. + * If we aren't getting that, we should probably drop that for the + * moment. + */ + for (mp = mpchain; mp != NULL; mp = nmp) { + struct T_unitdata_ind *tudi; + ovep_encap_info_t infop; + overlay_dev_t od, *odd; + int ret; + + nmp = mp->b_next; + mp->b_next = NULL; + + if (DB_TYPE(mp) != M_PROTO) { + OVERLAY_FREEMSG(mp, "first one isn't M_PROTO"); + freemsg(mp); + continue; + } + + if (mp->b_cont == NULL) { + OVERLAY_FREEMSG(mp, "missing a b_cont"); + freemsg(mp); + continue; + } + + tudi = (struct T_unitdata_ind *)mp->b_rptr; + if (tudi->PRIM_type != T_UNITDATA_IND) { + OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *"); + freemsg(mp); + continue; + } + + /* + * In the future, we'll care about the source information + * for purposes of telling varpd for oob invalidation. But for + * now, just drop that block. + */ + fmp = mp; + mp = fmp->b_cont; + freeb(fmp); + + /* + * Until we have VXLAN-or-other-decap HW acceleration support + * (e.g. we support NICs that reach into VXLAN-encapsulated + * packets and check the inside-VXLAN IP packets' checksums, + * or do LSO with VXLAN), we should clear any HW-accelerated- + * performed bits. + * + * We do this, even in cases of HW_LOCAL_MAC, because we + * absolutely have NO context about the inner packet. + * It could've arrived off an external NIC and been forwarded + * to the overlay network, which means no context. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* + * Decap and deliver. + */ + bzero(&infop, sizeof (ovep_encap_info_t)); + ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop); + if (ret != 0) { + OVERLAY_FREEMSG(mp, "decap failed"); + freemsg(mp); + continue; + } + if (MBLKL(mp) > infop.ovdi_hdr_size) { + mp->b_rptr += infop.ovdi_hdr_size; + } else { + while (infop.ovdi_hdr_size != 0) { + size_t rem, blkl; + + if (mp == NULL) + break; + + blkl = MBLKL(mp); + rem = MIN(infop.ovdi_hdr_size, blkl); + infop.ovdi_hdr_size -= rem; + mp->b_rptr += rem; + if (rem == blkl) { + fmp = mp; + mp = fmp->b_cont; + fmp->b_cont = NULL; + OVERLAY_FREEMSG(mp, + "freed a fmp block"); + freemsg(fmp); + } + } + if (mp == NULL) { + OVERLAY_FREEMSG(mp, "freed it all..."); + continue; + } + } + + + od.odd_vid = infop.ovdi_id; + mutex_enter(&mux->omux_lock); + odd = avl_find(&mux->omux_devices, &od, NULL); + if (odd == NULL) { + mutex_exit(&mux->omux_lock); + OVERLAY_FREEMSG(mp, "no matching vid"); + freemsg(mp); + continue; + } + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + OVERLAY_FREEMSG(mp, "dev dropped"); + freemsg(mp); + continue; + } + overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + mutex_exit(&mux->omux_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + } + + return (B_TRUE); +} + +/* + * Register a given device with a socket backend. If no such device socket + * exists, create a new one. 
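+ *
+ * Muxes are shared: they are keyed by (domain, family, protocol, bound
+ * address) and reference counted, so every overlay device using the same
+ * plugin and address shares a single ksocket. Asking for an existing mux
+ * with a different plugin fails with EEXIST.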
+ */ +overlay_mux_t * +overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol, + struct sockaddr *addr, socklen_t len, int *errp) +{ + int err; + overlay_mux_t *mux; + ksocket_t ksock; + + if (errp == NULL) + errp = &err; + + mutex_enter(&overlay_mux_lock); + for (mux = list_head(&overlay_mux_list); mux != NULL; + mux = list_next(&overlay_mux_list, mux)) { + if (domain == mux->omux_domain && + family == mux->omux_family && + protocol == mux->omux_protocol && + len == mux->omux_alen && + bcmp(addr, mux->omux_addr, len) == 0) { + + if (opp != mux->omux_plugin) { + *errp = EEXIST; + return (NULL); + } + + mutex_enter(&mux->omux_lock); + mux->omux_count++; + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + *errp = 0; + return (mux); + } + } + + /* + * Today we aren't zone-aware and only exist in the global zone. When we + * allow for things to exist in the non-global zone, we'll want to use a + * credential that's actually specific to the zone. + */ + *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP, + kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + return (NULL); + } + + *errp = ksocket_bind(ksock, addr, len, kcred); + if (*errp != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + /* + * Ask our lower layer to optionally toggle anything they need on this + * socket. Because a socket is owned by a single type of plugin, we can + * then ask it to perform any additional socket set up it'd like to do. + */ + if (opp->ovp_ops->ovpo_sockopt != NULL && + (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) { + mutex_exit(&overlay_mux_lock); + ksocket_close(ksock, kcred); + return (NULL); + } + + mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP); + list_link_init(&mux->omux_lnode); + mux->omux_ksock = ksock; + mux->omux_plugin = opp; + mux->omux_domain = domain; + mux->omux_family = family; + mux->omux_protocol = protocol; + mux->omux_addr = kmem_alloc(len, KM_SLEEP); + bcopy(addr, mux->omux_addr, len); + mux->omux_alen = len; + mux->omux_count = 1; + avl_create(&mux->omux_devices, overlay_mux_comparator, + sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode)); + mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL); + + + /* Once this is called, we need to expect to rx data */ + *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux); + if (*errp != 0) { + ksocket_close(ksock, kcred); + mutex_destroy(&mux->omux_lock); + avl_destroy(&mux->omux_devices); + kmem_free(mux->omux_addr, len); + kmem_free(mux, sizeof (overlay_mux_t)); + return (NULL); + } + + list_insert_tail(&overlay_mux_list, mux); + mutex_exit(&overlay_mux_lock); + + *errp = 0; + return (mux); +} + +void +overlay_mux_close(overlay_mux_t *mux) +{ + mutex_enter(&overlay_mux_lock); + mutex_enter(&mux->omux_lock); + mux->omux_count--; + if (mux->omux_count != 0) { + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + return; + } + list_remove(&overlay_mux_list, mux); + mutex_exit(&mux->omux_lock); + mutex_exit(&overlay_mux_lock); + + ksocket_close(mux->omux_ksock, kcred); + avl_destroy(&mux->omux_devices); + kmem_free(mux->omux_addr, mux->omux_alen); + kmem_free(mux, sizeof (overlay_mux_t)); +} + +void +overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); + avl_add(&mux->omux_devices, odd); + mutex_exit(&mux->omux_lock); +} + +void +overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd) +{ + mutex_enter(&mux->omux_lock); + avl_remove(&mux->omux_devices, odd); 
+ mutex_exit(&mux->omux_lock); +} + +int +overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp) +{ + int ret; + + /* + * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately, + * that isn't actually supported by UDP at this time. + */ + ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred); + if (ret != 0) + freemsg(mp); + + return (ret); +} diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c new file mode 100644 index 0000000000..348ddb92a2 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_plugin.c @@ -0,0 +1,281 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Overlay device encapsulation plugin management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/sysmacros.h> +#include <sys/modctl.h> + +#include <sys/overlay_impl.h> + +static kmem_cache_t *overlay_plugin_cache; +static kmutex_t overlay_plugin_lock; +static list_t overlay_plugin_list; + +#define OVERLAY_MODDIR "overlay" + +/* ARGSUSED */ +static int +overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags) +{ + overlay_plugin_t *opp = buf; + + mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL); + list_link_init(&opp->ovp_link); + + return (0); +} + +/* ARGSUSED */ +static void +overlay_plugin_cache_destructor(void *buf, void *arg) +{ + overlay_plugin_t *opp = buf; + ASSERT(list_link_active(&opp->ovp_link) == 0); + mutex_destroy(&opp->ovp_mutex); +} + +void +overlay_plugin_init(void) +{ + mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0); + + /* + * In the future we may want to have a reaper to unload unused modules + * to help the kernel be able to reclaim memory. 
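+	 *
+	 * (Loading is already on-demand today: overlay_plugin_lookup() will
+	 * attempt a single modload() of a missing plugin from the "overlay"
+	 * module directory before giving up.)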
+ */ + overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache", + sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor, + overlay_plugin_cache_destructor, NULL, NULL, NULL, 0); + list_create(&overlay_plugin_list, sizeof (overlay_plugin_t), + offsetof(overlay_plugin_t, ovp_link)); +} + +void +overlay_plugin_fini(void) +{ + mutex_enter(&overlay_plugin_lock); + VERIFY(list_is_empty(&overlay_plugin_list)); + mutex_exit(&overlay_plugin_lock); + + list_destroy(&overlay_plugin_list); + kmem_cache_destroy(overlay_plugin_cache); + mutex_destroy(&overlay_plugin_lock); +} + +overlay_plugin_register_t * +overlay_plugin_alloc(uint_t version) +{ + overlay_plugin_register_t *ovrp; + /* Version 1 is the only one that exists */ + if (version != OVEP_VERSION_ONE) + return (NULL); + + ovrp = kmem_zalloc(sizeof (overlay_plugin_register_t), KM_SLEEP); + ovrp->ovep_version = version; + return (ovrp); +} + +void +overlay_plugin_free(overlay_plugin_register_t *ovrp) +{ + kmem_free(ovrp, sizeof (overlay_plugin_register_t)); +} + +int +overlay_plugin_register(overlay_plugin_register_t *ovrp) +{ + overlay_plugin_t *opp, *ipp; + + /* Sanity check parameters of the registration */ + if (ovrp->ovep_version != OVEP_VERSION_ONE) + return (EINVAL); + + if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL) + return (EINVAL); + + if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0) + return (EINVAL); + + if (ovrp->ovep_id_size < 1) + return (EINVAL); + + /* Don't support anything that has an id size larger than 8 bytes */ + if (ovrp->ovep_id_size > 8) + return (ENOTSUP); + + if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID) + return (EINVAL); + + if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0) + return (EINVAL); + + if (ovrp->ovep_ops->ovpo_callbacks != 0) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_init == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_fini == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_encap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_decap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_socket == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_getprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_setprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_propinfo == NULL) + return (EINVAL); + + + opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP); + opp->ovp_active = 0; + opp->ovp_name = ovrp->ovep_name; + opp->ovp_ops = ovrp->ovep_ops; + opp->ovp_props = ovrp->ovep_props; + opp->ovp_id_size = ovrp->ovep_id_size; + opp->ovp_flags = ovrp->ovep_flags; + opp->ovp_dest = ovrp->ovep_dest; + + opp->ovp_nprops = 0; + if (ovrp->ovep_props != NULL) { + while (ovrp->ovep_props[opp->ovp_nprops] != NULL) { + if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >= + OVERLAY_PROP_NAMELEN) { + mutex_exit(&overlay_plugin_lock); + kmem_cache_free(overlay_plugin_cache, opp); + return (EINVAL); + } + opp->ovp_nprops++; + } + } + + mutex_enter(&overlay_plugin_lock); + for (ipp = list_head(&overlay_plugin_list); ipp != NULL; + ipp = list_next(&overlay_plugin_list, ipp)) { + if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) { + mutex_exit(&overlay_plugin_lock); + kmem_cache_free(overlay_plugin_cache, opp); + return (EEXIST); + } + } + list_insert_tail(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + return (0); +} + +int +overlay_plugin_unregister(const char *name) +{ + overlay_plugin_t *opp; + + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if 
(strcmp(opp->ovp_name, name) == 0) + break; + } + + if (opp == NULL) { + mutex_exit(&overlay_plugin_lock); + return (ENOENT); + } + + mutex_enter(&opp->ovp_mutex); + if (opp->ovp_active > 0) { + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (EBUSY); + } + mutex_exit(&opp->ovp_mutex); + + list_remove(&overlay_plugin_list, opp); + mutex_exit(&overlay_plugin_lock); + + kmem_cache_free(overlay_plugin_cache, opp); + return (0); +} + +overlay_plugin_t * +overlay_plugin_lookup(const char *name) +{ + overlay_plugin_t *opp; + boolean_t trymodload = B_FALSE; + + for (;;) { + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (strcmp(name, opp->ovp_name) == 0) { + mutex_enter(&opp->ovp_mutex); + opp->ovp_active++; + mutex_exit(&opp->ovp_mutex); + mutex_exit(&overlay_plugin_lock); + return (opp); + } + } + mutex_exit(&overlay_plugin_lock); + + if (trymodload == B_TRUE) + return (NULL); + + /* + * If we didn't find it, it may still exist, but just not have + * been a loaded module. In that case, we'll do one attempt to + * load it. + */ + if (modload(OVERLAY_MODDIR, (char *)name) == -1) + return (NULL); + trymodload = B_TRUE; + } + +} + +void +overlay_plugin_rele(overlay_plugin_t *opp) +{ + mutex_enter(&opp->ovp_mutex); + ASSERT(opp->ovp_active > 0); + opp->ovp_active--; + mutex_exit(&opp->ovp_mutex); +} + +void +overlay_plugin_walk(overlay_plugin_walk_f func, void *arg) +{ + overlay_plugin_t *opp; + mutex_enter(&overlay_plugin_lock); + for (opp = list_head(&overlay_plugin_list); opp != NULL; + opp = list_next(&overlay_plugin_list, opp)) { + if (func(opp, arg) != 0) { + mutex_exit(&overlay_plugin_lock); + return; + } + } + mutex_exit(&overlay_plugin_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c new file mode 100644 index 0000000000..ba1ea2a629 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_prop.c @@ -0,0 +1,122 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +/* + * Routines for manipulating property information structures. 
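+ *
+ * The overlay_prop_handle_t used here is an overlay_ioc_propinfo_t in
+ * disguise; the same helpers are used by the driver for its generic device
+ * properties and handed to plugins so that their ovpo_propinfo() entry
+ * points can describe their own properties.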
+ * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/overlay_impl.h> + +void +overlay_prop_init(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + infop->oipi_posssize = sizeof (mac_propval_range_t); + bzero(rangep, sizeof (mac_propval_range_t)); +} + +void +overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + (void) strlcpy(infop->oipi_name, name, OVERLAY_PROP_NAMELEN); +} + +void +overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_prot = prot; +} + +void +overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_type = type; +} + +int +overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + + if (len > OVERLAY_PROP_SIZEMAX) + return (E2BIG); + + if (len < 0) + return (EOVERFLOW); + + bcopy(def, infop->oipi_default, len); + infop->oipi_defsize = (uint32_t)len; + + return (0); +} + +void +overlay_prop_set_nodefault(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_default[0] = '\0'; + infop->oipi_defsize = 0; +} + +void +overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min, + uint32_t max) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32) + return; + + if (infop->oipi_posssize + sizeof (mac_propval_uint32_range_t) > + sizeof (infop->oipi_poss)) + return; + + infop->oipi_posssize += sizeof (mac_propval_uint32_range_t); + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_UINT32; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_min = min; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_max = max; +} + +void +overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str) +{ + size_t len = strlen(str) + 1; /* Account for a null terminator */ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + mac_propval_str_range_t *pstr = &rangep->u.mpr_str; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR) + return; + + if (infop->oipi_posssize + len > sizeof (infop->oipi_poss)) + return; + + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_STR; + strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str, + sizeof (infop->oipi_poss) - infop->oipi_posssize); + pstr->mpur_nextbyte += len; + infop->oipi_posssize += len; +} diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c new file mode 100644 index 0000000000..f4147b56d1 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_target.c @@ -0,0 +1,1651 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Overlay device target cache management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/sysmacros.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/mac_provider.h> +#include <sys/mac_client.h> +#include <sys/mac_client_priv.h> +#include <sys/vlan.h> +#include <sys/crc32.h> +#include <sys/cred.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#include <sys/overlay_impl.h> +#include <sys/sdt.h> + +/* + * This is total straw man, but at least it's a prime number. Here we're + * going to have to go through and do a lot of evaluation and understanding as + * to how these target caches should grow and shrink, as well as, memory + * pressure and evictions. This just gives us a starting point that'll be 'good + * enough', until it's not. + */ +#define OVERLAY_HSIZE 823 + +/* + * We use this data structure to keep track of what requests have been actively + * allocated to a given instance so we know what to put back on the pending + * list. + */ +typedef struct overlay_target_hdl { + minor_t oth_minor; /* RO */ + zoneid_t oth_zoneid; /* RO */ + int oth_oflags; /* RO */ + list_node_t oth_link; /* overlay_target_lock */ + kmutex_t oth_lock; + list_t oth_outstanding; /* oth_lock */ +} overlay_target_hdl_t; + +typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int); +typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *); +typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int); + +typedef struct overaly_target_ioctl { + int oti_cmd; /* ioctl id */ + boolean_t oti_write; /* ioctl requires FWRITE */ + boolean_t oti_ncopyout; /* copyout data? */ + overlay_target_copyin_f oti_copyin; /* copyin func */ + overlay_target_ioctl_f oti_func; /* function to call */ + overlay_target_copyout_f oti_copyout; /* copyin func */ + size_t oti_size; /* size of user level structure */ +} overlay_target_ioctl_t; + +static kmem_cache_t *overlay_target_cache; +static kmem_cache_t *overlay_entry_cache; +static id_space_t *overlay_thdl_idspace; +static void *overlay_thdl_state; + +/* + * When we support overlay devices in the NGZ, then all of these need to become + * zone aware, by plugging into the netstack engine and becoming per-netstack + * data. + */ +static list_t overlay_thdl_list; +static kmutex_t overlay_target_lock; +static kcondvar_t overlay_target_condvar; +static list_t overlay_target_list; +static boolean_t overlay_target_excl; + +/* + * Outstanding data per hash table entry. 
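+ *
+ * This is intended to bound how many bytes of packets may be queued on a
+ * single target entry (ote_chead/ote_mbsize) while its mapping is being
+ * resolved, so that an unresolvable destination can't pin an arbitrary
+ * amount of memory.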
+ */ +static int overlay_ent_size = 128 * 1024; + +/* ARGSUSED */ +static int +overlay_target_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_t *ott = buf; + + mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_target_cache_destructor(void *buf, void *arg) +{ + overlay_target_t *ott = buf; + + cv_destroy(&ott->ott_cond); + mutex_destroy(&ott->ott_lock); +} + +/* ARGSUSED */ +static int +overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_entry_t *ote = buf; + + bzero(ote, sizeof (overlay_target_entry_t)); + mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_entry_cache_destructor(void *buf, void *arg) +{ + overlay_target_entry_t *ote = buf; + + mutex_destroy(&ote->ote_lock); +} + +static uint64_t +overlay_mac_hash(const void *v) +{ + uint32_t crc; + CRC32(crc, v, ETHERADDRL, -1U, crc32_table); + return (crc); +} + +static int +overlay_mac_cmp(const void *a, const void *b) +{ + return (bcmp(a, b, ETHERADDRL)); +} + +/* ARGSUSED */ +static void +overlay_target_entry_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + + ote->ote_flags = 0; + bzero(ote->ote_addr, ETHERADDRL); + ote->ote_ott = NULL; + ote->ote_odd = NULL; + freemsgchain(ote->ote_chead); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = 0; + kmem_cache_free(overlay_entry_cache, ote); +} + +static int +overlay_mac_avl(const void *a, const void *b) +{ + int i; + const overlay_target_entry_t *l, *r; + l = a; + r = b; + + for (i = 0; i < ETHERADDRL; i++) { + if (l->ote_addr[i] > r->ote_addr[i]) + return (1); + else if (l->ote_addr[i] < r->ote_addr[i]) + return (-1); + } + + return (0); +} + +void +overlay_target_init(void) +{ + int ret; + ret = ddi_soft_state_init(&overlay_thdl_state, + sizeof (overlay_target_hdl_t), 1); + VERIFY(ret == 0); + overlay_target_cache = kmem_cache_create("overlay_target", + sizeof (overlay_target_t), 0, overlay_target_cache_constructor, + overlay_target_cache_destructor, NULL, NULL, NULL, 0); + overlay_entry_cache = kmem_cache_create("overlay_entry", + sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor, + overlay_entry_cache_destructor, NULL, NULL, NULL, 0); + mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL); + list_create(&overlay_target_list, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t), + offsetof(overlay_target_hdl_t, oth_link)); + overlay_thdl_idspace = id_space_create("overlay_target_minors", + 1, INT32_MAX); +} + +void +overlay_target_fini(void) +{ + id_space_destroy(overlay_thdl_idspace); + list_destroy(&overlay_thdl_list); + list_destroy(&overlay_target_list); + cv_destroy(&overlay_target_condvar); + mutex_destroy(&overlay_target_lock); + kmem_cache_destroy(overlay_entry_cache); + kmem_cache_destroy(overlay_target_cache); + ddi_soft_state_fini(&overlay_thdl_state); +} + +void +overlay_target_free(overlay_dev_t *odd) +{ + if (odd->odd_target == NULL) + return; + + if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) { + refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash; + avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree; + overlay_target_entry_t *ote; + + /* + * Our AVL tree and hashtable contain the same elements, + * therefore we 
should just remove it from the tree, but then + * delete the entries when we remove them from the hash table + * (which happens through the refhash dtor). + */ + while ((ote = avl_first(ap)) != NULL) + avl_remove(ap, ote); + + avl_destroy(ap); + for (ote = refhash_first(rp); ote != NULL; + ote = refhash_next(rp, ote)) { + refhash_remove(rp, ote); + } + refhash_destroy(rp); + } + + ASSERT(odd->odd_target->ott_ocount == 0); + kmem_cache_free(overlay_target_cache, odd->odd_target); +} + +int +overlay_target_busy() +{ + int ret; + + mutex_enter(&overlay_target_lock); + ret = !list_is_empty(&overlay_thdl_list); + mutex_exit(&overlay_target_lock); + + return (ret); +} + +static void +overlay_target_queue(overlay_target_entry_t *entry) +{ + mutex_enter(&overlay_target_lock); + mutex_enter(&entry->ote_ott->ott_lock); + if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) { + mutex_exit(&entry->ote_ott->ott_lock); + mutex_exit(&overlay_target_lock); + return; + } + entry->ote_ott->ott_ocount++; + mutex_exit(&entry->ote_ott->ott_lock); + list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&overlay_target_lock); +} + +void +overlay_target_quiesce(overlay_target_t *ott) +{ + if (ott == NULL) + return; + mutex_enter(&ott->ott_lock); + ott->ott_flags |= OVERLAY_T_TEARDOWN; + while (ott->ott_ocount != 0) + cv_wait(&ott->ott_cond, &ott->ott_lock); + mutex_exit(&ott->ott_lock); +} + +/* + * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP | + * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at + * this time, say for NVGRE, we drop all packets that mcuh this. + */ +int +overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock, + socklen_t *slenp) +{ + int ret; + struct sockaddr_in6 *v6; + overlay_target_t *ott; + mac_header_info_t mhi; + overlay_target_entry_t *entry; + + ASSERT(odd->odd_target != NULL); + + /* + * At this point, the overlay device is in a mux which means that it's + * been activated. At this point, parts of the target, such as the mode + * and the destination are now read-only and we don't have to worry + * about synchronization for them. + */ + ott = odd->odd_target; + if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT)) + return (OVERLAY_TARGET_DROP); + + v6 = (struct sockaddr_in6 *)sock; + bzero(v6, sizeof (struct sockaddr_in6)); + v6->sin6_family = AF_INET6; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + mutex_enter(&ott->ott_lock); + bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(ott->ott_u.ott_point.otp_port); + mutex_exit(&ott->ott_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (OVERLAY_TARGET_OK); + } + + ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC); + + /* + * Note we only want the MAC address here, therefore we won't bother + * using mac_vlan_header_info(). If any caller needs the vlan info at + * this point, this should change to a call to mac_vlan_header_info(). 
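+	 *
+	 * From here the dynamic path resolves the destination MAC through
+	 * the target cache: a valid entry returns OVERLAY_TARGET_OK with
+	 * the socket address filled in, an entry marked for drop returns
+	 * OVERLAY_TARGET_DROP, and a miss (or a still-pending entry) queues
+	 * the packet and returns OVERLAY_TARGET_ASYNC for varpd to answer.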
+ */ + if (mac_header_info(odd->odd_mh, mp, &mhi) != 0) + return (OVERLAY_TARGET_DROP); + mutex_enter(&ott->ott_lock); + entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + mhi.mhi_daddr); + if (entry == NULL) { + entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI); + if (entry == NULL) { + mutex_exit(&ott->ott_lock); + return (OVERLAY_TARGET_DROP); + } + bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL); + entry->ote_chead = entry->ote_ctail = mp; + entry->ote_mbsize = msgsize(mp); + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + entry->ote_ott = ott; + entry->ote_odd = odd; + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + avl_add(&ott->ott_u.ott_dyn.ott_tree, entry); + mutex_exit(&ott->ott_lock); + overlay_target_queue(entry); + return (OVERLAY_TARGET_ASYNC); + } + refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(entry->ote_dest.otp_port); + *slenp = sizeof (struct sockaddr_in6); + ret = OVERLAY_TARGET_OK; + } else { + size_t mlen = msgsize(mp); + + if (mlen + entry->ote_mbsize > overlay_ent_size) { + ret = OVERLAY_TARGET_DROP; + } else { + if (entry->ote_ctail != NULL) { + ASSERT(entry->ote_ctail->b_next == + NULL); + entry->ote_ctail->b_next = mp; + entry->ote_ctail = mp; + } else { + entry->ote_chead = mp; + entry->ote_ctail = mp; + } + entry->ote_mbsize += mlen; + if ((entry->ote_flags & + OVERLAY_ENTRY_F_PENDING) == 0) { + entry->ote_flags |= + OVERLAY_ENTRY_F_PENDING; + overlay_target_queue(entry); + } + ret = OVERLAY_TARGET_ASYNC; + } + } + mutex_exit(&entry->ote_lock); + + mutex_enter(&ott->ott_lock); + refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_info(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_info_t *oti = arg; + + odd = overlay_hold_by_dlid(oti->oti_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + oti->oti_flags = 0; + oti->oti_needs = odd->odd_plugin->ovp_dest; + if (odd->odd_flags & OVERLAY_F_DEGRADED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED; + if (odd->odd_flags & OVERLAY_F_ACTIVATED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE; + oti->oti_vnetid = odd->odd_vid; + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_associate_t *ota = arg; + + odd = overlay_hold_by_dlid(ota->ota_linkid); + if (odd == NULL) + return (ENOENT); + + if (ota->ota_id == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode != OVERLAY_TARGET_POINT && + ota->ota_mode != OVERLAY_TARGET_DYNAMIC) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_provides != odd->odd_plugin->ovp_dest) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode == OVERLAY_TARGET_POINT) { + if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) { + if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + + if (ota->ota_provides & 
OVERLAY_PLUGIN_D_PORT) { + if (ota->ota_point.otp_port == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + } + + ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP); + ott->ott_flags = 0; + ott->ott_ocount = 0; + ott->ott_mode = ota->ota_mode; + ott->ott_dest = ota->ota_provides; + ott->ott_id = ota->ota_id; + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + bcopy(&ota->ota_point, &ott->ott_u.ott_point, + sizeof (overlay_target_point_t)); + } else { + ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE, + overlay_mac_hash, overlay_mac_cmp, + overlay_target_entry_dtor, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_reflink), + offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP); + avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl, + sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_avllink)); + } + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_VARPD) { + mutex_exit(&odd->odd_lock); + kmem_cache_free(overlay_target_cache, ott); + overlay_hold_rele(odd); + return (EEXIST); + } + + odd->odd_flags |= OVERLAY_F_VARPD; + odd->odd_target = ott; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + + + return (0); +} + + +/* ARGSUSED */ +static int +overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_degrade_t *otd = arg; + + odd = overlay_hold_by_dlid(otd->otd_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_degrade(odd, otd->otd_buf); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_restore(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_fm_restore(odd); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_id_t *otid = arg; + + odd = overlay_hold_by_dlid(otid->otid_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + odd->odd_flags &= ~OVERLAY_F_VARPD; + mutex_exit(&odd->odd_lock); + + overlay_hold_rele(odd); + return (0); + +} + +static int +overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_lookup_t *otl = arg; + overlay_target_entry_t *entry; + clock_t ret, timeout; + mac_header_info_t mhi; + + timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC); +again: + mutex_enter(&overlay_target_lock); + while (list_is_empty(&overlay_target_list)) { + ret = cv_timedwait(&overlay_target_condvar, + &overlay_target_lock, timeout); + if (ret == -1) { + mutex_exit(&overlay_target_lock); + return (ETIME); + } + } + entry = list_remove_head(&overlay_target_list); + mutex_exit(&overlay_target_lock); + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + ASSERT(entry->ote_chead == NULL); + mutex_exit(&entry->ote_lock); + goto again; + } + ASSERT(entry->ote_chead != NULL); + + /* + * If we have a bogon that doesn't have a valid mac header, drop it and + * try again. 
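+	 *
+	 * Only the offending head message is freed; if more packets remain
+	 * chained on the entry, the entry goes back on the request queue so
+	 * a later lookup request will pick it up.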
+ */ + if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead, + &mhi) != 0) { + boolean_t queue = B_FALSE; + mblk_t *mp = entry->ote_chead; + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + if (entry->ote_chead != NULL) + queue = B_TRUE; + mutex_exit(&entry->ote_lock); + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + goto again; + } + + otl->otl_dlid = entry->ote_odd->odd_linkid; + otl->otl_reqid = (uintptr_t)entry; + otl->otl_varpdid = entry->ote_ott->ott_id; + otl->otl_vnetid = entry->ote_odd->odd_vid; + + otl->otl_hdrsize = mhi.mhi_hdrsize; + otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize; + bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL); + bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL); + otl->otl_dsttype = mhi.mhi_dsttype; + otl->otl_sap = mhi.mhi_bindsap; + otl->otl_vlan = VLAN_ID(mhi.mhi_tci); + mutex_exit(&entry->ote_lock); + + mutex_enter(&thdl->oth_lock); + list_insert_tail(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + return (0); +} + +static int +overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + bcopy(&otr->otr_answer, &entry->ote_dest, + sizeof (overlay_target_point_t)); + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + mp = entry->ote_chead; + entry->ote_chead = NULL; + entry->ote_ctail = NULL; + entry->ote_mbsize = 0; + entry->ote_vtime = gethrtime(); + mutex_exit(&entry->ote_lock); + + /* + * For now do an in-situ drain. 
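+	 * The queued chain is pushed through overlay_m_tx() on this thread;
+	 * anything the mux fails to send is handed back and freed.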
+ */ + mp = overlay_m_tx(entry->ote_odd, mp); + freemsgchain(mp); + + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +static int +overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + boolean_t queue = B_FALSE; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + + /* Safeguard against a confused varpd */ + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + DTRACE_PROBE1(overlay__target__valid__drop, + overlay_target_entry_t *, entry); + mutex_exit(&entry->ote_lock); + goto done; + } + + mp = entry->ote_chead; + if (mp != NULL) { + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + } + if (entry->ote_chead != NULL) { + queue = B_TRUE; + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + } else { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + } + mutex_exit(&entry->ote_lock); + + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + +done: + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_pkt_t *pkt; + overlay_targ_pkt32_t *pkt32; + + pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP); + *outp = pkt; + *bsize = sizeof (overlay_targ_pkt_t); + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + uintptr_t addr; + + if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + pkt32 = (overlay_targ_pkt32_t *)pkt; + addr = pkt32->otp_buf; + pkt->otp_buf = (void *)addr; + } else { + if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + } + return (0); +} + +static int +overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + overlay_targ_pkt_t *pkt = buf; + overlay_targ_pkt32_t *pkt32 = buf; + uintptr_t addr = (uintptr_t)pkt->otp_buf; + pkt32->otp_buf = (caddr32_t)addr; + if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + } else { + if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +static int +overlay_target_packet(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + size_t mlen; + size_t boff; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + 
mutex_enter(&entry->ote_lock); + mutex_exit(&thdl->oth_lock); + mp = entry->ote_chead; + /* Protect against a rogue varpd */ + if (mp == NULL) { + mutex_exit(&entry->ote_lock); + return (EINVAL); + } + mlen = MIN(msgsize(mp), pkt->otp_size); + pkt->otp_size = mlen; + boff = 0; + while (mlen > 0) { + size_t wlen = MIN(MBLKL(mp), mlen); + if (ddi_copyout(mp->b_rptr, + (void *)((uintptr_t)pkt->otp_buf + boff), + wlen, 0) != 0) { + mutex_exit(&entry->ote_lock); + return (EFAULT); + } + mlen -= wlen; + boff += wlen; + mp = mp->b_cont; + } + mutex_exit(&entry->ote_lock); + return (0); +} + +static int +overlay_target_inject(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mutex_enter(&odd->odd_lock); + overlay_io_start(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + mac_rx(odd->odd_mh, NULL, mp); + + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_RX); + mutex_exit(&odd->odd_lock); + + return (0); +} + +static int +overlay_target_resend(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + overlay_dev_t *odd; + mblk_t *mp; + + if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ) + return (EINVAL); + + mp = allocb(pkt->otp_size, 0); + if (mp == NULL) + return (ENOMEM); + + if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) { + freeb(mp); + return (EFAULT); + } + mp->b_wptr += pkt->otp_size; + + if (pkt->otp_linkid != UINT64_MAX) { + odd = overlay_hold_by_dlid(pkt->otp_linkid); + if (odd == NULL) { + freeb(mp); + return (ENOENT); + } + } else { + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + freeb(mp); + return (ENOENT); + } + odd = entry->ote_odd; + mutex_exit(&thdl->oth_lock); + } + + mp = overlay_m_tx(odd, mp); + freemsgchain(mp); + + return (0); +} + +typedef struct overlay_targ_list_int { + boolean_t otli_count; + uint32_t otli_cur; + uint32_t otli_nents; + uint32_t otli_ents[]; +} overlay_targ_list_int_t; + +static int +overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_list_t n; + overlay_targ_list_int_t *otl; + + if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + /* + */ + if (n.otl_nents >= INT32_MAX / sizeof (uint32_t)) + return (EINVAL); + *bsize = sizeof (overlay_targ_list_int_t) + + sizeof (uint32_t) * n.otl_nents; + otl = kmem_zalloc(*bsize, KM_SLEEP); + otl->otli_cur = 0; + otl->otli_nents = n.otl_nents; + if 
(otl->otli_nents != 0) { + otl->otli_count = B_FALSE; + if (ddi_copyin((void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + otl->otli_ents, n.otl_nents * sizeof (uint32_t), + flags & FKIOCTL) != 0) { + kmem_free(otl, *bsize); + return (EFAULT); + } + } else { + otl->otli_count = B_TRUE; + } + + *outp = otl; + return (0); +} + +static int +overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg) +{ + overlay_targ_list_int_t *otl = arg; + + if (otl->otli_cur < otl->otli_nents) + otl->otli_ents[otl->otli_cur] = odd->odd_linkid; + otl->otli_cur++; + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_iter(overlay_target_ioctl_list_cb, arg); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags) +{ + overlay_targ_list_int_t *otl = buf; + + if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (otl->otli_count == B_FALSE) { + if (ddi_copyout(otl->otli_ents, + (void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + sizeof (uint32_t) * otl->otli_nents, + flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_POINT && + ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + otc->otc_entry.otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } else { + overlay_target_entry_t *ote; + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + if ((ote->ote_flags & + OVERLAY_ENTRY_F_VALID_MASK) != 0) { + if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_DROP; + } else { + otc->otc_entry.otce_flags = 0; + bcopy(&ote->ote_dest, + &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } + ret = 0; + } else { + ret = ENOENT; + } + mutex_exit(&ote->ote_lock); + } else { + ret = ENOENT; + } + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + mblk_t *mp = NULL; + + if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP) + return (EINVAL); + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + 
mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote == NULL) { + ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); + bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_ott = ott; + ote->ote_odd = odd; + mutex_enter(&ote->ote_lock); + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote); + avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); + } else { + mutex_enter(&ote->ote_lock); + } + + if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) { + ote->ote_flags |= OVERLAY_ENTRY_F_DROP; + } else { + ote->ote_flags |= OVERLAY_ENTRY_F_VALID; + bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest, + sizeof (overlay_target_point_t)); + mp = ote->ote_chead; + ote->ote_chead = NULL; + ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = gethrtime(); + } + + mutex_exit(&ote->ote_lock); + mutex_exit(&ott->ott_lock); + + if (mp != NULL) { + mp = overlay_m_tx(ote->ote_odd, mp); + freemsgchain(mp); + } + + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + ret = 0; + } else { + ret = ENOENT; + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg) +{ + avl_tree_t *avl; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + avl = &ott->ott_u.ott_dyn.ott_tree; + + for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + } + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_cache_iter_t base, *iter; + + if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (base.otci_count > OVERLAY_TARGET_ITER_MAX) + return (E2BIG); + + if 
(base.otci_count == 0) + return (EINVAL); + + *bsize = sizeof (overlay_targ_cache_iter_t) + + base.otci_count * sizeof (overlay_targ_cache_entry_t); + iter = kmem_alloc(*bsize, KM_SLEEP); + bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t)); + *outp = iter; + + return (0); +} + +typedef struct overlay_targ_cache_marker { + uint8_t otcm_mac[ETHERADDRL]; + uint16_t otcm_done; +} overlay_targ_cache_marker_t; + +/* ARGSUSED */ +static int +overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t lookup, *ent; + overlay_targ_cache_marker_t *mark; + avl_index_t where; + avl_tree_t *avl; + uint16_t written = 0; + + overlay_targ_cache_iter_t *iter = arg; + mark = (void *)&iter->otci_marker; + + if (mark->otcm_done != 0) { + iter->otci_count = 0; + return (0); + } + + odd = overlay_hold_by_dlid(iter->otci_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC && + ott->ott_mode != OVERLAY_TARGET_POINT) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + + /* + * Holding this lock across the entire iteration probably isn't very + * good. We should perhaps add an r/w lock for the avl tree. But we'll + * wait until we now it's necessary before we do more. + */ + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[0]; + bzero(out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mark->otcm_done = 1; + } + + avl = &ott->ott_u.ott_dyn.ott_tree; + bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL); + ent = avl_find(avl, &lookup, &where); + + /* + * NULL ent means that the entry does not exist, so we want to start + * with the closest node in the tree. This means that we implicitly rely + * on the tree's order and the first node will be the mac 00:00:00:00:00 + * and the last will be ff:ff:ff:ff:ff:ff. 
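+	 *
+	 * The MAC saved in the marker is where the next call will resume;
+	 * once iteration runs off the end of the tree, otcm_done is set and
+	 * any further calls simply return zero entries.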
+ */ + if (ent == NULL) { + ent = avl_nearest(avl, where, AVL_AFTER); + if (ent == NULL) { + mark->otcm_done = 1; + goto done; + } + } + + for (; ent != NULL && written < iter->otci_count; + ent = AVL_NEXT(avl, ent)) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[written]; + mutex_enter(&ent->ote_lock); + if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) { + mutex_exit(&ent->ote_lock); + continue; + } + bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + if (ent->ote_flags & OVERLAY_ENTRY_F_DROP) + out->otce_flags |= OVERLAY_TARGET_CACHE_DROP; + if (ent->ote_flags & OVERLAY_ENTRY_F_VALID) + bcopy(&ent->ote_dest, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mutex_exit(&ent->ote_lock); + } + + if (ent != NULL) { + bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL); + } else { + mark->otcm_done = 1; + } + +done: + iter->otci_count = written; + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + size_t outsize; + const overlay_targ_cache_iter_t *iter = buf; + + outsize = sizeof (overlay_targ_cache_iter_t) + + iter->otci_count * sizeof (overlay_targ_cache_entry_t); + + if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static overlay_target_ioctl_t overlay_target_ioctab[] = { + { OVERLAY_TARG_INFO, B_TRUE, B_TRUE, + NULL, overlay_target_info, + NULL, sizeof (overlay_targ_info_t) }, + { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_associate, + NULL, sizeof (overlay_targ_associate_t) }, + { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_disassociate, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE, + NULL, overlay_target_degrade, + NULL, sizeof (overlay_targ_degrade_t) }, + { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE, + NULL, overlay_target_restore, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE, + NULL, overlay_target_lookup_request, + NULL, sizeof (overlay_targ_lookup_t) }, + { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_respond, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_DROP, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_drop, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_PKT, B_TRUE, B_TRUE, + overlay_target_pkt_copyin, + overlay_target_packet, + overlay_target_pkt_copyout, + sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_inject, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_resend, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_LIST, B_FALSE, B_TRUE, + overlay_target_list_copyin, + overlay_target_ioctl_list, + overlay_target_list_copyout, + sizeof (overlay_targ_list_t) }, + { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE, + NULL, overlay_target_cache_get, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE, + NULL, overlay_target_cache_set, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE, + NULL, overlay_target_cache_remove, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE, + NULL, overlay_target_cache_flush, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE, + overlay_target_cache_iter_copyin, + 
overlay_target_cache_iter, + overlay_target_cache_iter_copyout, + sizeof (overlay_targ_cache_iter_t) }, + { 0 } +}; + +int +overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp) +{ + minor_t mid; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if (getminor(*devp) != 0) + return (ENXIO); + + if (otype & OTYP_BLK) + return (EINVAL); + + if (flags & ~(FREAD | FWRITE | FEXCL)) + return (EINVAL); + + if ((flags & FWRITE) && + !(flags & FEXCL)) + return (EINVAL); + + if (!(flags & FREAD) && !(flags & FWRITE)) + return (EINVAL); + + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EPERM); + + mid = id_alloc(overlay_thdl_idspace); + if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) { + id_free(overlay_thdl_idspace, mid); + return (ENXIO); + } + + thdl = ddi_get_soft_state(overlay_thdl_state, mid); + VERIFY(thdl != NULL); + thdl->oth_minor = mid; + thdl->oth_zoneid = crgetzoneid(credp); + thdl->oth_oflags = flags; + mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + *devp = makedevice(getmajor(*devp), mid); + + mutex_enter(&overlay_target_lock); + if ((flags & FEXCL) && overlay_target_excl == B_TRUE) { + mutex_exit(&overlay_target_lock); + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + return (EEXIST); + } else if ((flags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_FALSE); + overlay_target_excl = B_TRUE; + } + list_insert_tail(&overlay_thdl_list, thdl); + mutex_exit(&overlay_target_lock); + + return (0); +} + +/* ARGSUSED */ +int +overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + overlay_target_ioctl_t *ioc; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, + getminor(dev))) == NULL) + return (ENXIO); + + for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) { + int ret; + caddr_t buf; + size_t bufsize; + + if (ioc->oti_cmd != cmd) + continue; + + if (ioc->oti_write == B_TRUE && !(mode & FWRITE)) + return (EBADF); + + if (ioc->oti_copyin == NULL) { + bufsize = ioc->oti_size; + buf = kmem_alloc(bufsize, KM_SLEEP); + if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize, + mode & FKIOCTL) != 0) { + kmem_free(buf, bufsize); + return (EFAULT); + } + } else { + if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg, + (void **)&buf, &bufsize, mode)) != 0) + return (ret); + } + + ret = ioc->oti_func(thdl, buf); + if (ret == 0 && ioc->oti_size != 0 && + ioc->oti_ncopyout == B_TRUE) { + if (ioc->oti_copyout == NULL) { + if (ddi_copyout(buf, (void *)(uintptr_t)arg, + bufsize, mode & FKIOCTL) != 0) + ret = EFAULT; + } else { + ret = ioc->oti_copyout((void *)(uintptr_t)arg, + buf, bufsize, mode); + } + } + + kmem_free(buf, bufsize); + return (ret); + } + + return (ENOTTY); +} + +/* ARGSUSED */ +int +overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp) +{ + overlay_target_hdl_t *thdl; + overlay_target_entry_t *entry; + minor_t mid = getminor(dev); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL) + return (ENXIO); + + mutex_enter(&overlay_target_lock); + list_remove(&overlay_thdl_list, thdl); + mutex_enter(&thdl->oth_lock); + while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL) + 
list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&thdl->oth_lock); + if ((thdl->oth_oflags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_TRUE); + overlay_target_excl = B_FALSE; + } + mutex_exit(&overlay_target_lock); + + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + mid = thdl->oth_minor; + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + + return (0); +} diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c new file mode 100644 index 0000000000..92144b3985 --- /dev/null +++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c @@ -0,0 +1,394 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * VXLAN encapsulation module + * + * + * The VXLAN header looks as follows in network byte order: + * + * |0 3| 4 |5 31| + * +----------+---+------------------------+ + * | Reserved | I | Reserved | + * +---------------------------------------+ + * | Virtual Network ID | Reserved | + * +----------------------------+----------+ + * |0 23|24 31| + * + * All reserved values must be 0. The I bit must be 1. We call the top + * word the VXLAN magic field for the time being. The second word is + * definitely not the most friendly way to operate. Specifically, the ID + * is a 24-bit big endian value, but we have to make sure not to use the + * reserved byte. + * + * For us, VXLAN encapsulation is a fairly straightforward implementation. It + * only has two properties, a listen_ip and a listen_port. These determine on + * what address we should be listening on. While we do not have a default + * address to listen upon, we do have a default port, which is the IANA assigned + * port for VXLAN -- 4789. + */ + +#include <sys/overlay_plugin.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/byteorder.h> +#include <sys/vxlan.h> +#include <inet/ip.h> +#include <netinet/in.h> +#include <sys/strsun.h> +#include <netinet/udp.h> + +static const char *vxlan_ident = "vxlan"; +static uint16_t vxlan_defport = IPPORT_VXLAN; + +/* + * Should we enable UDP source port hashing for fanout. + */ +boolean_t vxlan_fanout = B_TRUE; + +/* + * This represents the size in bytes that we want to allocate when allocating a + * vxlan header block. This is intended such that lower levels can try and use + * the message block that we allocate for the IP and UPD header. The hope is + * that even if this is tunneled, that this is enough space. + * + * The vxlan_noalloc_min value represents the minimum amount of space we need to + * consider not allocating a message block and just passing it down the stack in + * this form. This number assumes that we have a VLAN tag, so 18 byte Ethernet + * header, 20 byte IP header, 8 byte UDP header, and 8 byte VXLAN header. 
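+ * That works out to 18 + 20 + 8 + 8 = 54 bytes of headroom in front of the
+ * payload, which matches the default value below.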
+ */ +uint_t vxlan_alloc_size = 128; +uint_t vxlan_noalloc_min = 54; + +static const char *vxlan_props[] = { + "vxlan/listen_ip", + "vxlan/listen_port", + NULL +}; + +typedef struct vxlan { + kmutex_t vxl_lock; + overlay_handle_t vxl_oh; + uint16_t vxl_lport; + boolean_t vxl_hladdr; + struct in6_addr vxl_laddr; +} vxlan_t; + +static int +vxlan_o_init(overlay_handle_t oh, void **outp) +{ + vxlan_t *vxl; + + vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP); + *outp = vxl; + mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL); + vxl->vxl_oh = oh; + vxl->vxl_lport = vxlan_defport; + vxl->vxl_hladdr = B_FALSE; + + return (0); +} + +static void +vxlan_o_fini(void *arg) +{ + vxlan_t *vxl = arg; + + mutex_destroy(&vxl->vxl_lock); + kmem_free(arg, sizeof (vxlan_t)); +} + +static int +vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr, + socklen_t *slenp) +{ + vxlan_t *vxl = arg; + struct sockaddr_in6 *in; + + in = (struct sockaddr_in6 *)addr; + *dp = AF_INET6; + *fp = SOCK_DGRAM; + *pp = 0; + bzero(in, sizeof (struct sockaddr_in6)); + in->sin6_family = AF_INET6; + + /* + * We should consider a more expressive private errno set that + * provider's can use. + */ + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + mutex_exit(&vxl->vxl_lock); + return (EINVAL); + } + in->sin6_port = htons(vxl->vxl_lport); + in->sin6_addr = vxl->vxl_laddr; + mutex_exit(&vxl->vxl_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (0); +} + +static int +vxlan_o_sockopt(ksocket_t ksock) +{ + int val, err; + if (vxlan_fanout == B_FALSE) + return (0); + + val = UDP_HASH_VXLAN; + err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val, + sizeof (val), kcred); + return (err); +} + +/* ARGSUSED */ +static int +vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop, + mblk_t **outp) +{ + mblk_t *ob; + vxlan_hdr_t *vxh; + + ASSERT(einfop->ovdi_id < (1 << 24)); + + if (DB_REF(mp) != 1 || mp->b_rptr - vxlan_noalloc_min < DB_BASE(mp)) { + /* + * This allocation could get hot. We may want to have a good + * way to cache and handle this allocation the same way that IP + * does with keeping around a message block per entry, or + * basically treating this as an immutable message block in the + * system. Basically freemsg() will be a nop, but we'll do the + * right thing with respect to the rest of the chain. 
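+		 *
+		 * Whichever message block ends up in front, the header is
+		 * written by backing b_rptr up VXLAN_HDR_LEN bytes and
+		 * storing the I flag plus the 24-bit VNI shifted into the
+		 * upper bits of the second word.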
+ */ + ob = allocb(vxlan_alloc_size, 0); + if (ob == NULL) + return (ENOMEM); + + ob->b_wptr = DB_LIM(ob); + ob->b_rptr = ob->b_wptr; + ob->b_cont = mp; + } else { + ob = mp; + } + ob->b_rptr -= VXLAN_HDR_LEN; + + vxh = (vxlan_hdr_t *)ob->b_rptr; + vxh->vxlan_flags = ntohl(VXLAN_F_VDI); + vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT); + *outp = ob; + + return (0); +} + +/* ARGSUSED */ +static int +vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop) +{ + vxlan_hdr_t *vxh; + + if (MBLKL(mp) < sizeof (vxlan_hdr_t)) + return (EINVAL); + vxh = (vxlan_hdr_t *)mp->b_rptr; + if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0) + return (EINVAL); + + dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT; + dinfop->ovdi_hdr_size = VXLAN_HDR_LEN; + + return (0); +} + +static int +vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + if (*bufsize < sizeof (struct in6_addr)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + *bufsize = 0; + } else { + bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr)); + *bufsize = sizeof (struct in6_addr); + } + mutex_exit(&vxl->vxl_lock); + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + uint64_t val; + if (*bufsize < sizeof (uint64_t)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + val = vxl->vxl_lport; + bcopy(&val, buf, sizeof (uint64_t)); + *bufsize = sizeof (uint64_t); + mutex_exit(&vxl->vxl_lock); + return (0); + } + + return (EINVAL); +} + +static int +vxlan_o_setprop(void *arg, const char *pr_name, const void *buf, + uint32_t bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + const struct in6_addr *ipv6 = buf; + if (bufsize != sizeof (struct in6_addr)) + return (EINVAL); + + if (IN6_IS_ADDR_V4COMPAT(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_MULTICAST(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_6TO4(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_V4MAPPED(ipv6)) { + ipaddr_t v4; + IN6_V4MAPPED_TO_IPADDR(ipv6, v4); + if (IN_MULTICAST(v4)) + return (EINVAL); + } + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_hladdr = B_TRUE; + bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr)); + mutex_exit(&vxl->vxl_lock); + + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + const uint64_t *valp = buf; + if (bufsize != 8) + return (EINVAL); + + if (*valp == 0 || *valp > UINT16_MAX) + return (EINVAL); + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_lport = *valp; + mutex_exit(&vxl->vxl_lock); + return (0); + } + return (EINVAL); +} + +static int +vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl) +{ + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[0]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP); + overlay_prop_set_nodefault(phdl); + return (0); + } + + if (strcmp(pr_name, vxlan_props[1]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[1]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + (void) overlay_prop_set_default(phdl, &vxlan_defport, + sizeof (vxlan_defport)); + overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX); + return (0); + } + + return (EINVAL); +} + +static struct overlay_plugin_ops vxlan_o_ops = { + 0, + vxlan_o_init, + 
vxlan_o_fini, + vxlan_o_encap, + vxlan_o_decap, + vxlan_o_socket, + vxlan_o_sockopt, + vxlan_o_getprop, + vxlan_o_setprop, + vxlan_o_propinfo +}; + +static struct modlmisc vxlan_modlmisc = { + &mod_miscops, + "VXLAN encap plugin" +}; + +static struct modlinkage vxlan_modlinkage = { + MODREV_1, + &vxlan_modlmisc +}; + +int +_init(void) +{ + int err; + overlay_plugin_register_t *ovrp; + + ovrp = overlay_plugin_alloc(OVEP_VERSION); + if (ovrp == NULL) + return (ENOTSUP); + ovrp->ovep_name = vxlan_ident; + ovrp->ovep_ops = &vxlan_o_ops; + ovrp->ovep_id_size = VXLAN_ID_LEN; + ovrp->ovep_flags = OVEP_F_VLAN_TAG; + ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT; + ovrp->ovep_props = vxlan_props; + + if ((err = overlay_plugin_register(ovrp)) == 0) { + if ((err = mod_install(&vxlan_modlinkage)) != 0) { + (void) overlay_plugin_unregister(vxlan_ident); + } + } + + overlay_plugin_free(ovrp); + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vxlan_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + if ((err = overlay_plugin_unregister(vxlan_ident)) != 0) + return (err); + + return (mod_remove(&vxlan_modlinkage)); +} diff --git a/usr/src/uts/common/io/pciex/pcie.c b/usr/src/uts/common/io/pciex/pcie.c index 95eaa5837f..2f424321f1 100644 --- a/usr/src/uts/common/io/pciex/pcie.c +++ b/usr/src/uts/common/io/pciex/pcie.c @@ -684,6 +684,7 @@ pcie_init_pfd(dev_info_t *dip) pfd_p->pe_bus_p = bus_p; pfd_p->pe_severity_flags = 0; + pfd_p->pe_severity_mask = 0; pfd_p->pe_orig_severity_flags = 0; pfd_p->pe_lock = B_FALSE; pfd_p->pe_valid = B_FALSE; @@ -840,6 +841,7 @@ pcie_rc_init_pfd(dev_info_t *dip, pf_data_t *pfd_p) { pfd_p->pe_bus_p = PCIE_DIP2DOWNBUS(dip); pfd_p->pe_severity_flags = 0; + pfd_p->pe_severity_mask = 0; pfd_p->pe_orig_severity_flags = 0; pfd_p->pe_lock = B_FALSE; pfd_p->pe_valid = B_FALSE; @@ -921,7 +923,7 @@ pcie_rc_init_bus(dev_info_t *dip) bus_p->bus_aer_off = (uint16_t)-1; /* Needed only for handle lookup */ - bus_p->bus_fm_flags |= PF_FM_READY; + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_READY); ndi_set_bus_private(dip, B_FALSE, DEVI_PORT_TYPE_PCI, bus_p); @@ -938,6 +940,172 @@ pcie_rc_fini_bus(dev_info_t *dip) } /* + * We need to capture the supported, maximum, and current device speed and + * width. The way that this has been done has changed over time. + * + * Prior to PCIe Gen 3, there were only current and supported speed fields. + * These were found in the link status and link capabilities registers of the + * PCI express capability. With the change to PCIe Gen 3, the information in the + * link capabilities changed to the maximum value. The supported speeds vector + * was moved to the link capabilities 2 register. + * + * Now, a device may not implement some of these registers. To determine whether + * or not it's here, we have to do the following. First, we need to check the + * revision of the PCI express capability. The link capabilities 2 register did + * not exist prior to version 2 of this register. + */ +static void +pcie_capture_speeds(pcie_bus_t *bus_p, pcie_req_id_t bdf, dev_info_t *rcdip) +{ + uint16_t vers, status; + uint32_t val, cap, cap2; + + if (!PCIE_IS_PCIE(bus_p)) + return; + + vers = pci_cfgacc_get16(rcdip, bdf, bus_p->bus_pcie_off + PCIE_PCIECAP); + if (vers == PCI_EINVAL16) + return; + vers &= PCIE_PCIECAP_VER_MASK; + + /* + * Verify the capability's version. 
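+	 * The Link Capabilities 2 register is only defined from version 2.0
+	 * of the capability onward; for version 1.0 devices we treat it as
+	 * absent (cap2 == 0) and rely on Link Capabilities alone below.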
+ */ + switch (vers) { + case PCIE_PCIECAP_VER_1_0: + cap2 = 0; + break; + case PCIE_PCIECAP_VER_2_0: + cap2 = pci_cfgacc_get32(rcdip, bdf, bus_p->bus_pcie_off + + PCIE_LINKCAP2); + if (cap2 == PCI_EINVAL32) + cap2 = 0; + break; + default: + /* Don't try and handle an unknown version */ + return; + } + + status = pci_cfgacc_get16(rcdip, bdf, bus_p->bus_pcie_off + + PCIE_LINKSTS); + cap = pci_cfgacc_get32(rcdip, bdf, bus_p->bus_pcie_off + PCIE_LINKCAP); + if (status == PCI_EINVAL16 || cap == PCI_EINVAL32) + return; + + switch (status & PCIE_LINKSTS_SPEED_MASK) { + case PCIE_LINKSTS_SPEED_2_5: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_2_5; + break; + case PCIE_LINKSTS_SPEED_5: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_5; + break; + case PCIE_LINKSTS_SPEED_8: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_8; + break; + default: + bus_p->bus_cur_speed = PCIE_LINK_SPEED_UNKNOWN; + break; + } + + switch (status & PCIE_LINKSTS_NEG_WIDTH_MASK) { + case PCIE_LINKSTS_NEG_WIDTH_X1: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X1; + break; + case PCIE_LINKSTS_NEG_WIDTH_X2: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X2; + break; + case PCIE_LINKSTS_NEG_WIDTH_X4: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X4; + break; + case PCIE_LINKSTS_NEG_WIDTH_X8: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X8; + break; + case PCIE_LINKSTS_NEG_WIDTH_X12: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X12; + break; + case PCIE_LINKSTS_NEG_WIDTH_X16: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X16; + break; + case PCIE_LINKSTS_NEG_WIDTH_X32: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_X32; + break; + default: + bus_p->bus_cur_width = PCIE_LINK_WIDTH_UNKNOWN; + break; + } + + switch (cap & PCIE_LINKCAP_MAX_WIDTH_MASK) { + case PCIE_LINKCAP_MAX_WIDTH_X1: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X1; + break; + case PCIE_LINKCAP_MAX_WIDTH_X2: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X2; + break; + case PCIE_LINKCAP_MAX_WIDTH_X4: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X4; + break; + case PCIE_LINKCAP_MAX_WIDTH_X8: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X8; + break; + case PCIE_LINKCAP_MAX_WIDTH_X12: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X12; + break; + case PCIE_LINKCAP_MAX_WIDTH_X16: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X16; + break; + case PCIE_LINKCAP_MAX_WIDTH_X32: + bus_p->bus_max_width = PCIE_LINK_WIDTH_X32; + break; + default: + bus_p->bus_max_width = PCIE_LINK_WIDTH_UNKNOWN; + break; + } + + /* + * If we have the Link Capabilities 2, then we can get the supported + * speeds from it and treat the bits in Link Capabilities 1 as the + * maximum. If we don't, then we need to follow the Implementation Note + * in the standard under Link Capabilities 2. Effectively, this means + * that if the value of 10b is set in Link Capabilities register, that + * it supports both 2.5 and 5 GT/s speeds. 
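+	 *
+	 * Concretely, when Link Capabilities 2 is absent:
+	 *
+	 *	LINKCAP max speed 5.0 GT/s -> supported = { 2.5, 5.0 } GT/s
+	 *	LINKCAP max speed 2.5 GT/s -> supported = { 2.5 } GT/s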
+ */ + if (cap2 != 0) { + if (cap2 & PCIE_LINKCAP2_SPEED_2_5) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_2_5; + if (cap2 & PCIE_LINKCAP2_SPEED_5) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_5; + if (cap2 & PCIE_LINKCAP2_SPEED_8) + bus_p->bus_sup_speed |= PCIE_LINK_SPEED_8; + + switch (cap & PCIE_LINKCAP_MAX_SPEED_MASK) { + case PCIE_LINKCAP_MAX_SPEED_2_5: + bus_p->bus_max_speed = PCIE_LINK_SPEED_2_5; + break; + case PCIE_LINKCAP_MAX_SPEED_5: + bus_p->bus_max_speed = PCIE_LINK_SPEED_5; + break; + case PCIE_LINKCAP_MAX_SPEED_8: + bus_p->bus_max_speed = PCIE_LINK_SPEED_8; + break; + default: + bus_p->bus_max_speed = PCIE_LINK_SPEED_UNKNOWN; + break; + } + } else { + if (cap & PCIE_LINKCAP_MAX_SPEED_5) { + bus_p->bus_max_speed = PCIE_LINK_SPEED_5; + bus_p->bus_sup_speed = PCIE_LINK_SPEED_2_5 | + PCIE_LINK_SPEED_5; + } + + if (cap & PCIE_LINKCAP_MAX_SPEED_2_5) { + bus_p->bus_max_speed = PCIE_LINK_SPEED_2_5; + bus_p->bus_sup_speed = PCIE_LINK_SPEED_2_5; + } + } +} + +/* * partially init pcie_bus_t for device (dip,bdf) for accessing pci * config space * @@ -1134,6 +1302,10 @@ pcie_init_bus(dev_info_t *dip, pcie_req_id_t bdf, uint8_t flags) } } + /* + * Save and record speed information about the device. + */ + caps_done: /* save RP dip and RP bdf */ if (PCIE_IS_RP(bus_p)) { @@ -1170,7 +1342,7 @@ caps_done: } bus_p->bus_soft_state = PCI_SOFT_STATE_CLOSED; - bus_p->bus_fm_flags = 0; + (void) atomic_swap_uint(&bus_p->bus_fm_flags, 0); bus_p->bus_mps = 0; ndi_set_bus_private(dip, B_TRUE, DEVI_PORT_TYPE_PCI, (void *)bus_p); @@ -1226,6 +1398,8 @@ initial_done: pcie_init_plat(dip); + pcie_capture_speeds(bus_p, bdf, rcdip); + final_done: PCIE_DBG("Add %s(dip 0x%p, bdf 0x%x, secbus 0x%x)\n", @@ -1318,14 +1492,15 @@ pcie_get_rc_dip(dev_info_t *dip) return (rcdip); } -static boolean_t +boolean_t pcie_is_pci_device(dev_info_t *dip) { dev_info_t *pdip; char *device_type; pdip = ddi_get_parent(dip); - ASSERT(pdip); + if (pdip == NULL) + return (B_FALSE); if (ddi_prop_lookup_string(DDI_DEV_T_ANY, pdip, DDI_PROP_DONTPASS, "device_type", &device_type) != DDI_PROP_SUCCESS) diff --git a/usr/src/uts/common/io/pciex/pcie_fault.c b/usr/src/uts/common/io/pciex/pcie_fault.c index f4a2e9190e..61271d674b 100644 --- a/usr/src/uts/common/io/pciex/pcie_fault.c +++ b/usr/src/uts/common/io/pciex/pcie_fault.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. */ #include <sys/sysmacros.h> @@ -164,8 +165,8 @@ int pcie_disable_scan = 0; /* Disable fabric scan */ /* Inform interested parties that error handling is about to begin. */ /* ARGSUSED */ void -pf_eh_enter(pcie_bus_t *bus_p) { -} +pf_eh_enter(pcie_bus_t *bus_p) +{} /* Inform interested parties that error handling has ended. */ void @@ -304,7 +305,8 @@ done: } void -pcie_force_fullscan() { +pcie_force_fullscan() +{ pcie_full_scan = B_TRUE; } @@ -917,10 +919,18 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl) } /* - * Read vendor/device ID and check with cached data, if it doesn't match - * could very well be a device that isn't responding anymore. Just - * stop. Save the basic info in the error q for post mortem debugging - * purposes. + * If this is a device used for PCI passthrough into a virtual machine, + * don't let any error it caused panic the system. 
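+	 * This is done by adding PF_ERR_PANIC to the severity mask for this
+	 * fault data; pf_analyse_error() later clears any masked bits from
+	 * the computed severity.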
+ */ + if (bus_p->bus_fm_flags & PF_FM_IS_PASSTHRU) + pfd_p->pe_severity_mask |= PF_ERR_PANIC; + + /* + * Read vendor/device ID and check with cached data; if it doesn't + * match, it could very well mean that the device is no longer + * responding. In this case, we return PF_SCAN_BAD_RESPONSE; should + * the caller choose to panic in this case, we will have the basic + * info in the error queue for the purposes of postmortem debugging. */ if (PCIE_GET(32, bus_p, PCI_CONF_VENID) != bus_p->bus_dev_ven_id) { char buf[FM_MAX_CLASS]; @@ -931,12 +941,12 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl) DDI_NOSLEEP, FM_VERSION, DATA_TYPE_UINT8, 0, NULL); /* - * For IOV/Hotplug purposes skip gathering info fo this device, + * For IOV/Hotplug purposes skip gathering info for this device, * but populate affected info and severity. Clear out any data * that maybe been saved in the last fabric scan. */ pf_reset_pfd(pfd_p); - pfd_p->pe_severity_flags = PF_ERR_PANIC_BAD_RESPONSE; + pfd_p->pe_severity_flags = PF_ERR_BAD_RESPONSE; PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = PF_AFFECTED_SELF; /* Add the snapshot to the error q */ @@ -948,6 +958,7 @@ pf_default_hdl(dev_info_t *dip, pf_impl_t *impl) pf_pci_regs_gather(pfd_p, bus_p); pf_pci_regs_clear(pfd_p, bus_p); + if (PCIE_IS_RP(bus_p)) pf_pci_find_rp_fault(pfd_p, bus_p); @@ -982,6 +993,22 @@ done: } /* + * Set the passthru flag on a device bus_p. Called by passthru drivers to + * indicate when a device is or is no longer under passthru control. + */ +void +pf_set_passthru(dev_info_t *dip, boolean_t is_passthru) +{ + pcie_bus_t *bus_p = PCIE_DIP2BUS(dip); + + if (is_passthru) { + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_IS_PASSTHRU); + } else { + atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_IS_PASSTHRU); + } +} + +/* * Called during postattach to initialize a device's error handling * capabilities. If the devices has already been hardened, then there isn't * much needed. Otherwise initialize the device's default FMA capabilities. 
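/*
 * Illustrative sketch only: the hypothetical "vmmpt" driver name and its
 * attach/detach entry points below are assumptions, not part of this code.
 * They show the intended pairing of pf_set_passthru() calls around the
 * lifetime of a passthrough assignment.
 */
static int
vmmpt_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	/* Faults raised while the guest owns the device must not panic. */
	pf_set_passthru(dip, B_TRUE);
	return (DDI_SUCCESS);
}

static int
vmmpt_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	/* Restore the normal fault-handling policy. */
	pf_set_passthru(dip, B_FALSE);
	return (DDI_SUCCESS);
}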
@@ -1024,7 +1051,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd) DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE); cap &= (DDI_FM_EREPORT_CAPABLE | DDI_FM_ERRCB_CAPABLE); - bus_p->bus_fm_flags |= PF_FM_IS_NH; + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_IS_NH); if (cmd == DDI_ATTACH) { ddi_fm_init(dip, &cap, &ibc); @@ -1039,7 +1066,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd) /* If ddi_fm_init fails for any reason RETURN */ if (!fmhdl) { - bus_p->bus_fm_flags = 0; + (void) atomic_swap_uint(&bus_p->bus_fm_flags, 0); return; } @@ -1049,7 +1076,7 @@ pf_init(dev_info_t *dip, ddi_iblock_cookie_t ibc, ddi_attach_cmd_t cmd) ddi_fm_handler_register(dip, pf_dummy_cb, NULL); } - bus_p->bus_fm_flags |= PF_FM_READY; + atomic_or_uint(&bus_p->bus_fm_flags, PF_FM_READY); } /* undo FMA lock, called at predetach */ @@ -1066,7 +1093,7 @@ pf_fini(dev_info_t *dip, ddi_detach_cmd_t cmd) return; /* no other code should set the flag to false */ - bus_p->bus_fm_flags &= ~PF_FM_READY; + atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_READY); /* * Grab the mutex to make sure device isn't in the middle of @@ -1080,7 +1107,7 @@ pf_fini(dev_info_t *dip, ddi_detach_cmd_t cmd) /* undo non-hardened drivers */ if (bus_p->bus_fm_flags & PF_FM_IS_NH) { if (cmd == DDI_DETACH) { - bus_p->bus_fm_flags &= ~PF_FM_IS_NH; + atomic_and_uint(&bus_p->bus_fm_flags, ~PF_FM_IS_NH); pci_ereport_teardown(dip); /* * ddi_fini itself calls ddi_handler_unregister, @@ -1377,7 +1404,7 @@ pf_analyse_error(ddi_fm_error_t *derr, pf_impl_t *impl) sts_flags = 0; /* skip analysing error when no error info is gathered */ - if (pfd_p->pe_severity_flags == PF_ERR_PANIC_BAD_RESPONSE) + if (pfd_p->pe_severity_flags == PF_ERR_BAD_RESPONSE) goto done; switch (PCIE_PFD2BUS(pfd_p)->bus_dev_type) { @@ -1455,6 +1482,8 @@ done: /* Have pciev_eh adjust the severity */ pfd_p->pe_severity_flags = pciev_eh(pfd_p, impl); + pfd_p->pe_severity_flags &= ~pfd_p->pe_severity_mask; + error_flags |= pfd_p->pe_severity_flags; } @@ -2165,7 +2194,8 @@ pf_matched_in_rc(pf_data_t *dq_head_p, pf_data_t *pfd_p, */ static void pf_pci_find_trans_type(pf_data_t *pfd_p, uint64_t *addr, uint32_t *trans_type, - pcie_req_id_t *bdf) { + pcie_req_id_t *bdf) +{ pf_data_t *rc_pfd_p; /* Could be DMA or PIO. Find out by look at error type. */ @@ -2216,7 +2246,8 @@ pf_pci_find_trans_type(pf_data_t *pfd_p, uint64_t *addr, uint32_t *trans_type, */ /* ARGSUSED */ int -pf_pci_decode(pf_data_t *pfd_p, uint16_t *cmd) { +pf_pci_decode(pf_data_t *pfd_p, uint16_t *cmd) +{ pcix_attr_t *attr; uint64_t addr; uint32_t trans_type; @@ -2415,7 +2446,8 @@ done: static int pf_hdl_compare(dev_info_t *dip, ddi_fm_error_t *derr, uint32_t flag, - uint64_t addr, pcie_req_id_t bdf, ndi_fmc_t *fcp) { + uint64_t addr, pcie_req_id_t bdf, ndi_fmc_t *fcp) +{ ndi_fmcentry_t *fep; int found = 0; int status; @@ -2490,7 +2522,7 @@ pf_hdl_compare(dev_info_t *dip, ddi_fm_error_t *derr, uint32_t flag, /* ARGSUSED */ static int pf_log_hdl_lookup(dev_info_t *rpdip, ddi_fm_error_t *derr, pf_data_t *pfd_p, - boolean_t is_primary) + boolean_t is_primary) { /* * Disabling this function temporarily until errors can be handled @@ -2537,7 +2569,8 @@ pf_log_hdl_lookup(dev_info_t *rpdip, ddi_fm_error_t *derr, pf_data_t *pfd_p, * accessed via the bus_p. 
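 *
 * Illustrative sketch (the local names and the pcie_ue_tgt_* result
 * fields are assumptions for this example, not defined here): a caller
 * that has captured the AER uncorrectable-error header log can ask for
 * the faulting transaction to be decoded roughly as follows.
 *
 *	pf_pcie_adv_err_regs_t adv;
 *
 *	bcopy(ue_hdr_log, adv.pcie_ue_hdr, sizeof (adv.pcie_ue_hdr));
 *	if (pf_tlp_decode(bus_p, &adv) == DDI_SUCCESS) {
 *		... adv.pcie_ue_tgt_bdf and adv.pcie_ue_tgt_addr now
 *		    identify the target of the bad TLP ...
 *	}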
*/ int -pf_tlp_decode(pcie_bus_t *bus_p, pf_pcie_adv_err_regs_t *adv_reg_p) { +pf_tlp_decode(pcie_bus_t *bus_p, pf_pcie_adv_err_regs_t *adv_reg_p) +{ pcie_tlp_hdr_t *tlp_hdr = (pcie_tlp_hdr_t *)adv_reg_p->pcie_ue_hdr; pcie_req_id_t my_bdf, tlp_bdf, flt_bdf = PCIE_INVALID_BDF; uint64_t flt_addr = 0; @@ -3050,6 +3083,7 @@ pf_reset_pfd(pf_data_t *pfd_p) pcie_bus_t *bus_p = PCIE_PFD2BUS(pfd_p); pfd_p->pe_severity_flags = 0; + pfd_p->pe_severity_mask = 0; pfd_p->pe_orig_severity_flags = 0; /* pe_lock and pe_valid were reset in pf_send_ereport */ diff --git a/usr/src/uts/common/io/pciex/pciev.c b/usr/src/uts/common/io/pciex/pciev.c index b442aae5b5..c8b2cfdda7 100644 --- a/usr/src/uts/common/io/pciex/pciev.c +++ b/usr/src/uts/common/io/pciex/pciev.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2017, Joyent, Inc. + */ + #include <sys/types.h> #include <sys/ddi.h> #include <sys/dditypes.h> @@ -302,7 +306,7 @@ pciev_eh(pf_data_t *pfd_p, pf_impl_t *impl) pcie_faulty_all = B_TRUE; } else if (severity & (PF_ERR_NO_PANIC | PF_ERR_MATCHED_DEVICE | - PF_ERR_PANIC | PF_ERR_PANIC_BAD_RESPONSE)) { + PF_ERR_PANIC | PF_ERR_BAD_RESPONSE)) { uint16_t affected_flag, dev_affected_flags; uint_t is_panic = 0, is_aff_dev_found = 0; diff --git a/usr/src/uts/common/io/physmem.c b/usr/src/uts/common/io/physmem.c index 39d5003b02..c48fecd133 100644 --- a/usr/src/uts/common/io/physmem.c +++ b/usr/src/uts/common/io/physmem.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -807,6 +808,13 @@ physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp) int ret; static int msg_printed = 0; + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EACCES); + if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) { return (EINVAL); } diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf index 42248e93d6..08affec609 100644 --- a/usr/src/uts/common/io/pseudo.conf +++ b/usr/src/uts/common/io/pseudo.conf @@ -22,8 +22,7 @@ # # Copyright 2003 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# -# ident "%Z%%M% %I% %E% SMI" +# Copyright 2014 Joyent, Inc. All rights reserved. # # This file is private to the pseudonex driver. It should not be edited. # @@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0; # /pseudo; it has as its children the zone console pseudo nodes. # name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons"; + +# +# zfdnex is an alias for pseudo; this node is instantiated as a child of +# /pseudo; it has as its children the zone fd pseudo nodes. 
+# +name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd"; diff --git a/usr/src/uts/common/io/pseudonex.c b/usr/src/uts/common/io/pseudonex.c index f83b0abf39..0ae06f88cc 100644 --- a/usr/src/uts/common/io/pseudonex.c +++ b/usr/src/uts/common/io/pseudonex.c @@ -83,6 +83,8 @@ static int pseudonex_detach(dev_info_t *, ddi_detach_cmd_t); static int pseudonex_open(dev_t *, int, int, cred_t *); static int pseudonex_close(dev_t, int, int, cred_t *); static int pseudonex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static int pseudonex_fm_init(dev_info_t *, dev_info_t *, int, + ddi_iblock_cookie_t *); static int pseudonex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *, void *); @@ -90,6 +92,8 @@ static void *pseudonex_state; typedef struct pseudonex_state { dev_info_t *pnx_devi; + int pnx_fmcap; + ddi_iblock_cookie_t pnx_fm_ibc; } pseudonex_state_t; static struct bus_ops pseudonex_bus_ops = { @@ -116,7 +120,7 @@ static struct bus_ops pseudonex_bus_ops = { NULL, /* bus_intr_ctl */ NULL, /* bus_config */ NULL, /* bus_unconfig */ - NULL, /* bus_fm_init */ + pseudonex_fm_init, /* bus_fm_init */ NULL, /* bus_fm_fini */ NULL, /* bus_fm_access_enter */ NULL, /* bus_fm_access_exit */ @@ -228,6 +232,9 @@ pseudonex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) pnx_state = ddi_get_soft_state(pseudonex_state, instance); pnx_state->pnx_devi = devi; + pnx_state->pnx_fmcap = DDI_FM_EREPORT_CAPABLE; + ddi_fm_init(devi, &pnx_state->pnx_fmcap, &pnx_state->pnx_fm_ibc); + if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance, DDI_NT_NEXUS, 0) != DDI_SUCCESS) { ddi_remove_minor_node(devi, NULL); @@ -247,6 +254,10 @@ pseudonex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) if (cmd == DDI_SUSPEND) return (DDI_SUCCESS); + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ddi_fm_fini(devi); ddi_remove_minor_node(devi, NULL); ddi_soft_state_free(pseudonex_state, instance); return (DDI_SUCCESS); @@ -375,6 +386,19 @@ pseudonex_auto_assign(dev_info_t *child) } static int +pseudonex_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap, + ddi_iblock_cookie_t *ibc) +{ + pseudonex_state_t *pnx_state; + + pnx_state = ddi_get_soft_state(pseudonex_state, ddi_get_instance(dip)); + ASSERT(pnx_state != NULL); + ASSERT(ibc != NULL); + *ibc = pnx_state->pnx_fm_ibc; + return (pnx_state->pnx_fmcap & cap); +} + +static int pseudonex_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop, void *arg, void *result) { diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c index d9ec6cc585..dc6c4e64ad 100644 --- a/usr/src/uts/common/io/ptm.c +++ b/usr/src/uts/common/io/ptm.c @@ -447,6 +447,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp) return (0); } +static boolean_t +ptmptsopencb(ptmptsopencb_arg_t arg) +{ + struct pt_ttys *ptmp = (struct pt_ttys *)arg; + boolean_t rval; + + PT_ENTER_READ(ptmp); + rval = (ptmp->pt_nullmsg != NULL); + PT_EXIT_READ(ptmp); + return (rval); +} + /* * The wput procedure will only handle ioctl and flush messages. 
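 *
 * As an illustrative sketch (the surrounding STREAMS plumbing is elided
 * and assumed): a kernel caller that has issued the PTMPTSOPENCB ioctl
 * handled below receives a ptmptsopencb_t in the reply and can later ask
 * whether the subsidiary (pts) side has been opened, as tracked here via
 * pt_nullmsg, without holding any ptm state of its own:
 *
 *	ptmptsopencb_t ppocb;
 *
 *	... copy the M_IOCACK payload into ppocb ...
 *	if (ppocb.ppocb_func(ppocb.ppocb_arg)) {
 *		... the pts side has been opened ...
 *	}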
*/ @@ -574,6 +586,41 @@ ptmwput(queue_t *qp, mblk_t *mp) miocack(qp, mp, 0, 0); break; } + case PTMPTSOPENCB: + { + mblk_t *dp; /* ioctl reply data */ + ptmptsopencb_t *ppocb; + + /* only allow the kernel to invoke this ioctl */ + if (iocp->ioc_cr != kcred) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* we don't support transparent ioctls */ + ASSERT(iocp->ioc_count != TRANSPARENT); + if (iocp->ioc_count == TRANSPARENT) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* allocate a response message */ + dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED); + if (dp == NULL) { + miocnak(qp, mp, 0, EAGAIN); + break; + } + + /* initialize the ioctl results */ + ppocb = (ptmptsopencb_t *)dp->b_rptr; + ppocb->ppocb_func = ptmptsopencb; + ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp; + + /* send the reply data */ + mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0); + qreply(qp, mp); + break; + } } break; diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg Binary files differnew file mode 100644 index 0000000000..b932ffaa7c --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg Binary files differnew file mode 100644 index 0000000000..9421ecc0db --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png Binary files differnew file mode 100644 index 0000000000..4b8a66761a --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png Binary files differnew file mode 100644 index 0000000000..3254fbdc3b --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg Binary files differnew file mode 100644 index 0000000000..7bb0dbf21b --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin Binary files differnew file mode 100644 index 0000000000..43014fd8ea --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin Binary files differnew file mode 100644 index 0000000000..9524eb4a63 --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin diff --git a/usr/src/uts/common/io/qede/qede_fp.c b/usr/src/uts/common/io/qede/qede_fp.c index 5b6177963e..fe7cb7a6cd 100644 --- a/usr/src/uts/common/io/qede/qede_fp.c +++ 
b/usr/src/uts/common/io/qede/qede_fp.c @@ -1848,8 +1848,6 @@ qede_ring_tx(void *arg, mblk_t *mp) } if (!qede->params.link_state) { - qede_print_err("!%s(%d): Link !up for xmit", - __func__, qede->instance); goto exit; } diff --git a/usr/src/uts/common/io/qede/qede_list.h b/usr/src/uts/common/io/qede/qede_list.h index 2350cb4117..656d2a915f 100644 --- a/usr/src/uts/common/io/qede/qede_list.h +++ b/usr/src/uts/common/io/qede/qede_list.h @@ -176,4 +176,3 @@ qede_list_splice_tail(qede_list_t *list, #define QEDE_LIST_FOR_EACH_ENTRY_SAFE OSAL_LIST_FOR_EACH_ENTRY_SAFE #endif /* !_QEDE_LIST_H */ - diff --git a/usr/src/uts/common/io/qede/qede_version.h b/usr/src/uts/common/io/qede/qede_version.h index ebf2624268..54678600c9 100644 --- a/usr/src/uts/common/io/qede/qede_version.h +++ b/usr/src/uts/common/io/qede/qede_version.h @@ -42,4 +42,3 @@ #define REVVERSION 23 #endif /* !_QEDE_VERSION_H */ - diff --git a/usr/src/uts/common/io/random.c b/usr/src/uts/common/io/random.c index d79b86362c..a50bbcceec 100644 --- a/usr/src/uts/common/io/random.c +++ b/usr/src/uts/common/io/random.c @@ -291,6 +291,9 @@ rnd_write(dev_t dev, struct uio *uiop, cred_t *credp) if ((error = uiomove(buf, bytes, UIO_WRITE, uiop)) != 0) return (error); + if (crgetzone(credp) != global_zone) + continue; + switch (devno) { case DEVRANDOM: if ((error = random_add_entropy(buf, bytes, 0)) != 0) diff --git a/usr/src/uts/common/io/rsm/rsm.c b/usr/src/uts/common/io/rsm/rsm.c index 4ec7d34b92..4c5489b039 100644 --- a/usr/src/uts/common/io/rsm/rsm.c +++ b/usr/src/uts/common/io/rsm/rsm.c @@ -22,8 +22,8 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2012 Milan Jurik. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. + * Copyright (c) 2016 by Delphix. All rights reserved. 
*/ diff --git a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c index 227fc76b6f..258e1d742c 100644 --- a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c +++ b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c @@ -10559,8 +10559,7 @@ ahci_em_led_task(void *arg) mutex_enter(&led->aelta_ctl->ahcictl_mutex); if (ret) { - led->aelta_ctl->ahcictl_em_state[led->aelta_port] = - led->aelta_state; + led->aelta_ctl->ahcictl_em_state[led->aelta_port] = state; led->aelta_ret = 0; } else { led->aelta_ret = EIO; @@ -10770,6 +10769,7 @@ ahci_em_ioctl_set(ahci_ctl_t *ahci_ctlp, intptr_t arg) } task->aelta_ctl = ahci_ctlp; + task->aelta_port = set.aiems_port; task->aelta_port = (uint8_t)set.aiems_port; task->aelta_op = set.aiems_op; task->aelta_state = set.aiems_leds; diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c index 25c91f3560..dc9547a275 100644 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c +++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c @@ -72,6 +72,7 @@ #include <sys/file.h> #include <sys/policy.h> #include <sys/model.h> +#include <sys/refhash.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/dr.h> @@ -99,7 +100,6 @@ #include <sys/scsi/adapters/mpt_sas/mptsas_var.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include <sys/scsi/adapters/mpt_sas/mptsas_smhba.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> #include <sys/raidioctl.h> #include <sys/fs/dv_node.h> /* devfs_clean */ diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c deleted file mode 100644 index 8f96c2d9f1..0000000000 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Joyent, Inc. All rights reserved. 
- */ - -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> -#include <sys/sysmacros.h> -#include <sys/types.h> -#include <sys/kmem.h> -#include <sys/list.h> -#include <sys/ddi.h> - -#ifdef lint -extern refhash_link_t *obj_to_link(refhash_t *, void *); -extern void *link_to_obj(refhash_t *, refhash_link_t *); -extern void *obj_to_tag(refhash_t *, void *); -#else -#define obj_to_link(_h, _o) \ - ((refhash_link_t *)(((char *)(_o)) + (_h)->rh_link_off)) -#define link_to_obj(_h, _l) \ - ((void *)(((char *)(_l)) - (_h)->rh_link_off)) -#define obj_to_tag(_h, _o) \ - ((void *)(((char *)(_o)) + (_h)->rh_tag_off)) -#endif - -refhash_t * -refhash_create(uint_t bucket_count, refhash_hash_f hash, - refhash_cmp_f cmp, refhash_dtor_f dtor, size_t obj_size, size_t link_off, - size_t tag_off, int km_flags) -{ - refhash_t *hp; - uint_t i; - - hp = kmem_alloc(sizeof (refhash_t), km_flags); - if (hp == NULL) - return (NULL); - hp->rh_buckets = kmem_zalloc(bucket_count * sizeof (list_t), km_flags); - if (hp->rh_buckets == NULL) { - kmem_free(hp, sizeof (refhash_t)); - return (NULL); - } - hp->rh_bucket_count = bucket_count; - - for (i = 0; i < bucket_count; i++) { - list_create(&hp->rh_buckets[i], sizeof (refhash_link_t), - offsetof(refhash_link_t, rhl_chain_link)); - } - list_create(&hp->rh_objs, sizeof (refhash_link_t), - offsetof(refhash_link_t, rhl_global_link)); - - hp->rh_obj_size = obj_size; - hp->rh_link_off = link_off; - hp->rh_tag_off = tag_off; - hp->rh_hash = hash; - hp->rh_cmp = cmp; - hp->rh_dtor = dtor; - - return (hp); -} - -void -refhash_destroy(refhash_t *hp) -{ - ASSERT(list_is_empty(&hp->rh_objs)); - - kmem_free(hp->rh_buckets, hp->rh_bucket_count * sizeof (list_t)); - kmem_free(hp, sizeof (refhash_t)); -} - -void -refhash_insert(refhash_t *hp, void *op) -{ - uint_t bucket; - refhash_link_t *lp = obj_to_link(hp, op); - - bucket = hp->rh_hash(obj_to_tag(hp, op)) % hp->rh_bucket_count; - list_link_init(&lp->rhl_chain_link); - list_link_init(&lp->rhl_global_link); - lp->rhl_flags = 0; - lp->rhl_refcnt = 0; - list_insert_tail(&hp->rh_buckets[bucket], lp); - list_insert_tail(&hp->rh_objs, lp); -} - -static void -refhash_delete(refhash_t *hp, void *op) -{ - refhash_link_t *lp = obj_to_link(hp, op); - uint_t bucket; - - bucket = hp->rh_hash(obj_to_tag(hp, op)) % hp->rh_bucket_count; - list_remove(&hp->rh_buckets[bucket], lp); - list_remove(&hp->rh_objs, lp); - hp->rh_dtor(op); -} - -void -refhash_remove(refhash_t *hp, void *op) -{ - refhash_link_t *lp = obj_to_link(hp, op); - - if (lp->rhl_refcnt > 0) { - lp->rhl_flags |= RHL_F_DEAD; - } else { - refhash_delete(hp, op); - } -} - -void * -refhash_lookup(refhash_t *hp, const void *tp) -{ - uint_t bucket; - refhash_link_t *lp; - void *op; - - bucket = hp->rh_hash(tp) % hp->rh_bucket_count; - for (lp = list_head(&hp->rh_buckets[bucket]); lp != NULL; - lp = list_next(&hp->rh_buckets[bucket], lp)) { - op = link_to_obj(hp, lp); - if (hp->rh_cmp(obj_to_tag(hp, op), tp) == 0 && - !(lp->rhl_flags & RHL_F_DEAD)) { - return (op); - } - } - - return (NULL); -} - -void * -refhash_linear_search(refhash_t *hp, refhash_eval_f eval, void *arg) -{ - void *op; - refhash_link_t *lp; - - for (lp = list_head(&hp->rh_objs); lp != NULL; - lp = list_next(&hp->rh_objs, lp)) { - op = link_to_obj(hp, lp); - if (eval(op, arg) == 0) - return (op); - } - - return (NULL); -} - -void -refhash_hold(refhash_t *hp, void *op) -{ - refhash_link_t *lp = obj_to_link(hp, op); - - ++lp->rhl_refcnt; -} - -void -refhash_rele(refhash_t *hp, void *op) -{ - refhash_link_t *lp = obj_to_link(hp, 
op); - - ASSERT(lp->rhl_refcnt > 0); - - if (--lp->rhl_refcnt == 0 && (lp->rhl_flags & RHL_F_DEAD)) - refhash_remove(hp, op); -} - -void * -refhash_first(refhash_t *hp) -{ - refhash_link_t *lp; - - lp = list_head(&hp->rh_objs); - if (lp == NULL) - return (NULL); - - ++lp->rhl_refcnt; - - return (link_to_obj(hp, lp)); -} - -void * -refhash_next(refhash_t *hp, void *op) -{ - refhash_link_t *lp; - - lp = obj_to_link(hp, op); - while ((lp = list_next(&hp->rh_objs, lp)) != NULL) { - if (!(lp->rhl_flags & RHL_F_DEAD)) - break; - } - - refhash_rele(hp, op); - if (lp == NULL) - return (NULL); - - ++lp->rhl_refcnt; - - return (link_to_obj(hp, lp)); -} - -boolean_t -refhash_obj_valid(refhash_t *hp, const void *op) -{ - /* LINTED - E_ARG_INCOMPATIBLE_WITH_ARG_L */ - const refhash_link_t *lp = obj_to_link(hp, op); - - return ((lp->rhl_flags & RHL_F_DEAD) != 0); -} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c new file mode 100644 index 0000000000..a7ef2b69d7 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.c @@ -0,0 +1,565 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static int smrt_attach(dev_info_t *, ddi_attach_cmd_t); +static int smrt_detach(dev_info_t *, ddi_detach_cmd_t); +static int smrt_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static void smrt_cleanup(smrt_t *); +static int smrt_command_comparator(const void *, const void *); + +/* + * Controller soft state. Each entry is an object of type "smrt_t". + */ +void *smrt_state; + +/* + * DMA attributes template. Each controller will make a copy of this template + * with appropriate customisations; e.g., the Scatter/Gather List Length. + */ +static ddi_dma_attr_t smrt_dma_attr_template = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x0000000000000000, + .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, + .dma_attr_count_max = 0x00FFFFFF, + .dma_attr_align = 0x20, + .dma_attr_burstsizes = 0x20, + .dma_attr_minxfer = DMA_UNIT_8, + .dma_attr_maxxfer = 0xFFFFFFFF, + /* + * There is some suggestion that at least some, possibly older, Smart + * Array controllers cannot tolerate a DMA segment that straddles a 4GB + * boundary. + */ + .dma_attr_seg = 0xFFFFFFFF, + .dma_attr_sgllen = 1, + .dma_attr_granular = 512, + .dma_attr_flags = 0 +}; + +/* + * Device memory access attributes for device control registers. 
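 *
 * Illustrative sketch (the register number and offsets are assumptions;
 * the real mapping work happens in smrt_device_setup()): access
 * attributes like these are what get handed to ddi_regs_map_setup(9F)
 * when mapping the controller's register space, e.g.
 *
 *	caddr_t regs;
 *	ddi_acc_handle_t hdl;
 *
 *	if (ddi_regs_map_setup(dip, 1, &regs, 0, 0,
 *	    &smrt_dev_attributes, &hdl) != DDI_SUCCESS) {
 *		return (DDI_FAILURE);
 *	}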
+ */ +ddi_device_acc_attr_t smrt_dev_attributes = { + .devacc_attr_version = DDI_DEVICE_ATTR_V0, + .devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC, + .devacc_attr_dataorder = DDI_STRICTORDER_ACC, + .devacc_attr_access = 0 +}; + +/* + * Character/Block Operations Structure + */ +static struct cb_ops smrt_cb_ops = { + .cb_rev = CB_REV, + .cb_flag = D_NEW | D_MP, + + .cb_open = scsi_hba_open, + .cb_close = scsi_hba_close, + + .cb_ioctl = smrt_ioctl, + + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_str = NULL, + .cb_aread = nodev, + .cb_awrite = nodev +}; + +/* + * Device Operations Structure + */ +static struct dev_ops smrt_dev_ops = { + .devo_rev = DEVO_REV, + .devo_refcnt = 0, + + .devo_attach = smrt_attach, + .devo_detach = smrt_detach, + + .devo_cb_ops = &smrt_cb_ops, + + .devo_getinfo = nodev, + .devo_identify = nulldev, + .devo_probe = nulldev, + .devo_reset = nodev, + .devo_bus_ops = NULL, + .devo_power = nodev, + .devo_quiesce = nodev +}; + +/* + * Linkage structures + */ +static struct modldrv smrt_modldrv = { + .drv_modops = &mod_driverops, + .drv_linkinfo = "HP Smart Array", + .drv_dev_ops = &smrt_dev_ops +}; + +static struct modlinkage smrt_modlinkage = { + .ml_rev = MODREV_1, + .ml_linkage = { &smrt_modldrv, NULL } +}; + + +int +_init() +{ + int r; + + VERIFY0(ddi_soft_state_init(&smrt_state, sizeof (smrt_t), 0)); + + if ((r = scsi_hba_init(&smrt_modlinkage)) != 0) { + goto fail; + } + + if ((r = mod_install(&smrt_modlinkage)) != 0) { + scsi_hba_fini(&smrt_modlinkage); + goto fail; + } + + return (r); + +fail: + ddi_soft_state_fini(&smrt_state); + return (r); +} + +int +_fini() +{ + int r; + + if ((r = mod_remove(&smrt_modlinkage)) == 0) { + scsi_hba_fini(&smrt_modlinkage); + ddi_soft_state_fini(&smrt_state); + } + + return (r); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&smrt_modlinkage, modinfop)); +} + +static int +smrt_iport_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + const char *addr; + dev_info_t *pdip; + int instance; + smrt_t *smrt; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Note, we cannot get to our parent via the tran's tran_hba_private + * member. This pointer is reset to NULL when the scsi_hba_tran_t + * structure is duplicated. 
+ */ + addr = scsi_hba_iport_unit_address(dip); + VERIFY(addr != NULL); + pdip = ddi_get_parent(dip); + instance = ddi_get_instance(pdip); + smrt = ddi_get_soft_state(smrt_state, instance); + VERIFY(smrt != NULL); + + if (strcmp(addr, SMRT_IPORT_VIRT) == 0) { + if (smrt_logvol_hba_setup(smrt, dip) != DDI_SUCCESS) + return (DDI_FAILURE); + smrt->smrt_virt_iport = dip; + } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) { + if (smrt_phys_hba_setup(smrt, dip) != DDI_SUCCESS) + return (DDI_FAILURE); + smrt->smrt_phys_iport = dip; + } else { + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +smrt_iport_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + const char *addr; + scsi_hba_tran_t *tran; + smrt_t *smrt; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + tran = ddi_get_driver_private(dip); + VERIFY(tran != NULL); + smrt = tran->tran_hba_private; + VERIFY(smrt != NULL); + + addr = scsi_hba_iport_unit_address(dip); + VERIFY(addr != NULL); + + if (strcmp(addr, SMRT_IPORT_VIRT) == 0) { + smrt_logvol_hba_teardown(smrt, dip); + smrt->smrt_virt_iport = NULL; + } else if (strcmp(addr, SMRT_IPORT_PHYS) == 0) { + smrt_phys_hba_teardown(smrt, dip); + smrt->smrt_phys_iport = NULL; + } else { + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static int +smrt_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + uint32_t instance; + smrt_t *smrt; + boolean_t check_for_interrupts = B_FALSE; + int r; + char taskq_name[64]; + + if (scsi_hba_iport_unit_address(dip) != NULL) + return (smrt_iport_attach(dip, cmd)); + + if (cmd != DDI_ATTACH) { + return (DDI_FAILURE); + } + + /* + * Allocate the per-controller soft state object and get + * a pointer to it. + */ + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(smrt_state, instance) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not allocate soft state"); + return (DDI_FAILURE); + } + if ((smrt = ddi_get_soft_state(smrt_state, instance)) == NULL) { + dev_err(dip, CE_WARN, "could not get soft state"); + ddi_soft_state_free(smrt_state, instance); + return (DDI_FAILURE); + } + + /* + * Initialise per-controller state object. + */ + smrt->smrt_dip = dip; + smrt->smrt_instance = instance; + smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER; + list_create(&smrt->smrt_commands, sizeof (smrt_command_t), + offsetof(smrt_command_t, smcm_link)); + list_create(&smrt->smrt_finishq, sizeof (smrt_command_t), + offsetof(smrt_command_t, smcm_link_finish)); + list_create(&smrt->smrt_abortq, sizeof (smrt_command_t), + offsetof(smrt_command_t, smcm_link_abort)); + list_create(&smrt->smrt_volumes, sizeof (smrt_volume_t), + offsetof(smrt_volume_t, smlv_link)); + list_create(&smrt->smrt_physicals, sizeof (smrt_physical_t), + offsetof(smrt_physical_t, smpt_link)); + list_create(&smrt->smrt_targets, sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_ctlr)); + avl_create(&smrt->smrt_inflight, smrt_command_comparator, + sizeof (smrt_command_t), offsetof(smrt_command_t, + smcm_node)); + cv_init(&smrt->smrt_cv_finishq, NULL, CV_DRIVER, NULL); + + smrt->smrt_init_level |= SMRT_INITLEVEL_BASIC; + + /* + * Perform basic device setup, including identifying the board, mapping + * the I2O registers and the Configuration Table. + */ + if (smrt_device_setup(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "device setup failed"); + goto fail; + } + + /* + * Select a Transport Method (e.g. Simple or Performant) and update + * the Configuration Table. This function also waits for the + * controller to become ready. 
+ */ + if (smrt_ctlr_init(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "controller initialisation failed"); + goto fail; + } + + /* + * Each controller may have a different Scatter/Gather Element count. + * Configure a per-controller set of DMA attributes with the + * appropriate S/G size. + */ + VERIFY(smrt->smrt_sg_cnt > 0); + smrt->smrt_dma_attr = smrt_dma_attr_template; + smrt->smrt_dma_attr.dma_attr_sgllen = smrt->smrt_sg_cnt; + + /* + * Now that we have selected a Transport Method, we can configure + * the appropriate interrupt handlers. + */ + if (smrt_interrupts_setup(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "interrupt handler setup failed"); + goto fail; + } + + /* + * Now that we have the correct interrupt priority, we can initialise + * the mutex. This must be done before the interrupt handler is + * enabled. + */ + mutex_init(&smrt->smrt_mutex, NULL, MUTEX_DRIVER, + DDI_INTR_PRI(smrt->smrt_interrupt_pri)); + smrt->smrt_init_level |= SMRT_INITLEVEL_MUTEX; + + /* + * From this point forward, the controller is able to accept commands + * and (at least by polling) return command submissions. Setting this + * flag allows the rest of the driver to interact with the device. + */ + smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING; + + if (smrt_interrupts_enable(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "interrupt handler could not be enabled"); + goto fail; + } + + if (smrt_ctrl_hba_setup(smrt) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "SCSI framework setup failed"); + goto fail; + } + + /* + * Set the appropriate Interrupt Mask Register bits to start + * command completion interrupts from the controller. + */ + smrt_intr_set(smrt, B_TRUE); + check_for_interrupts = B_TRUE; + + /* + * Register the maintenance routine for periodic execution: + */ + smrt->smrt_periodic = ddi_periodic_add(smrt_periodic, smrt, + SMRT_PERIODIC_RATE * NANOSEC, DDI_IPL_0); + smrt->smrt_init_level |= SMRT_INITLEVEL_PERIODIC; + + (void) snprintf(taskq_name, sizeof (taskq_name), "smrt_discover_%u", + instance); + smrt->smrt_discover_taskq = ddi_taskq_create(smrt->smrt_dip, taskq_name, + 1, TASKQ_DEFAULTPRI, 0); + if (smrt->smrt_discover_taskq == NULL) { + dev_err(dip, CE_WARN, "failed to create discovery task queue"); + goto fail; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_TASKQ; + + if ((r = smrt_event_init(smrt)) != 0) { + dev_err(dip, CE_WARN, "could not initialize event subsystem " + "(%d)", r); + goto fail; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_ASYNC_EVENT; + + if (scsi_hba_iport_register(dip, SMRT_IPORT_VIRT) != DDI_SUCCESS) + goto fail; + + if (scsi_hba_iport_register(dip, SMRT_IPORT_PHYS) != DDI_SUCCESS) + goto fail; + + /* + * Announce the attachment of this controller. + */ + ddi_report_dev(dip); + + return (DDI_SUCCESS); + +fail: + if (check_for_interrupts) { + if (smrt->smrt_stats.smrts_claimed_interrupts == 0) { + dev_err(dip, CE_WARN, "controller did not interrupt " + "during attach"); + } + } + smrt_cleanup(smrt); + return (DDI_FAILURE); +} + +static int +smrt_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + scsi_hba_tran_t *tran = (scsi_hba_tran_t *)ddi_get_driver_private(dip); + smrt_t *smrt = (smrt_t *)tran->tran_hba_private; + + if (scsi_hba_iport_unit_address(dip) != NULL) + return (smrt_iport_detach(dip, cmd)); + + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + /* + * First, check to make sure that all SCSI framework targets have + * detached. 
+ */ + mutex_enter(&smrt->smrt_mutex); + if (!list_is_empty(&smrt->smrt_targets)) { + mutex_exit(&smrt->smrt_mutex); + dev_err(smrt->smrt_dip, CE_WARN, "cannot detach; targets still " + "using HBA"); + return (DDI_FAILURE); + } + + if (smrt->smrt_virt_iport != NULL || smrt->smrt_phys_iport != NULL) { + mutex_exit(&smrt->smrt_mutex); + dev_err(smrt->smrt_dip, CE_WARN, "cannot detach: iports still " + "attached"); + return (DDI_FAILURE); + } + + /* + * Prevent new targets from attaching now: + */ + smrt->smrt_status |= SMRT_CTLR_STATUS_DETACHING; + mutex_exit(&smrt->smrt_mutex); + + /* + * Clean up all remaining resources. + */ + smrt_cleanup(smrt); + + return (DDI_SUCCESS); +} + +static int +smrt_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rval) +{ + int inst = MINOR2INST(getminor(dev)); + int status; + + if (secpolicy_sys_config(credp, B_FALSE) != 0) { + return (EPERM); + } + + /* + * Ensure that we have a soft state object for this instance. + */ + if (ddi_get_soft_state(smrt_state, inst) == NULL) { + return (ENXIO); + } + + switch (cmd) { + default: + status = scsi_hba_ioctl(dev, cmd, arg, mode, credp, rval); + break; + } + + return (status); +} + +static void +smrt_cleanup(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_ASYNC_EVENT) { + smrt_event_fini(smrt); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_ASYNC_EVENT; + } + + smrt_interrupts_teardown(smrt); + + if (smrt->smrt_init_level & SMRT_INITLEVEL_TASKQ) { + ddi_taskq_destroy(smrt->smrt_discover_taskq); + smrt->smrt_discover_taskq = NULL; + smrt->smrt_init_level &= ~SMRT_INITLEVEL_TASKQ; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_PERIODIC) { + ddi_periodic_delete(smrt->smrt_periodic); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_PERIODIC; + } + + smrt_ctrl_hba_teardown(smrt); + + smrt_ctlr_teardown(smrt); + + smrt_device_teardown(smrt); + + if (smrt->smrt_init_level & SMRT_INITLEVEL_BASIC) { + smrt_logvol_teardown(smrt); + smrt_phys_teardown(smrt); + + cv_destroy(&smrt->smrt_cv_finishq); + + VERIFY(list_is_empty(&smrt->smrt_commands)); + list_destroy(&smrt->smrt_commands); + list_destroy(&smrt->smrt_finishq); + list_destroy(&smrt->smrt_abortq); + + VERIFY(list_is_empty(&smrt->smrt_volumes)); + list_destroy(&smrt->smrt_volumes); + + VERIFY(list_is_empty(&smrt->smrt_physicals)); + list_destroy(&smrt->smrt_physicals); + + VERIFY(list_is_empty(&smrt->smrt_targets)); + list_destroy(&smrt->smrt_targets); + + VERIFY(avl_is_empty(&smrt->smrt_inflight)); + avl_destroy(&smrt->smrt_inflight); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_BASIC; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_MUTEX) { + mutex_destroy(&smrt->smrt_mutex); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_MUTEX; + } + + VERIFY0(smrt->smrt_init_level); + + ddi_soft_state_free(smrt_state, ddi_get_instance(smrt->smrt_dip)); +} + +/* + * Comparator for the "smrt_inflight" AVL tree in a "smrt_t". This AVL tree + * allows a tag ID to be mapped back to the relevant "smrt_command_t". 
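 *
 * Illustrative sketch only (smrt_lookup_inflight() is referenced but not
 * shown in this excerpt, so its shape here is assumed from the
 * comparator): a tag returned by the controller can be resolved with a
 * stack key that carries only the tag value, e.g.
 *
 *	smrt_command_t *
 *	smrt_lookup_inflight(smrt_t *smrt, uint32_t tag)
 *	{
 *		smrt_command_t srch;
 *
 *		VERIFY(MUTEX_HELD(&smrt->smrt_mutex));
 *		srch.smcm_tag = tag;
 *		return (avl_find(&smrt->smrt_inflight, &srch, NULL));
 *	}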
+ */ +static int +smrt_command_comparator(const void *lp, const void *rp) +{ + const smrt_command_t *l = lp; + const smrt_command_t *r = rp; + + if (l->smcm_tag > r->smcm_tag) { + return (1); + } else if (l->smcm_tag < r->smcm_tag) { + return (-1); + } else { + return (0); + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf new file mode 100644 index 0000000000..758ecd0779 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +scsi-no-quiesce=1; diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c new file mode 100644 index 0000000000..b4cdd5607e --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss.c @@ -0,0 +1,2023 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * Discovery, Resets, Periodics, and Events + * ---------------------------------------- + * + * Discovery is the act of figuring out what logical and physical volumes exist + * under the controller. Discovery happens in response to the following events: + * + * o iports for virtual and physical devices being attached + * o Controller event notifications indicating potential topology changes + * o After a reset of the controller, before we can perform I/O again + * + * Because we have to perform discovery after a reset, which can happen during + * panic(), that also means that discovery may be run in panic context. We + * also need to emphasize the need for discovery to happen after a controller + * reset. Once a reset is initiated, we cannot be certain about the addresses + * of any of the existing targets until the reset has completed. The driver + * performs I/Os to addresses that the controller provides. The controller + * specification says that these addresses may change after a controller reset. + * + * Unfortunately, all of this combined means that making sure we can correctly + * run discovery is somewhat complicated. In non-panic contexts, discovery is + * always run from a taskq. We'll kick off the discovery in the taskq if + * nothing is pending at that time. The state is managed by bits in the + * smrt_status member of the smrt_t. There are four bits at this time: + * + * SMRT_CTLR_DISCOVERY_REQUESTED This flag indicates that something has + * requested that a discovery be performed. + * If no flags are set when this is set, + * then we will kick off discovery. All + * discovery requests are initiated via the + * smrt_discover_request() function. 
+ * + * SMRT_CTLR_DISCOVERY_RUNNING This flag is set at the start of us + * running a discovery. It is removed when + * discovery finishes. + * + * SMRT_CTLR_DISCOVERY_PERIODIC This flag is set in a number of + * circumstances, which will be described + * in a subsequent section. This indicates + * that the periodic must kick off the + * discovery process. + * + * SMRT_CTLR_DISCOVERY_REQUIRED This flag indicates that at some point a + * controller reset occurred and we need to + * have a successful discovery to finish + * the act of resetting and allowing I/O to + * continue. + * + * In general, a request to discover kicks off the taskq to discover entries, if + * it hasn't already been requested or started. This also allows us to coalesce + * multiple requests, if needed. Note that if a request comes in when a + * discovery is ongoing, we do not kick off discovery again. Instead, we set + * the SMRT_CTLR_DISCOVERY_REQUESTED flag which will rerun discovery after the + * initial pass has completed. + * + * When a discovery starts, the first thing it does is clear the + * SMRT_CTLR_DISCOVERY_REQUESTED flag. This is important, because any + * additional requests for discovery that come in after this has started likely + * indicate that we've missed something. As such, when the discovery process + * finishes, if it sees the REQUESTED flag, then it will need to set the + * PERIODIC flag. The PERIODIC flag is used to indicate that we should run + * discovery again, but not kick if off immediately. Instead, it should be + * driven by the normal periodic behavior. + * + * If for some reason the act of discovery fails, or we fail to dispatch + * discovery due to a transient error, then we will flag PERIODIC so that the + * periodic tick will try and run things again. + * + * Now, we need to talk about SMRT_CTLR_DISCOVERY_REQUIRED. This flag is set + * after a reset occurs. The reset thread will be blocked on this. + * Importantly, none of the code in the discovery path can ask for a controller + * reset at this time. If at the end of a discovery, this flag is set, then we + * will signal the reset thread that it should check on its status by + * broadcasting on the smrt_cv_finishq. At that point, the reset thread will + * continue. + * + * Panic Context + * ------------- + * + * All of this talk of threads and taskqs is well and good, but as an HBA + * driver, we have a serious responsibility to try and deal with panic sanely. + * In panic context, we will directly call the discovery functions and not poll + * for them to occur. + * + * However, because our discovery relies on the target maps, which aren't safe + * for panic context at this time, we have to take a different approach. We + * leverage the fact that we have a generation number stored with every + * discovery. If we try to do an I/O to a device where the generation doesn't + * match, then we know that it disappeared and should not be used. We also + * sanity check the model, serial numbers, and WWNs to make sure that these are + * the same devices. If they are, then we'll end up updating the address + * structures. + * + * Now, it is possible that when we were panicking, we had a thread that was in + * the process of running a discovery or even resetting the system. Once we're + * in panic, those threads aren't running, so if they didn't end up producing a + * new view of the world that the SCSI framework is using, then it shouldn't + * really matter, as we won't have updated the list of devices. 
Importantly, + * once we're in that context, we're not going to be attaching or detaching + * targets. If we get a request for one of these targets which has disappeared, + * we're going to have to end up giving up. + * + * Request Attributes + * ------------------ + * + * The CISS specification allows for three different kinds of attributes that + * describe how requests are queued to the controller. These are: + * + * HEAD OF QUEUE The request should go to the head of the + * controller queue. This is used for resets and + * aborts to ensure that they're not blocked behind + * additional I/O. + * + * SIMPLE This queues the request for normal processing. + * Commands queued this way are not special with + * respect to one another. We use this for all I/O + * and discovery commands. + * + * ORDERED This attribute is used to indicate that commands + * should be submitted and processed in some order. + * This is used primarily for the event + * notification bits so we can ensure that at the + * return of a cancellation of the event + * notification, that any outstanding request has + * been honored. + */ + +static int smrt_ctlr_versions(smrt_t *, uint16_t, smrt_versions_t *); +static void smrt_discover(void *); + +/* + * The maximum number of seconds to wait for the controller to come online. + */ +unsigned smrt_ciss_init_time = 90; + +/* + * A tunable that determines the number of events per tick that we'll process + * via asynchronous event notification. If this rate is very high, then we will + * not submit the event and it will be picked up at the next tick of the + * periodic. + */ +uint_t smrt_event_intervention_threshold = 1000; + +/* + * Converts a LUN Address to a BMIC Identifier. The BMIC Identifier is used + * when performing various physical commands and generally should stay the same + * for a given device across inserts and removals; however, not across + * controller resets. These are calculated based on what the CISS specification + * calls the 'Level 2' target and bus, which don't have a real meaning in the + * SAS world otherwise. + */ +uint16_t +smrt_lun_addr_to_bmic(PhysDevAddr_t *paddr) +{ + uint16_t id; + + id = (paddr->Target[1].PeripDev.Bus - 1) << 8; + id += paddr->Target[1].PeripDev.Dev; + + return (id); +} + +void +smrt_write_lun_addr_phys(LUNAddr_t *lun, boolean_t masked, unsigned bus, + unsigned target) +{ + lun->PhysDev.Mode = masked ? MASK_PERIPHERIAL_DEV_ADDR : + PERIPHERIAL_DEV_ADDR; + + lun->PhysDev.TargetId = target; + lun->PhysDev.Bus = bus; + + bzero(&lun->PhysDev.Target, sizeof (lun->PhysDev.Target)); +} + +/* + * According to the CISS Specification, the controller is always addressed in + * Mask Perhiperhal mode with a bus and target ID of zero. This is used by + * commands that need to write to the controller itself, which is generally + * discovery and other commands. 
+ */ +void +smrt_write_controller_lun_addr(LUNAddr_t *lun) +{ + smrt_write_lun_addr_phys(lun, B_TRUE, 0, 0); +} + +void +smrt_write_message_common(smrt_command_t *smcm, uint8_t type, int timeout_secs) +{ + switch (type) { + case CISS_MSG_ABORT: + case CISS_MSG_RESET: + case CISS_MSG_NOP: + break; + + default: + panic("unknown message type"); + } + + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_MSG; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_HEADOFQUEUE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE; + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout_secs); + smcm->smcm_va_cmd->Request.CDBLen = CISS_CDBLEN; + smcm->smcm_va_cmd->Request.CDB[0] = type; +} + +void +smrt_write_message_abort_one(smrt_command_t *smcm, uint32_t tag) +{ + smrt_tag_t cisstag; + + /* + * When aborting a particular command, the request is addressed + * to the controller. + */ + smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, + B_TRUE, 0, 0); + + smrt_write_message_common(smcm, CISS_MSG_ABORT, 0); + + /* + * Abort a single command. + */ + smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASK; + + /* + * The CISS Specification says that the tag value for a task-level + * abort should be in the CDB in bytes 4-11. + */ + bzero(&cisstag, sizeof (cisstag)); + cisstag.tag_value = tag; + bcopy(&cisstag, &smcm->smcm_va_cmd->Request.CDB[4], + sizeof (cisstag)); +} + +void +smrt_write_message_abort_all(smrt_command_t *smcm, LUNAddr_t *addr) +{ + /* + * When aborting all tasks for a particular Logical Volume, + * the command is addressed not to the controller but to + * the Volume itself. + */ + smcm->smcm_va_cmd->Header.LUN = *addr; + + smrt_write_message_common(smcm, CISS_MSG_ABORT, 0); + + /* + * Abort all commands for a particular Logical Volume. 
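 *
 * Illustrative sketch (the smlv_addr member name is an assumption): a
 * caller draining a single logical volume addresses the abort to that
 * volume and submits it like any other command, e.g.
 *
 *	smrt_write_message_abort_all(smcm, &smlv->smlv_addr);
 *	if (smrt_submit(smrt, smcm) != 0) {
 *		... back off and retry later ...
 *	}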
+ */ + smcm->smcm_va_cmd->Request.CDB[1] = CISS_ABORT_TASKSET; +} + +void +smrt_write_message_event_notify(smrt_command_t *smcm) +{ + smrt_event_notify_req_t senr; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + smcm->smcm_va_cmd->Request.Timeout = 0; + smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr); + + bzero(&senr, sizeof (senr)); + senr.senr_opcode = CISS_SCMD_READ; + senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT; + senr.senr_flags = BE_32(0); + senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN); + + bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (senr))); +} + +void +smrt_write_message_cancel_event_notify(smrt_command_t *smcm) +{ + smrt_event_notify_req_t senr; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_ORDERED; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_WRITE; + smcm->smcm_va_cmd->Request.Timeout = LE_16(SMRT_ASYNC_CANCEL_TIMEOUT); + smcm->smcm_va_cmd->Request.CDBLen = sizeof (senr); + + bzero(&senr, sizeof (senr)); + senr.senr_opcode = CISS_SCMD_WRITE; + senr.senr_subcode = CISS_BMIC_NOTIFY_ON_EVENT_CANCEL; + senr.senr_size = BE_32(SMRT_EVENT_NOTIFY_BUFLEN); + + bcopy(&senr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (senr))); +} + +void +smrt_write_message_reset_ctlr(smrt_command_t *smcm) +{ + smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, + B_TRUE, 0, 0); + + smrt_write_message_common(smcm, CISS_MSG_RESET, 0); + + smcm->smcm_va_cmd->Request.CDB[1] = CISS_RESET_CTLR; +} + +void +smrt_write_message_nop(smrt_command_t *smcm, int timeout_secs) +{ + /* + * No-op messages are always sent to the controller. + */ + smrt_write_lun_addr_phys(&smcm->smcm_va_cmd->Header.LUN, + B_TRUE, 0, 0); + + smrt_write_message_common(smcm, CISS_MSG_NOP, timeout_secs); +} + +/* + * This routine is executed regularly by ddi_periodic_add(9F). It checks the + * health of the controller and looks for submitted commands that have timed + * out. + */ +void +smrt_periodic(void *arg) +{ + smrt_t *smrt = arg; + + mutex_enter(&smrt->smrt_mutex); + + /* + * Before we even check if the controller is running to process + * everything else, we must first check if we had a request to kick off + * discovery. We do this before the check if the controller is running, + * as this may be required to finish a discovery. + */ + if ((smrt->smrt_status & SMRT_CTLR_DISCOVERY_PERIODIC) != 0 && + (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) == 0 && + (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) == 0) { + if (ddi_taskq_dispatch(smrt->smrt_discover_taskq, + smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) { + smrt->smrt_stats.smrts_discovery_tq_errors++; + } else { + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_PERIODIC; + } + } + + if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) { + /* + * The device is currently not active, e.g. due to an + * in-progress controller reset. + */ + mutex_exit(&smrt->smrt_mutex); + return; + } + + /* + * Check on the health of the controller firmware. Note that if the + * controller has locked up, this routine will panic the system. + */ + smrt_lockup_check(smrt); + + /* + * Reset the event notification threshold counter. 
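 *
 * Illustrative sketch only (the event-completion side is not shown in
 * this excerpt, so the exact check is an assumption based on the
 * description of smrt_event_intervention_threshold above): the reset
 * below pairs with a per-completion rate check of roughly this shape:
 *
 *	if (++smrt->smrt_event_count > smrt_event_intervention_threshold) {
 *		smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION;
 *	} else {
 *		(void) smrt_submit(smrt, smrt->smrt_event_cmd);
 *	}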
+ */ + smrt->smrt_event_count = 0; + + /* + * Check inflight commands to see if they have timed out. + */ + for (smrt_command_t *smcm = avl_first(&smrt->smrt_inflight); + smcm != NULL; smcm = AVL_NEXT(&smrt->smrt_inflight, smcm)) { + if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) { + /* + * Polled commands are timed out by the polling + * routine. + */ + continue; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { + /* + * This command has been aborted; either it will + * complete or the controller will be reset. + */ + continue; + } + + if (list_link_active(&smcm->smcm_link_abort)) { + /* + * Already on the abort queue. + */ + continue; + } + + if (smcm->smcm_expiry == 0) { + /* + * This command has no expiry time. + */ + continue; + } + + if (gethrtime() > smcm->smcm_expiry) { + list_insert_tail(&smrt->smrt_abortq, smcm); + smcm->smcm_status |= SMRT_CMD_STATUS_TIMEOUT; + } + } + + /* + * Process the abort queue. + */ + (void) smrt_process_abortq(smrt); + + /* + * Check if we have an outstanding event intervention request. Note, + * the command in question should always be in a state such that it is + * usable by the system here. The command is always prepared again by + * the normal event notification path, even if a reset has occurred. + * The reset will be processed before we'd ever consider running an + * event again. Note, if we fail to submit this, then we leave this for + * the next occurrence of the periodic. + */ + if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) { + smrt->smrt_stats.smrts_events_intervened++; + + if (smrt_submit(smrt, smrt->smrt_event_cmd) == 0) { + smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION; + } + } + + mutex_exit(&smrt->smrt_mutex); +} + +int +smrt_retrieve(smrt_t *smrt) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + smrt_retrieve_simple(smrt); + return (DDI_SUCCESS); + + case SMRT_CTLR_MODE_UNKNOWN: + break; + } + + panic("unknown controller mode"); + /* LINTED: E_FUNC_NO_RET_VAL */ +} + +/* + * Grab a new tag number for this command. We aim to avoid reusing tag numbers + * as much as possible, so as to avoid spurious double completion from the + * controller. + */ +static void +smrt_set_new_tag(smrt_t *smrt, smrt_command_t *smcm) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + /* + * Loop until we find a tag that is not in use. The tag space is + * very large (~30 bits) and the maximum number of inflight commands + * is comparatively small (~1024 in current controllers). + */ + for (;;) { + uint32_t new_tag = smrt->smrt_next_tag; + + if (++smrt->smrt_next_tag > SMRT_MAX_TAG_NUMBER) { + smrt->smrt_next_tag = SMRT_MIN_TAG_NUMBER; + } + + if (smrt_lookup_inflight(smrt, new_tag) != NULL) { + /* + * This tag is already used on an inflight command. + * Choose another. + */ + continue; + } + + /* + * Set the tag for the command and also write it into the + * appropriate part of the request block. + */ + smcm->smcm_tag = new_tag; + smcm->smcm_va_cmd->Header.Tag.tag_value = new_tag; + return; + } +} + +/* + * Submit a command to the controller. + */ +int +smrt_submit(smrt_t *smrt, smrt_command_t *smcm) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + VERIFY(smcm->smcm_type != SMRT_CMDTYPE_PREINIT); + + /* + * Anything that asks us to ignore the running state of the controller + * must be wired up to poll for completion. 
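 *
 * Illustrative sketch (command setup and error handling are elided and
 * assumed): an internal, polled submission follows the pattern below,
 * pairing smrt_submit() with smrt_poll_for():
 *
 *	smrt_command_t *smcm;
 *
 *	if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL,
 *	    KM_NOSLEEP)) == NULL)
 *		return (ENOMEM);
 *	smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
 *	... fill in the request block ...
 *
 *	mutex_enter(&smrt->smrt_mutex);
 *	if (smrt_submit(smrt, smcm) == 0)
 *		(void) smrt_poll_for(smrt, smcm);
 *	mutex_exit(&smrt->smrt_mutex);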
+ */ + if (smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING) { + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); + } + + /* + * If the controller is currently being reset, do not allow command + * submission. However, if this is one of the commands needed to finish + * reset, as indicated on the command structure, allow it. + */ + if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING) && + !(smcm->smcm_status & SMRT_CMD_IGNORE_RUNNING)) { + return (EIO); + } + + /* + * Do not allow submission of more concurrent commands than the + * controller supports. + */ + if (avl_numnodes(&smrt->smrt_inflight) >= smrt->smrt_maxcmds) { + return (EAGAIN); + } + + /* + * Synchronise the Command Block DMA resources to ensure that the + * device has a consistent view before we pass it the command. + */ + if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0, + DDI_DMA_SYNC_FORDEV) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "DMA sync failure"); + return (EIO); + } + + /* + * Ensure that this command is not re-used without issuing a new + * tag number and performing any appropriate cleanup. + */ + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_USED)); + smcm->smcm_status |= SMRT_CMD_STATUS_USED; + + /* + * Assign a tag that is not currently in use + */ + smrt_set_new_tag(smrt, smcm); + + /* + * Insert this command into the inflight AVL. + */ + avl_index_t where; + if (avl_find(&smrt->smrt_inflight, smcm, &where) != NULL) { + dev_err(smrt->smrt_dip, CE_PANIC, "duplicate submit tag %x", + smcm->smcm_tag); + } + avl_insert(&smrt->smrt_inflight, smcm, where); + if (smrt->smrt_stats.smrts_max_inflight < + avl_numnodes(&smrt->smrt_inflight)) { + smrt->smrt_stats.smrts_max_inflight = + avl_numnodes(&smrt->smrt_inflight); + } + + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); + smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT; + + smcm->smcm_time_submit = gethrtime(); + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + smrt_submit_simple(smrt, smcm); + return (0); + + case SMRT_CTLR_MODE_UNKNOWN: + break; + } + panic("unknown controller mode"); + /* LINTED: E_FUNC_NO_RET_VAL */ +} + +static void +smrt_process_finishq_sync(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + if (ddi_dma_sync(smcm->smcm_contig.smdma_dma_handle, 0, 0, + DDI_DMA_SYNC_FORCPU) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "finishq DMA sync failure"); + } +} + +static void +smrt_process_finishq_one(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE)); + smcm->smcm_status |= SMRT_CMD_STATUS_COMPLETE; + + switch (smcm->smcm_type) { + case SMRT_CMDTYPE_INTERNAL: + cv_broadcast(&smcm->smcm_ctlr->smrt_cv_finishq); + return; + + case SMRT_CMDTYPE_SCSA: + smrt_hba_complete(smcm); + return; + + case SMRT_CMDTYPE_EVENT: + smrt_event_complete(smcm); + return; + + case SMRT_CMDTYPE_ABORTQ: + /* + * Abort messages sent as part of abort queue processing + * do not require any completion activity. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + return; + + case SMRT_CMDTYPE_PREINIT: + dev_err(smrt->smrt_dip, CE_PANIC, "preinit command " + "completed after initialisation"); + return; + } + + panic("unknown command type"); +} + +/* + * Process commands in the completion queue. 
+ */ +void +smrt_process_finishq(smrt_t *smrt) +{ + smrt_command_t *smcm; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + while ((smcm = list_remove_head(&smrt->smrt_finishq)) != NULL) { + /* + * Synchronise the Command Block before we read from it or + * free it, to ensure that any writes from the controller are + * visible. + */ + smrt_process_finishq_sync(smcm); + + /* + * Check if this command was in line to be aborted. + */ + if (list_link_active(&smcm->smcm_link_abort)) { + /* + * This command was in line, but the controller + * subsequently completed the command before we + * were able to do so. + */ + list_remove(&smrt->smrt_abortq, smcm); + smcm->smcm_status &= ~SMRT_CMD_STATUS_TIMEOUT; + } + + /* + * Check if this command has been abandoned by the original + * submitter. If it has, free it now to avoid a leak. + */ + if (smcm->smcm_status & SMRT_CMD_STATUS_ABANDONED) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + continue; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_POLLED) { + /* + * This command will be picked up and processed + * by "smrt_poll_for()" once the CV is triggered + * at the end of processing. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; + continue; + } + + smrt_process_finishq_one(smcm); + } + + cv_broadcast(&smrt->smrt_cv_finishq); +} + +/* + * Process commands in the abort queue. + */ +void +smrt_process_abortq(smrt_t *smrt) +{ + smrt_command_t *smcm; + smrt_command_t *abort_smcm = NULL; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (list_is_empty(&smrt->smrt_abortq)) { + goto out; + } + +another: + mutex_exit(&smrt->smrt_mutex); + if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_ABORTQ, + KM_NOSLEEP)) == NULL) { + /* + * No resources available to send abort messages. We will + * try again the next time around. + */ + mutex_enter(&smrt->smrt_mutex); + goto out; + } + mutex_enter(&smrt->smrt_mutex); + + while ((smcm = list_remove_head(&smrt->smrt_abortq)) != NULL) { + if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { + /* + * This message is not currently inflight, so + * no abort is needed. + */ + continue; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { + /* + * An abort message has already been sent for + * this command. + */ + continue; + } + + /* + * Send an abort message for the command. + */ + smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag); + if (smrt_submit(smrt, abort_smcm) != 0) { + /* + * The command could not be submitted to the + * controller. Put it back in the abort queue + * and give up for now. + */ + list_insert_head(&smrt->smrt_abortq, smcm); + goto out; + } + smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT; + + /* + * Record some debugging information about the abort we + * sent: + */ + smcm->smcm_abort_time = gethrtime(); + smcm->smcm_abort_tag = abort_smcm->smcm_tag; + + /* + * The abort message was sent. Release it and + * allocate another command. + */ + abort_smcm = NULL; + goto another; + } + +out: + cv_broadcast(&smrt->smrt_cv_finishq); + if (abort_smcm != NULL) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(abort_smcm); + mutex_enter(&smrt->smrt_mutex); + } +} + +int +smrt_poll_for(smrt_t *smrt, smrt_command_t *smcm) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); + + while (!(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE)) { + if (smcm->smcm_expiry != 0) { + /* + * This command has an expiry time. 
Check to see + * if it has already passed: + */ + if (smcm->smcm_expiry < gethrtime()) { + return (ETIMEDOUT); + } + } + + if (ddi_in_panic()) { + /* + * When the system is panicking, there are no + * interrupts or other threads. Drive the polling loop + * on our own, but with a small delay to avoid + * aggrevating the controller while we're trying to + * dump. + */ + (void) smrt_retrieve(smrt); + smrt_process_finishq(smrt); + drv_usecwait(100); + continue; + } + + /* + * Wait for command completion to return through the regular + * interrupt handling path. + */ + if (smcm->smcm_expiry == 0) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } else { + /* + * Wait only until the expiry time for this command. + */ + (void) cv_timedwait_sig_hrtime(&smrt->smrt_cv_finishq, + &smrt->smrt_mutex, smcm->smcm_expiry); + } + } + + /* + * Fire the completion callback for this command. The callback + * is responsible for freeing the command, so it may not be + * referenced again once this call returns. + */ + smrt_process_finishq_one(smcm); + + return (0); +} + +void +smrt_intr_set(smrt_t *smrt, boolean_t enabled) +{ + /* + * Read the Interrupt Mask Register. + */ + uint32_t imr = smrt_get32(smrt, CISS_I2O_INTERRUPT_MASK); + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + if (enabled) { + imr &= ~CISS_IMR_BIT_SIMPLE_INTR_DISABLE; + } else { + imr |= CISS_IMR_BIT_SIMPLE_INTR_DISABLE; + } + smrt_put32(smrt, CISS_I2O_INTERRUPT_MASK, imr); + return; + + case SMRT_CTLR_MODE_UNKNOWN: + break; + } + panic("unknown controller mode"); +} + +/* + * Signal to the controller that we have updated the Configuration Table by + * writing to the Inbound Doorbell Register. The controller will, after some + * number of seconds, acknowledge this by clearing the bit. + * + * If successful, return DDI_SUCCESS. If the controller takes too long to + * acknowledge, return DDI_FAILURE. + */ +int +smrt_cfgtbl_flush(smrt_t *smrt) +{ + /* + * Read the current value of the Inbound Doorbell Register. + */ + uint32_t idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL); + + /* + * Signal the Configuration Table change to the controller. + */ + idr |= CISS_IDR_BIT_CFGTBL_CHANGE; + smrt_put32(smrt, CISS_I2O_INBOUND_DOORBELL, idr); + + /* + * Wait for the controller to acknowledge the change. + */ + for (unsigned i = 0; i < smrt_ciss_init_time; i++) { + idr = smrt_get32(smrt, CISS_I2O_INBOUND_DOORBELL); + + if ((idr & CISS_IDR_BIT_CFGTBL_CHANGE) == 0) { + return (DDI_SUCCESS); + } + + /* + * Wait for one second before trying again. + */ + delay(drv_usectohz(1000000)); + } + + dev_err(smrt->smrt_dip, CE_WARN, "time out expired before controller " + "configuration completed"); + return (DDI_FAILURE); +} + +int +smrt_cfgtbl_transport_has_support(smrt_t *smrt, int xport) +{ + VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE); + + /* + * Read the current value of the "Supported Transport Methods" field in + * the Configuration Table. + */ + uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->TransportSupport); + + /* + * Check that the desired transport method is supported by the + * controller: + */ + if ((xport_active & xport) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "controller does not support " + "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ? 
+ "simple" : "performant"); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +void +smrt_cfgtbl_transport_set(smrt_t *smrt, int xport) +{ + VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE); + + ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->TransportRequest, + xport); +} + +int +smrt_cfgtbl_transport_confirm(smrt_t *smrt, int xport) +{ + VERIFY(xport == CISS_CFGTBL_XPORT_SIMPLE); + + /* + * Read the current value of the TransportActive field in the + * Configuration Table. + */ + uint32_t xport_active = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->TransportActive); + + /* + * Check that the desired transport method is now active: + */ + if ((xport_active & xport) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to enable transport " + "method \"%s\"", xport == CISS_CFGTBL_XPORT_SIMPLE ? + "simple" : "performant"); + return (DDI_FAILURE); + } + + /* + * Ensure that the controller is now ready to accept commands. + */ + if ((xport_active & CISS_CFGTBL_READY_FOR_COMMANDS) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "controller not ready to " + "accept commands"); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +uint32_t +smrt_ctlr_get_maxsgelements(smrt_t *smrt) +{ + return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->MaxSGElements)); +} + +uint32_t +smrt_ctlr_get_cmdsoutmax(smrt_t *smrt) +{ + return (ddi_get32(smrt->smrt_ct_handle, &smrt->smrt_ct->CmdsOutMax)); +} + +static uint32_t +smrt_ctlr_get_hostdrvsup(smrt_t *smrt) +{ + return (ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->HostDrvrSupport)); +} + +int +smrt_ctlr_init(smrt_t *smrt) +{ + uint8_t signature[4] = { 'C', 'I', 'S', 'S' }; + int e; + + if ((e = smrt_ctlr_wait_for_state(smrt, + SMRT_WAIT_STATE_READY)) != DDI_SUCCESS) { + return (e); + } + + /* + * The configuration table contains an ASCII signature ("CISS") which + * should be checked as we initialise the controller. + * See: "9.1 Configuration Table" in CISS Specification. + */ + for (unsigned i = 0; i < 4; i++) { + if (ddi_get8(smrt->smrt_ct_handle, + &smrt->smrt_ct->Signature[i]) != signature[i]) { + dev_err(smrt->smrt_dip, CE_WARN, "invalid signature " + "detected"); + return (DDI_FAILURE); + } + } + + /* + * Initialise an appropriate Transport Method. For now, this driver + * only supports the "Simple" method. + */ + if ((e = smrt_ctlr_init_simple(smrt)) != DDI_SUCCESS) { + return (e); + } + + /* + * Save some common feature support bitfields. + */ + smrt->smrt_host_support = smrt_ctlr_get_hostdrvsup(smrt); + smrt->smrt_bus_support = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->BusTypes); + + /* + * Read initial controller heartbeat value and mark the current + * reading time. + */ + smrt->smrt_last_heartbeat = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->HeartBeat); + smrt->smrt_last_heartbeat_time = gethrtime(); + + /* + * Determine the firmware version of the controller so that we can + * select which type of interrupts to use. 
+ */ + if ((e = smrt_ctlr_versions(smrt, SMRT_DISCOVER_TIMEOUT, + &smrt->smrt_versions)) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "could not identify " + "controller (%d)", e); + return (DDI_FAILURE); + } + + dev_err(smrt->smrt_dip, CE_NOTE, "!firmware rev %s", + smrt->smrt_versions.smrtv_firmware_rev); + + return (DDI_SUCCESS); +} + +void +smrt_ctlr_teardown(smrt_t *smrt) +{ + smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING; + + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + smrt_ctlr_teardown_simple(smrt); + return; + + case SMRT_CTLR_MODE_UNKNOWN: + return; + } + + panic("unknown controller mode"); +} + +int +smrt_ctlr_wait_for_state(smrt_t *smrt, smrt_wait_state_t state) +{ + unsigned wait_usec = 100 * 1000; + unsigned wait_count = SMRT_WAIT_DELAY_SECONDS * 1000000 / wait_usec; + + VERIFY(state == SMRT_WAIT_STATE_READY || + state == SMRT_WAIT_STATE_UNREADY); + + /* + * Read from the Scratchpad Register until the expected ready signature + * is detected. This behaviour is not described in the CISS + * specification. + * + * If the device is not in the desired state immediately, sleep for a + * second and try again. If the device has not become ready in 300 + * seconds, give up. + */ + for (unsigned i = 0; i < wait_count; i++) { + uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD); + + switch (state) { + case SMRT_WAIT_STATE_READY: + if (spr == CISS_SCRATCHPAD_INITIALISED) { + return (DDI_SUCCESS); + } + break; + + case SMRT_WAIT_STATE_UNREADY: + if (spr != CISS_SCRATCHPAD_INITIALISED) { + return (DDI_SUCCESS); + } + break; + } + + if (ddi_in_panic()) { + /* + * There is no sleep for the panicking, so we + * must spin wait: + */ + drv_usecwait(wait_usec); + } else { + /* + * Wait for a quarter second and try again. + */ + delay(drv_usectohz(wait_usec)); + } + } + + dev_err(smrt->smrt_dip, CE_WARN, "time out waiting for controller " + "to enter state \"%s\"", state == SMRT_WAIT_STATE_READY ? + "ready": "unready"); + return (DDI_FAILURE); +} + +void +smrt_lockup_check(smrt_t *smrt) +{ + /* + * Read the current controller heartbeat value. + */ + uint32_t heartbeat = ddi_get32(smrt->smrt_ct_handle, + &smrt->smrt_ct->HeartBeat); + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + /* + * Check to see if the value is the same as last time we looked: + */ + if (heartbeat != smrt->smrt_last_heartbeat) { + /* + * The heartbeat value has changed, which suggests that the + * firmware in the controller has not yet come to a complete + * stop. Record the new value, as well as the current time. + */ + smrt->smrt_last_heartbeat = heartbeat; + smrt->smrt_last_heartbeat_time = gethrtime(); + return; + } + + /* + * The controller _might_ have been able to signal to us that is + * has locked up. This is a truly unfathomable state of affairs: + * If the firmware can tell it has flown off the rails, why not + * simply reset the controller? + */ + uint32_t odr = smrt_get32(smrt, CISS_I2O_OUTBOUND_DOORBELL_STATUS); + uint32_t spr = smrt_get32(smrt, CISS_I2O_SCRATCHPAD); + if ((odr & CISS_ODR_BIT_LOCKUP) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has " + "reported a critical fault (odr %08x spr %08x)", + odr, spr); + } + + if (gethrtime() > smrt->smrt_last_heartbeat_time + 60 * NANOSEC) { + dev_err(smrt->smrt_dip, CE_PANIC, "HP SmartArray firmware has " + "stopped responding (odr %08x spr %08x)", + odr, spr); + } +} + +/* + * Probe the controller with the IDENTIFY CONTROLLER request. 
This is a BMIC + * command, so it must be submitted to the controller and we must poll for its + * completion. This functionality is only presently used during controller + * initialisation, so it uses the special pre-initialisation path for command + * allocation and submission. + */ +static int +smrt_ctlr_identify(smrt_t *smrt, uint16_t timeout, + smrt_identify_controller_t *resp) +{ + smrt_command_t *smcm; + smrt_identify_controller_req_t smicr; + int r; + size_t sz; + + /* + * Allocate a command with a data buffer; the controller will fill it + * with identification information. There is some suggestion in the + * firmware-level specification that the buffer length should be a + * multiple of 512 bytes for some controllers, so we round up. + */ + sz = P2ROUNDUP_TYPED(sizeof (*resp), 512, size_t); + if ((smcm = smrt_command_alloc_preinit(smrt, sz, KM_SLEEP)) == NULL) { + return (ENOMEM); + } + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (smicr); + smcm->smcm_va_cmd->Request.Timeout = timeout; + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * Construct the IDENTIFY CONTROLLER request CDB. Note that any + * reserved fields in the request must be filled with zeroes. + */ + bzero(&smicr, sizeof (smicr)); + smicr.smicr_opcode = CISS_SCMD_BMIC_READ; + smicr.smicr_lun = 0; + smicr.smicr_command = CISS_BMIC_IDENTIFY_CONTROLLER; + bcopy(&smicr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (smicr))); + + /* + * Send the command to the device and poll for its completion. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((r = smrt_preinit_command_simple(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * This command timed out, but the driver is not presently + * initialised to the point where we can try to abort it. + * The command was created with the PREINIT type, so it + * does not appear in the global command tracking list. + * In order to avoid problems with DMA from the controller, + * we have to leak the command allocation. + */ + smcm = NULL; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to identify + * it. Report failure. + */ + r = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "identify " + "controller error: status 0x%x", + ei->CommandStatus); + r = EIO; + goto out; + } + } + + if (resp != NULL) { + /* + * Copy the identify response out for the caller. + */ + bcopy(smcm->smcm_internal->smcmi_va, resp, sizeof (*resp)); + } + + r = 0; + +out: + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (r); +} + +/* + * The firmware versions in an IDENTIFY CONTROLLER response generally take + * the form of a four byte ASCII string containing a dotted decimal version + * number; e.g., "8.00". + * + * This function sanitises the firmware version, replacing unexpected + * values with a question mark. + */ +static void +smrt_copy_firmware_version(uint8_t *src, char *dst) +{ + for (unsigned i = 0; i < 4; i++) { + /* + * Make sure that this is a 7-bit clean ASCII value. 
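+ * For example, an input of { '8', '.', '0', '0' } is copied through as
+ * "8.00", while any byte that is not alphanumeric, '.', or ' ' (or that
+ * has the high bit set) is replaced with '?'.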
+ */ + char c = src[i] <= 0x7f ? (char)(src[i] & 0x7f) : '?'; + + if (isalnum(c) || c == '.' || c == ' ') { + dst[i] = c; + } else { + dst[i] = '?'; + } + } + dst[4] = '\0'; +} + +/* + * Using an IDENTIFY CONTROLLER request, determine firmware and controller + * version details. See the comments for "smrt_ctlr_identify()" for more + * details about calling context. + */ +static int +smrt_ctlr_versions(smrt_t *smrt, uint16_t timeout, smrt_versions_t *smrtv) +{ + smrt_identify_controller_t smic; + int r; + + if ((r = smrt_ctlr_identify(smrt, timeout, &smic)) != 0) { + return (r); + } + + smrtv->smrtv_hardware_version = smic.smic_hardware_version; + smrt_copy_firmware_version(smic.smic_firmware_rev, + smrtv->smrtv_firmware_rev); + smrt_copy_firmware_version(smic.smic_recovery_rev, + smrtv->smrtv_recovery_rev); + smrt_copy_firmware_version(smic.smic_bootblock_rev, + smrtv->smrtv_bootblock_rev); + + return (0); +} + +int +smrt_ctlr_reset(smrt_t *smrt) +{ + smrt_command_t *smcm, *smcm_nop; + int r; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (ddi_in_panic()) { + goto skip_check; + } + + if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + /* + * Don't pile on. One reset is enough. Wait until + * it's complete, and then return success. + */ + while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } + return (0); + } + smrt->smrt_status |= SMRT_CTLR_STATUS_RESETTING; + smrt->smrt_last_reset_start = gethrtime(); + smrt->smrt_stats.smrts_ctlr_resets++; + +skip_check: + /* + * Allocate two commands: one for the soft reset message, which we + * cannot free until the controller has reset; and one for the ping we + * will use to determine when it is once again functional. + */ + mutex_exit(&smrt->smrt_mutex); + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + mutex_enter(&smrt->smrt_mutex); + return (ENOMEM); + } + if ((smcm_nop = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + return (ENOMEM); + } + mutex_enter(&smrt->smrt_mutex); + + /* + * Send a soft reset command to the controller. If this command + * succeeds, there will likely be no completion notification. Instead, + * the device should become unavailable for some period of time and + * then become available again. Once available again, we know the soft + * reset has completed and should abort all in-flight commands. + */ + smrt_write_message_reset_ctlr(smcm); + + /* + * Disable interrupts now. + */ + smrt_intr_set(smrt, B_FALSE); + + dev_err(smrt->smrt_dip, CE_WARN, "attempting controller soft reset"); + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "submit failed (%d)", r); + } + + /* + * Mark every currently inflight command as being reset, including the + * soft reset command we just sent. Once we confirm the reset works, + * we can safely report that these commands have failed. + */ + for (smrt_command_t *t = avl_first(&smrt->smrt_inflight); + t != NULL; t = AVL_NEXT(&smrt->smrt_inflight, t)) { + t->smcm_status |= SMRT_CMD_STATUS_RESET_SENT; + } + + /* + * Now that we have submitted our soft reset command, prevent + * the rest of the driver from interacting with the controller. 
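+ * Once SMRT_CTLR_STATUS_RUNNING is cleared, smrt_submit() will fail with
+ * EIO for any command that is not explicitly marked
+ * SMRT_CMD_IGNORE_RUNNING, such as the ping we send later in this
+ * routine.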
+ */ + smrt->smrt_status &= ~SMRT_CTLR_STATUS_RUNNING; + + /* + * We do not expect a completion from the controller for our soft + * reset command, but we also cannot remove it from the inflight + * list until we know the controller has actually reset. To do + * otherwise would potentially allow the controller to scribble + * on the memory we were using. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + + if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_UNREADY) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "controller did not become unready"); + } + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller unready"); + + if (smrt_ctlr_wait_for_state(smrt, SMRT_WAIT_STATE_READY) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "controller did not come become ready"); + } + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller ready"); + + /* + * In at least the Smart Array P420i, the controller can take 30-45 + * seconds after the scratchpad register shows it as being available + * before it is ready to receive commands. In order to avoid hitting + * it too early with our post-reset ping, we will sleep for 10 seconds + * here. + */ + if (ddi_in_panic()) { + drv_usecwait(10 * MICROSEC); + } else { + delay(drv_usectohz(10 * MICROSEC)); + } + + smrt_ctlr_teardown(smrt); + if (smrt_ctlr_init(smrt) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "controller transport could not be configured"); + } + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: controller configured"); + + smrt_write_message_nop(smcm_nop, 0); + smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLLED | + SMRT_CMD_IGNORE_RUNNING; + if ((r = smrt_submit(smrt, smcm_nop)) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "ping could not be submitted (%d)", r); + } + + /* + * Interrupts are still masked at this stage. Poll manually in + * a way that will not trigger regular finish queue processing: + */ + VERIFY(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT); + for (unsigned i = 0; i < 600; i++) { + smrt_retrieve_simple(smrt); + + if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { + /* + * Remove the ping command from the finish queue and + * process it manually. This processing must mirror + * what would have been done in smrt_process_finishq(). + */ + VERIFY(list_link_active(&smcm_nop->smcm_link_finish)); + list_remove(&smrt->smrt_finishq, smcm_nop); + smrt_process_finishq_sync(smcm_nop); + smcm_nop->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; + smrt_process_finishq_one(smcm_nop); + break; + } + + if (ddi_in_panic()) { + drv_usecwait(100 * 1000); + } else { + delay(drv_usectohz(100 * 1000)); + } + } + + if (!(smcm_nop->smcm_status & SMRT_CMD_STATUS_COMPLETE)) { + dev_err(smrt->smrt_dip, CE_PANIC, "soft reset failed: " + "ping did not complete"); + } else if (smcm_nop->smcm_status & SMRT_CMD_STATUS_ERROR) { + dev_err(smrt->smrt_dip, CE_WARN, "soft reset: ping completed " + "in error (status %u)", + (unsigned)smcm_nop->smcm_va_err->CommandStatus); + } else { + dev_err(smrt->smrt_dip, CE_NOTE, "soft reset: ping completed"); + } + + /* + * Now that the controller is working again, we can abort any + * commands that were inflight during the reset. 
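+ * Rather than sending individual abort messages, each command that was
+ * marked SMRT_CMD_STATUS_RESET_SENT is removed from the inflight AVL and
+ * pushed onto the finish queue, so that its owner observes the failure
+ * through the normal completion path.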
+ */
+ smrt_command_t *nt;
+ for (smrt_command_t *t = avl_first(&smrt->smrt_inflight);
+ t != NULL; t = nt) {
+ nt = AVL_NEXT(&smrt->smrt_inflight, t);
+
+ if (t->smcm_status & SMRT_CMD_STATUS_RESET_SENT) {
+ avl_remove(&smrt->smrt_inflight, t);
+ t->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT;
+
+ list_insert_tail(&smrt->smrt_finishq, t);
+ }
+ }
+
+ /*
+ * Quiesce our discovery thread. Note that because
+ * SMRT_CTLR_STATUS_RESETTING is set, nothing can cause it to be
+ * enabled again.
+ */
+ if (!ddi_in_panic()) {
+ mutex_exit(&smrt->smrt_mutex);
+ ddi_taskq_wait(smrt->smrt_discover_taskq);
+ mutex_enter(&smrt->smrt_mutex);
+ }
+
+ /*
+ * Re-enable interrupts. Now, we must kick off a discovery to make sure
+ * that the system is in a sane state and that we can perform I/O.
+ */
+ smrt_intr_set(smrt, B_TRUE);
+ smrt->smrt_status &= ~SMRT_CTLR_STATUS_RESETTING;
+ smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUIRED;
+
+ /*
+ * Attempt a discovery to make sure that the driver sees a realistic
+ * view of the world. If we're not in panic context, wait for the
+ * asynchronous process to complete; otherwise we're in panic context
+ * and this is going to happen regardless of whether we want it to or
+ * not. Before we kick off the request to run discovery, we reset the
+ * discovery request flags, as we know that nothing else can consider
+ * running discovery and we don't want to delay until the next smrt
+ * periodic tick if we can avoid it. In panic context, if this fails,
+ * then we won't make it back.
+ */
+ VERIFY0(smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING);
+ smrt->smrt_status &= ~(SMRT_CTLR_DISCOVERY_MASK);
+ smrt_discover(smrt);
+ if (!ddi_in_panic()) {
+ while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) {
+ cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex);
+ }
+ }
+
+ smrt->smrt_status |= SMRT_CTLR_STATUS_RUNNING;
+ smrt->smrt_last_reset_finish = gethrtime();
+
+ /*
+ * Wake anybody that was waiting for the reset to complete.
+ */
+ cv_broadcast(&smrt->smrt_cv_finishq);
+
+ /*
+ * Process the completion queue one last time before we let go
+ * of the mutex.
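+ * With interrupts re-enabled above, this simply drains any completions
+ * that arrived while we were finishing up, rather than leaving them for
+ * the next interrupt or periodic tick.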
+ */ + smrt_process_finishq(smrt); + + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm_nop); + mutex_enter(&smrt->smrt_mutex); + return (0); +} + +int +smrt_event_init(smrt_t *smrt) +{ + int ret; + smrt_command_t *event, *cancel; + + event = smrt_command_alloc(smrt, SMRT_CMDTYPE_EVENT, KM_NOSLEEP); + if (event == NULL) + return (ENOMEM); + if (smrt_command_attach_internal(smrt, event, SMRT_EVENT_NOTIFY_BUFLEN, + KM_NOSLEEP) != 0) { + smrt_command_free(event); + return (ENOMEM); + } + smrt_write_message_event_notify(event); + + cancel = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_NOSLEEP); + if (cancel == NULL) { + smrt_command_free(event); + return (ENOMEM); + } + if (smrt_command_attach_internal(smrt, cancel, SMRT_EVENT_NOTIFY_BUFLEN, + KM_NOSLEEP) != 0) { + smrt_command_free(event); + smrt_command_free(cancel); + return (ENOMEM); + } + smrt_write_message_cancel_event_notify(cancel); + + cv_init(&smrt->smrt_event_queue, NULL, CV_DRIVER, NULL); + + mutex_enter(&smrt->smrt_mutex); + if ((ret = smrt_submit(smrt, event)) != 0) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(event); + smrt_command_free(cancel); + return (ret); + } + + smrt->smrt_event_cmd = event; + smrt->smrt_event_cancel_cmd = cancel; + mutex_exit(&smrt->smrt_mutex); + + return (0); +} + +void +smrt_event_complete(smrt_command_t *smcm) +{ + smrt_event_notify_t *sen; + boolean_t log, rescan; + + boolean_t intervene = B_FALSE; + smrt_t *smrt = smcm->smcm_ctlr; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + VERIFY3P(smcm, ==, smrt->smrt_event_cmd); + VERIFY0(smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION); + + smrt->smrt_stats.smrts_events_received++; + + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + cv_signal(&smrt->smrt_event_queue); + return; + } + + if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + intervene = B_TRUE; + goto clean; + } + + /* + * The event notification command failed for some reason. Attempt to + * drive on and try again at the next intervention period. Because this + * may represent a programmer error (though it's hard to know), we wait + * until the next intervention period and don't panic. + */ + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + intervene = B_TRUE; + + smrt->smrt_stats.smrts_events_errors++; + dev_err(smrt->smrt_dip, CE_WARN, "!event notification request " + "error: status 0x%x", ei->CommandStatus); + goto clean; + } + + sen = smcm->smcm_internal->smcmi_va; + log = rescan = B_FALSE; + switch (sen->sen_class) { + case SMRT_EVENT_CLASS_PROTOCOL: + /* + * Most of the event protocol class events aren't really + * actionable. However, subclass 1 indicates errors. Today, + * the only error is an event overflow. If there's an event + * overflow, then we must assume that we need to rescan. + */ + if (sen->sen_subclass == SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR) { + rescan = B_TRUE; + } + break; + case SMRT_EVENT_CLASS_HOTPLUG: + /* + * We want to log all hotplug events. However we only need to + * scan these if the subclass indicates the event is for a disk. + */ + log = B_TRUE; + if (sen->sen_subclass == SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE) { + rescan = B_TRUE; + } + break; + case SMRT_EVENT_CLASS_HWERROR: + case SMRT_EVENT_CLASS_ENVIRONMENT: + log = B_TRUE; + break; + case SMRT_EVENT_CLASS_PHYS: + log = B_TRUE; + /* + * This subclass indicates some change for physical drives. As + * such, this should trigger a rescan. 
+ */ + if (sen->sen_subclass == SMRT_EVENT_PHYS_SUBCLASS_STATE) { + rescan = B_TRUE; + } + break; + case SMRT_EVENT_CLASS_LOGVOL: + rescan = B_TRUE; + log = B_TRUE; + break; + default: + /* + * While there are other classes of events, it's hard to say how + * actionable they are for the moment. If we revamp this such + * that it becomes an ireport based system, then we should just + * always log these. We opt not to at the moment to try and be + * kind to the system log. + */ + break; + } + + /* + * Ideally, this would be an ireport that we could pass onto + * administrators; however, since we don't have any way to generate + * that, we provide a subset of the event information. + */ + if (log) { + const char *rmsg; + if (rescan == B_TRUE) { + rmsg = "rescanning"; + } else { + rmsg = "not rescanning"; + } + if (sen->sen_message[0] != '\0') { + sen->sen_message[sizeof (sen->sen_message) - 1] = '\0'; + dev_err(smrt->smrt_dip, CE_NOTE, "!controller event " + "class/sub-class/detail %x, %x, %x: %s; %s devices", + sen->sen_class, sen->sen_subclass, sen->sen_detail, + sen->sen_message, rmsg); + } else { + dev_err(smrt->smrt_dip, CE_NOTE, "!controller event " + "class/sub-class/detail %x, %x, %x; %s devices", + sen->sen_class, sen->sen_subclass, sen->sen_detail, + rmsg); + } + } + + if (rescan) + smrt_discover_request(smrt); + +clean: + mutex_exit(&smrt->smrt_mutex); + smrt_command_reuse(smcm); + bzero(smcm->smcm_internal->smcmi_va, SMRT_EVENT_NOTIFY_BUFLEN); + mutex_enter(&smrt->smrt_mutex); + + /* + * Make sure we're not _now_ detaching or resetting. + */ + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + cv_signal(&smrt->smrt_event_queue); + return; + } + + if ((smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) != 0 || + intervene == B_TRUE) { + smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; + return; + } + + /* + * Check out command count per tick. If it's too high, leave it for + * intervention to solve. Likely there is some serious driver or + * firmware error going on. + */ + smrt->smrt_event_count++; + if (smrt->smrt_event_count > smrt_event_intervention_threshold) { + smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; + return; + } + + if (smrt_submit(smrt, smcm) != 0) { + smrt->smrt_status |= SMRT_CTLR_ASYNC_INTERVENTION; + } +} + +void +smrt_event_fini(smrt_t *smrt) +{ + int ret; + smrt_command_t *event, *cancel; + mutex_enter(&smrt->smrt_mutex); + + /* + * If intervention has been requested, there is nothing for us to do. We + * clear the flag so nothing else accidentally sees this and takes + * action. We also don't need to bother sending a cancellation request, + * as there is no outstanding event. + */ + if (smrt->smrt_status & SMRT_CTLR_ASYNC_INTERVENTION) { + smrt->smrt_status &= ~SMRT_CTLR_ASYNC_INTERVENTION; + goto free; + } + + /* + * Submit a cancel request for the event notification queue. Because we + * submit both the cancel event and the regular notification event as an + * ordered command, we know that by the time this completes, that the + * existing one will have completed. + */ + smrt->smrt_event_cancel_cmd->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((ret = smrt_submit(smrt, smrt->smrt_event_cancel_cmd)) != 0) { + /* + * This is unfortunate. We've failed to submit the command. At + * this point all we can do is reset the device. If the reset + * succeeds, we're done and we can clear all the memory. If it + * fails, then all we can do is just leak the command and scream + * to the system, sorry. 
+ */ + if (smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " + "device after failure to submit cancellation " + "(%d), abandoning smrt_command_t at address %p", + ret, smrt->smrt_event_cmd); + smrt->smrt_event_cmd = NULL; + goto free; + } + } + + smrt->smrt_event_cancel_cmd->smcm_expiry = gethrtime() + + SMRT_ASYNC_CANCEL_TIMEOUT * NANOSEC; + if ((ret = smrt_poll_for(smrt, smrt->smrt_event_cancel_cmd)) != 0) { + VERIFY3S(ret, ==, ETIMEDOUT); + VERIFY0(smrt->smrt_event_cancel_cmd->smcm_status & + SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out. All we can do is hope a reset will + * work. + */ + if (smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " + "device after failure to poll for async " + "cancellation command abandoning smrt_command_t " + "event command at address %p and cancellation " + "command at %p", smrt->smrt_event_cmd, + smrt->smrt_event_cancel_cmd); + smrt->smrt_event_cmd = NULL; + smrt->smrt_event_cancel_cmd = NULL; + goto free; + } + + } + + /* + * Well, in the end, it's results that count. + */ + if (smrt->smrt_event_cancel_cmd->smcm_status & + SMRT_CMD_STATUS_RESET_SENT) { + goto free; + } + + if (smrt->smrt_event_cancel_cmd->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smrt->smrt_event_cancel_cmd->smcm_va_err; + + /* + * This can return a CISS_CMD_TARGET_STATUS entry when the + * controller doesn't think a command is outstanding. It is + * possible we raced, so don't think too much about that case. + * Anything else leaves us between a rock and a hard place, the + * only way out is a reset. + */ + if (ei->CommandStatus != CISS_CMD_TARGET_STATUS && + smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to reset " + "device after receiving an error on the async " + "cancellation command (%d); abandoning " + "smrt_command_t event command at address %p and " + "cancellation command at %p", ei->CommandStatus, + smrt->smrt_event_cmd, smrt->smrt_event_cancel_cmd); + smrt->smrt_event_cmd = NULL; + smrt->smrt_event_cancel_cmd = NULL; + goto free; + } + } + +free: + event = smrt->smrt_event_cmd; + smrt->smrt_event_cmd = NULL; + cancel = smrt->smrt_event_cancel_cmd; + smrt->smrt_event_cancel_cmd = NULL; + mutex_exit(&smrt->smrt_mutex); + if (event != NULL) + smrt_command_free(event); + if (cancel != NULL) + smrt_command_free(cancel); + cv_destroy(&smrt->smrt_event_queue); +} + +/* + * We've been asked to do a discovery in panic context. This would have + * occurred because there was a device reset. Because we can't rely on the + * target maps, all we can do at the moment is go over all the active targets + * and note which ones no longer exist. If this target was required to dump, + * then the dump code will encounter a fatal error. If not, then we should + * count ourselves surprisingly lucky. + */ +static void +smrt_discover_panic_check(smrt_t *smrt) +{ + smrt_target_t *smtg; + + ASSERT(MUTEX_HELD(&smrt->smrt_mutex)); + for (smtg = list_head(&smrt->smrt_targets); smtg != NULL; + smtg = list_next(&smrt->smrt_targets, smtg)) { + uint64_t gen; + + if (smtg->smtg_physical) { + smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys; + /* + * Don't worry about drives that aren't visible. 
+ */ + if (!smpt->smpt_visible) + continue; + gen = smpt->smpt_gen; + } else { + smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol; + gen = smlv->smlv_gen; + } + + if (gen != smrt->smrt_discover_gen) { + dev_err(smrt->smrt_dip, CE_WARN, "target %s " + "disappeared during post-panic discovery", + scsi_device_unit_address(smtg->smtg_scsi_dev)); + smtg->smtg_gone = B_TRUE; + } + } +} + +static void +smrt_discover(void *arg) +{ + int log = 0, phys = 0; + smrt_t *smrt = arg; + uint64_t gen; + boolean_t runphys, runvirt; + + mutex_enter(&smrt->smrt_mutex); + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_RUNNING; + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUESTED; + + smrt->smrt_discover_gen++; + gen = smrt->smrt_discover_gen; + runphys = smrt->smrt_phys_tgtmap != NULL; + runvirt = smrt->smrt_virt_tgtmap != NULL; + mutex_exit(&smrt->smrt_mutex); + if (runphys) + phys = smrt_phys_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen); + if (runvirt) + log = smrt_logvol_discover(smrt, SMRT_DISCOVER_TIMEOUT, gen); + mutex_enter(&smrt->smrt_mutex); + + if (phys != 0 || log != 0) { + if (!ddi_in_panic()) { + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; + } else { + panic("smrt_t %p failed to perform discovery after " + "a reset in panic context, unable to continue. " + "logvol: %d, phys: %d", smrt, log, phys); + } + } else { + if (!ddi_in_panic() && + smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUIRED) { + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_REQUIRED; + cv_broadcast(&smrt->smrt_cv_finishq); + } + + if (ddi_in_panic()) { + smrt_discover_panic_check(smrt); + } + } + smrt->smrt_status &= ~SMRT_CTLR_DISCOVERY_RUNNING; + if (smrt->smrt_status & SMRT_CTLR_DISCOVERY_REQUESTED) + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; + mutex_exit(&smrt->smrt_mutex); +} + +/* + * Request discovery, which is always run via a taskq. + */ +void +smrt_discover_request(smrt_t *smrt) +{ + boolean_t run; + ASSERT(MUTEX_HELD(&smrt->smrt_mutex)); + + if (ddi_in_panic()) { + smrt_discover(smrt); + return; + } + + run = (smrt->smrt_status & SMRT_CTLR_DISCOVERY_MASK) == 0; + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_REQUESTED; + if (run && ddi_taskq_dispatch(smrt->smrt_discover_taskq, + smrt_discover, smrt, DDI_NOSLEEP) != DDI_SUCCESS) { + smrt->smrt_status |= SMRT_CTLR_DISCOVERY_PERIODIC; + smrt->smrt_stats.smrts_discovery_tq_errors++; + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c new file mode 100644 index 0000000000..1b3d7b2602 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_ciss_simple.c @@ -0,0 +1,282 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +uint_t +smrt_isr_hw_simple(caddr_t arg1, caddr_t arg2) +{ + _NOTE(ARGUNUSED(arg2)) + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + smrt_t *smrt = (smrt_t *)arg1; + uint32_t isr = smrt_get32(smrt, CISS_I2O_INTERRUPT_STATUS); + hrtime_t now = gethrtime(); + + mutex_enter(&smrt->smrt_mutex); + if (!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)) { + smrt->smrt_stats.smrts_unclaimed_interrupts++; + smrt->smrt_last_interrupt_unclaimed = now; + + /* + * We should not be receiving interrupts from the controller + * while the driver is not running. + */ + mutex_exit(&smrt->smrt_mutex); + return (DDI_INTR_UNCLAIMED); + } + + /* + * Check to see if this interrupt came from the device: + */ + if ((isr & CISS_ISR_BIT_SIMPLE_INTR) == 0) { + smrt->smrt_stats.smrts_unclaimed_interrupts++; + smrt->smrt_last_interrupt_unclaimed = now; + + /* + * Check to see if the firmware has come to rest. If it has, + * this routine will panic the system. + */ + smrt_lockup_check(smrt); + + mutex_exit(&smrt->smrt_mutex); + return (DDI_INTR_UNCLAIMED); + } + + smrt->smrt_stats.smrts_claimed_interrupts++; + smrt->smrt_last_interrupt_claimed = now; + + /* + * The interrupt was from our controller, so collect any pending + * command completions. + */ + smrt_retrieve_simple(smrt); + + /* + * Process any commands in the completion queue. + */ + smrt_process_finishq(smrt); + + mutex_exit(&smrt->smrt_mutex); + return (DDI_INTR_CLAIMED); +} + +/* + * Read tags and process completion of the associated command until the supply + * of tags is exhausted. + */ +void +smrt_retrieve_simple(smrt_t *smrt) +{ + uint32_t opq; + uint32_t none = 0xffffffff; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + while ((opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q)) != none) { + uint32_t tag = CISS_OPQ_READ_TAG(opq); + smrt_command_t *smcm; + + if ((smcm = smrt_lookup_inflight(smrt, tag)) == NULL) { + dev_err(smrt->smrt_dip, CE_WARN, "spurious tag %x", + tag); + continue; + } + + avl_remove(&smrt->smrt_inflight, smcm); + smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT; + if (CISS_OPQ_READ_ERROR(opq) != 0) { + smcm->smcm_status |= SMRT_CMD_STATUS_ERROR; + } + smcm->smcm_time_complete = gethrtime(); + + /* + * Push this command onto the completion queue. + */ + list_insert_tail(&smrt->smrt_finishq, smcm); + } +} + +/* + * Submit a command to the controller by posting it to the Inbound Post Queue + * Register. + */ +void +smrt_submit_simple(smrt_t *smrt, smrt_command_t *smcm) +{ + smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd); +} + +/* + * Submit a command to the controller by posting it to the Inbound Post Queue + * Register. Immediately begin polling on the completion of that command. + * + * NOTE: This function is for controller initialisation only. It discards + * completions of commands other than the expected command as spurious, and + * will not interact correctly with the rest of the driver once it is running. + */ +int +smrt_preinit_command_simple(smrt_t *smrt, smrt_command_t *smcm) +{ + /* + * The controller must be initialised to use the Simple Transport + * Method, but not be marked RUNNING. The command to process must be a + * PREINIT command with the expected tag number, marked for polling. 
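+ *
+ * The expected calling pattern, as used by smrt_ctlr_identify(), is
+ * roughly the following sketch, where "len" and "timeout" stand in for
+ * caller-provided values:
+ *
+ *     smcm = smrt_command_alloc_preinit(smrt, len, KM_SLEEP);
+ *     ... construct the request in smcm->smcm_va_cmd ...
+ *     smcm->smcm_status |= SMRT_CMD_STATUS_POLLED;
+ *     smcm->smcm_expiry = gethrtime() + timeout * NANOSEC;
+ *     if (smrt_preinit_command_simple(smrt, smcm) != 0) {
+ *         ... timed out; the command must be leaked, not freed ...
+ *     }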
+ */ + VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE); + VERIFY(!(smrt->smrt_status & SMRT_CTLR_STATUS_RUNNING)); + VERIFY(smcm->smcm_type == SMRT_CMDTYPE_PREINIT); + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_POLLED); + VERIFY3U(smcm->smcm_tag, ==, SMRT_PRE_TAG_NUMBER); + + /* + * Submit this command to the controller. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_INFLIGHT; + smrt_put32(smrt, CISS_I2O_INBOUND_POST_Q, smcm->smcm_pa_cmd); + + /* + * Poll the controller for completions until we see the command we just + * sent, or the timeout expires. + */ + for (;;) { + uint32_t none = 0xffffffff; + uint32_t opq = smrt_get32(smrt, CISS_I2O_OUTBOUND_POST_Q); + uint32_t tag; + + if (smcm->smcm_expiry != 0) { + /* + * This command has an expiry time. Check to see + * if it has already passed: + */ + if (smcm->smcm_expiry < gethrtime()) { + return (ETIMEDOUT); + } + } + + if (opq == none) { + delay(drv_usectohz(10 * 1000)); + continue; + } + + if ((tag = CISS_OPQ_READ_TAG(opq)) != SMRT_PRE_TAG_NUMBER) { + dev_err(smrt->smrt_dip, CE_WARN, "unexpected tag 0x%x" + " completed during driver init", tag); + delay(drv_usectohz(10 * 1000)); + continue; + } + + smcm->smcm_status &= ~SMRT_CMD_STATUS_INFLIGHT; + if (CISS_OPQ_READ_ERROR(opq) != 0) { + smcm->smcm_status |= SMRT_CMD_STATUS_ERROR; + } + smcm->smcm_time_complete = gethrtime(); + smcm->smcm_status |= SMRT_CMD_STATUS_POLL_COMPLETE; + + return (0); + } +} + +int +smrt_ctlr_init_simple(smrt_t *smrt) +{ + VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_UNKNOWN); + + if (smrt_cfgtbl_transport_has_support(smrt, + CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_SIMPLE; + + /* + * Disable device interrupts while we are setting up. + */ + smrt_intr_set(smrt, B_FALSE); + + if ((smrt->smrt_maxcmds = smrt_ctlr_get_cmdsoutmax(smrt)) == 0) { + dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding " + "commands set to zero"); + return (DDI_FAILURE); + } + + /* + * Determine the number of Scatter/Gather List entries this controller + * supports. The maximum number we allow is CISS_MAXSGENTRIES: the + * number of elements in the static struct we use for command + * submission. + */ + if ((smrt->smrt_sg_cnt = smrt_ctlr_get_maxsgelements(smrt)) == 0) { + /* + * The CISS specification states that if this value is + * zero, we should assume a value of 31 for compatibility + * with older firmware. + */ + smrt->smrt_sg_cnt = CISS_SGCNT_FALLBACK; + + } else if (smrt->smrt_sg_cnt > CISS_MAXSGENTRIES) { + /* + * If the controller supports more than we have allocated, + * just cap the count at the allocation size. + */ + smrt->smrt_sg_cnt = CISS_MAXSGENTRIES; + } + + /* + * Zero the upper 32 bits of the address in the Controller. + */ + ddi_put32(smrt->smrt_ct_handle, &smrt->smrt_ct->Upper32Addr, 0); + + /* + * Set the Transport Method and flush the changes to the + * Configuration Table. + */ + smrt_cfgtbl_transport_set(smrt, CISS_CFGTBL_XPORT_SIMPLE); + if (smrt_cfgtbl_flush(smrt) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + if (smrt_cfgtbl_transport_confirm(smrt, + CISS_CFGTBL_XPORT_SIMPLE) != DDI_SUCCESS) { + return (DDI_FAILURE); + } + + /* + * Check the outstanding command cap a second time now that we have + * flushed out the new Transport Method. This is entirely defensive; + * we do not expect this value to change. 
+ */ + uint32_t check_again = smrt_ctlr_get_cmdsoutmax(smrt); + if (check_again != smrt->smrt_maxcmds) { + dev_err(smrt->smrt_dip, CE_WARN, "maximum outstanding commands " + "changed during initialisation (was %u, now %u)", + smrt->smrt_maxcmds, check_again); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +void +smrt_ctlr_teardown_simple(smrt_t *smrt) +{ + VERIFY(smrt->smrt_ctlr_mode == SMRT_CTLR_MODE_SIMPLE); + + /* + * Due to the nominal simplicity of the simple mode, we have no + * particular teardown to perform as we do not allocate anything + * on the way up. + */ + smrt->smrt_ctlr_mode = SMRT_CTLR_MODE_UNKNOWN; +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c new file mode 100644 index 0000000000..edcbfa65e2 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_commands.c @@ -0,0 +1,362 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + + +static ddi_dma_attr_t smrt_command_dma_attr = { + .dma_attr_version = DMA_ATTR_V0, + .dma_attr_addr_lo = 0x00000000, + .dma_attr_addr_hi = 0xFFFFFFFF, + .dma_attr_count_max = 0x00FFFFFF, + .dma_attr_align = 0x20, + .dma_attr_burstsizes = 0x20, + .dma_attr_minxfer = DMA_UNIT_8, + .dma_attr_maxxfer = 0xFFFFFFFF, + .dma_attr_seg = 0x0000FFFF, + .dma_attr_sgllen = 1, + .dma_attr_granular = 512, + .dma_attr_flags = 0 +}; + +/* + * These device access attributes are for command block allocation, where we do + * not use any of the structured byte swapping facilities. + */ +static ddi_device_acc_attr_t smrt_command_dev_attr = { + .devacc_attr_version = DDI_DEVICE_ATTR_V0, + .devacc_attr_endian_flags = DDI_NEVERSWAP_ACC, + .devacc_attr_dataorder = DDI_STRICTORDER_ACC, + .devacc_attr_access = 0 +}; + + +static void smrt_contig_free(smrt_dma_t *); + + +static int +smrt_check_command_type(smrt_command_type_t type) +{ + /* + * Note that we leave out the default case in order to utilise + * compiler warnings about missed enum values. + */ + switch (type) { + case SMRT_CMDTYPE_ABORTQ: + case SMRT_CMDTYPE_SCSA: + case SMRT_CMDTYPE_INTERNAL: + case SMRT_CMDTYPE_PREINIT: + case SMRT_CMDTYPE_EVENT: + return (type); + } + + panic("unexpected command type"); + /* LINTED: E_FUNC_NO_RET_VAL */ +} + +static int +smrt_contig_alloc(smrt_t *smrt, smrt_dma_t *smdma, size_t sz, int kmflags, + void **vap, uint32_t *pap) +{ + caddr_t va; + int rv; + dev_info_t *dip = smrt->smrt_dip; + int (*dma_wait)(caddr_t) = (kmflags == KM_SLEEP) ? DDI_DMA_SLEEP : + DDI_DMA_DONTWAIT; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + + /* + * Ensure we don't try to allocate a second time using the same + * tracking object. 
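+ * The smdma_level bit field records how far the allocation has
+ * progressed (handle allocated, memory allocated, handle bound);
+ * smrt_contig_free() unwinds whichever of these levels were reached, so
+ * a partially constructed object can always be torn down safely.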
+ */ + VERIFY0(smdma->smdma_level); + + if ((rv = ddi_dma_alloc_handle(dip, &smrt_command_dma_attr, + dma_wait, NULL, &smdma->smdma_dma_handle)) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "DMA handle allocation failed (%x)", + rv); + goto fail; + } + smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_ALLOC; + + if ((rv = ddi_dma_mem_alloc(smdma->smdma_dma_handle, sz, + &smrt_command_dev_attr, DDI_DMA_CONSISTENT, dma_wait, NULL, + &va, &smdma->smdma_real_size, &smdma->smdma_acc_handle)) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "DMA memory allocation failed (%x)", rv); + goto fail; + } + smdma->smdma_level |= SMRT_DMALEVEL_MEMORY_ALLOC; + + if ((rv = ddi_dma_addr_bind_handle(smdma->smdma_dma_handle, + NULL, va, smdma->smdma_real_size, + DDI_DMA_CONSISTENT | DDI_DMA_RDWR, dma_wait, NULL, + smdma->smdma_dma_cookies, &smdma->smdma_dma_ncookies)) != + DDI_DMA_MAPPED) { + dev_err(dip, CE_WARN, "DMA handle bind failed (%x)", rv); + goto fail; + } + smdma->smdma_level |= SMRT_DMALEVEL_HANDLE_BOUND; + + VERIFY3U(smdma->smdma_dma_ncookies, ==, 1); + *pap = smdma->smdma_dma_cookies[0].dmac_address; + *vap = (void *)va; + return (DDI_SUCCESS); + +fail: + *vap = NULL; + *pap = 0; + smrt_contig_free(smdma); + return (DDI_FAILURE); +} + +static void +smrt_contig_free(smrt_dma_t *smdma) +{ + if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_BOUND) { + VERIFY3U(ddi_dma_unbind_handle(smdma->smdma_dma_handle), ==, + DDI_SUCCESS); + + smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_BOUND; + } + + if (smdma->smdma_level & SMRT_DMALEVEL_MEMORY_ALLOC) { + ddi_dma_mem_free(&smdma->smdma_acc_handle); + + smdma->smdma_level &= ~SMRT_DMALEVEL_MEMORY_ALLOC; + } + + if (smdma->smdma_level & SMRT_DMALEVEL_HANDLE_ALLOC) { + ddi_dma_free_handle(&smdma->smdma_dma_handle); + + smdma->smdma_level &= ~SMRT_DMALEVEL_HANDLE_ALLOC; + } + + VERIFY(smdma->smdma_level == 0); + bzero(smdma, sizeof (*smdma)); +} + +static smrt_command_t * +smrt_command_alloc_impl(smrt_t *smrt, smrt_command_type_t type, int kmflags) +{ + smrt_command_t *smcm; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + + if ((smcm = kmem_zalloc(sizeof (*smcm), kmflags)) == NULL) { + return (NULL); + } + + smcm->smcm_ctlr = smrt; + smcm->smcm_type = smrt_check_command_type(type); + + /* + * Allocate a single contiguous chunk of memory for the command block + * (smcm_va_cmd) and the error information block (smcm_va_err). The + * physical address of each block should be 32-byte aligned. + */ + size_t contig_size = 0; + contig_size += P2ROUNDUP_TYPED(sizeof (CommandList_t), 32, size_t); + + size_t errorinfo_offset = contig_size; + contig_size += P2ROUNDUP_TYPED(sizeof (ErrorInfo_t), 32, size_t); + + if (smrt_contig_alloc(smrt, &smcm->smcm_contig, contig_size, + kmflags, (void **)&smcm->smcm_va_cmd, &smcm->smcm_pa_cmd) != + DDI_SUCCESS) { + kmem_free(smcm, sizeof (*smcm)); + return (NULL); + } + + smcm->smcm_va_err = (void *)((caddr_t)smcm->smcm_va_cmd + + errorinfo_offset); + smcm->smcm_pa_err = smcm->smcm_pa_cmd + errorinfo_offset; + + /* + * Ensure we asked for, and received, the correct physical alignment: + */ + VERIFY0(smcm->smcm_pa_cmd & 0x1f); + VERIFY0(smcm->smcm_pa_err & 0x1f); + + /* + * Populate Fields. 
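+ *
+ * The contiguous allocation made above is laid out, conceptually, as:
+ *
+ *     +0x00                                     CommandList_t
+ *     +P2ROUNDUP(sizeof (CommandList_t), 32)    ErrorInfo_t
+ *
+ * so the error descriptor in the command block simply points at the
+ * second region.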
+ */ + bzero(smcm->smcm_va_cmd, contig_size); + smcm->smcm_va_cmd->ErrDesc.Addr = smcm->smcm_pa_err; + smcm->smcm_va_cmd->ErrDesc.Len = sizeof (ErrorInfo_t); + + return (smcm); +} + +smrt_command_t * +smrt_command_alloc_preinit(smrt_t *smrt, size_t datasize, int kmflags) +{ + smrt_command_t *smcm; + + if ((smcm = smrt_command_alloc_impl(smrt, SMRT_CMDTYPE_PREINIT, + kmflags)) == NULL) { + return (NULL); + } + + /* + * Note that most driver infrastructure has not been initialised at + * this time. All commands are submitted to the controller serially, + * using a pre-specified tag, and are not attached to the command + * tracking list. + */ + smcm->smcm_tag = SMRT_PRE_TAG_NUMBER; + smcm->smcm_va_cmd->Header.Tag.tag_value = SMRT_PRE_TAG_NUMBER; + + if (smrt_command_attach_internal(smrt, smcm, datasize, kmflags) != 0) { + smrt_command_free(smcm); + return (NULL); + } + + return (smcm); +} + +smrt_command_t * +smrt_command_alloc(smrt_t *smrt, smrt_command_type_t type, int kmflags) +{ + smrt_command_t *smcm; + + VERIFY(type != SMRT_CMDTYPE_PREINIT); + + if ((smcm = smrt_command_alloc_impl(smrt, type, kmflags)) == NULL) { + return (NULL); + } + + /* + * Insert into the per-controller command list. + */ + mutex_enter(&smrt->smrt_mutex); + list_insert_tail(&smrt->smrt_commands, smcm); + mutex_exit(&smrt->smrt_mutex); + + return (smcm); +} + +int +smrt_command_attach_internal(smrt_t *smrt, smrt_command_t *smcm, size_t len, + int kmflags) +{ + smrt_command_internal_t *smcmi; + + VERIFY(kmflags == KM_SLEEP || kmflags == KM_NOSLEEP); + VERIFY3U(len, <=, UINT32_MAX); + + if ((smcmi = kmem_zalloc(sizeof (*smcmi), kmflags)) == NULL) { + return (ENOMEM); + } + + if (smrt_contig_alloc(smrt, &smcmi->smcmi_contig, len, kmflags, + &smcmi->smcmi_va, &smcmi->smcmi_pa) != DDI_SUCCESS) { + kmem_free(smcmi, sizeof (*smcmi)); + return (ENOMEM); + } + + bzero(smcmi->smcmi_va, smcmi->smcmi_len); + + smcm->smcm_internal = smcmi; + + smcm->smcm_va_cmd->SG[0].Addr = smcmi->smcmi_pa; + smcm->smcm_va_cmd->SG[0].Len = (uint32_t)len; + smcm->smcm_va_cmd->Header.SGList = 1; + smcm->smcm_va_cmd->Header.SGTotal = 1; + + return (0); +} + +void +smrt_command_reuse(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + mutex_enter(&smrt->smrt_mutex); + + /* + * Make sure the command is not currently inflight, then + * reset the command status. + */ + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); + smcm->smcm_status = SMRT_CMD_STATUS_REUSED; + + /* + * Ensure we are not trying to reuse a command that is in the finish or + * abort queue. + */ + VERIFY(!list_link_active(&smcm->smcm_link_abort)); + VERIFY(!list_link_active(&smcm->smcm_link_finish)); + + /* + * Clear the previous tag value. + */ + smcm->smcm_tag = 0; + smcm->smcm_va_cmd->Header.Tag.tag_value = 0; + + mutex_exit(&smrt->smrt_mutex); +} + +void +smrt_command_free(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + + /* + * Ensure the object we are about to free is not currently in the + * inflight AVL. + */ + VERIFY(!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)); + + if (smcm->smcm_internal != NULL) { + smrt_command_internal_t *smcmi = smcm->smcm_internal; + + smrt_contig_free(&smcmi->smcmi_contig); + kmem_free(smcmi, sizeof (*smcmi)); + } + + smrt_contig_free(&smcm->smcm_contig); + + if (smcm->smcm_type != SMRT_CMDTYPE_PREINIT) { + mutex_enter(&smrt->smrt_mutex); + + /* + * Ensure we are not trying to free a command that is in the + * finish or abort queue. 
+ */ + VERIFY(!list_link_active(&smcm->smcm_link_abort)); + VERIFY(!list_link_active(&smcm->smcm_link_finish)); + + list_remove(&smrt->smrt_commands, smcm); + + mutex_exit(&smrt->smrt_mutex); + } + + kmem_free(smcm, sizeof (*smcm)); +} + +smrt_command_t * +smrt_lookup_inflight(smrt_t *smrt, uint32_t tag) +{ + smrt_command_t srch; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + bzero(&srch, sizeof (srch)); + srch.smcm_tag = tag; + + return (avl_find(&smrt->smrt_inflight, &srch, NULL)); +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c new file mode 100644 index 0000000000..9e27448b68 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_device.c @@ -0,0 +1,238 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * We must locate what the CISS specification describes as the "I2O + * registers". The Intelligent I/O (I2O) Architecture Specification describes + * this somewhat more coherently as "the memory region specified by the first + * base address configuration register indicating memory space (offset 10h, + * 14h, and so forth)". + */ +static int +smrt_locate_bar(pci_regspec_t *regs, unsigned nregs, + unsigned *i2o_bar) +{ + /* + * Locate the first memory-mapped BAR: + */ + for (unsigned i = 0; i < nregs; i++) { + unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK; + + if (type == PCI_ADDR_MEM32 || type == PCI_ADDR_MEM64) { + *i2o_bar = i; + return (DDI_SUCCESS); + } + } + + return (DDI_FAILURE); +} + +static int +smrt_locate_cfgtbl(smrt_t *smrt, pci_regspec_t *regs, unsigned nregs, + unsigned *ct_bar, uint32_t *baseaddr) +{ + uint32_t cfg_offset, mem_offset; + unsigned want_type; + uint32_t want_bar; + + cfg_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_CFG_OFFSET); + mem_offset = smrt_get32(smrt, CISS_I2O_CFGTBL_MEM_OFFSET); + + VERIFY3U(cfg_offset, !=, 0xffffffff); + VERIFY3U(mem_offset, !=, 0xffffffff); + + /* + * Locate the Configuration Table. Three different values read + * from two I2O registers allow us to determine the location: + * - the correct PCI BAR offset is in the low 16 bits of + * CISS_I2O_CFGTBL_CFG_OFFSET + * - bit 16 is 0 for a 32-bit space, and 1 for 64-bit + * - the memory offset from the base of this BAR is + * in CISS_I2O_CFGTBL_MEM_OFFSET + */ + want_bar = (cfg_offset & 0xffff); + want_type = (cfg_offset & (1UL << 16)) ? PCI_ADDR_MEM64 : + PCI_ADDR_MEM32; + + DTRACE_PROBE4(locate_cfgtbl, uint32_t, want_bar, unsigned, + want_type, uint32_t, cfg_offset, uint32_t, mem_offset); + + for (unsigned i = 0; i < nregs; i++) { + unsigned type = regs[i].pci_phys_hi & PCI_ADDR_MASK; + unsigned bar = PCI_REG_REG_G(regs[i].pci_phys_hi); + + if (type != PCI_ADDR_MEM32 && type != PCI_ADDR_MEM64) { + continue; + } + + if (bar == want_bar) { + *ct_bar = i; + *baseaddr = mem_offset; + return (DDI_SUCCESS); + } + } + + return (DDI_FAILURE); +} + +/* + * Determine the PCI vendor and device ID which is a proxy for which generation + * of controller we're working with. 
+ */ +static int +smrt_identify_device(smrt_t *smrt) +{ + ddi_acc_handle_t pci_hdl; + + if (pci_config_setup(smrt->smrt_dip, &pci_hdl) != DDI_SUCCESS) + return (DDI_FAILURE); + + smrt->smrt_pci_vendor = pci_config_get16(pci_hdl, PCI_CONF_VENID); + smrt->smrt_pci_device = pci_config_get16(pci_hdl, PCI_CONF_DEVID); + + pci_config_teardown(&pci_hdl); + + return (DDI_SUCCESS); +} + +static int +smrt_map_device(smrt_t *smrt) +{ + pci_regspec_t *regs; + uint_t regslen, nregs; + dev_info_t *dip = smrt->smrt_dip; + int r = DDI_FAILURE; + + /* + * Get the list of PCI registers from the DDI property "regs": + */ + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "reg", (int **)®s, ®slen) != DDI_PROP_SUCCESS) { + dev_err(dip, CE_WARN, "could not load \"reg\" DDI prop"); + return (DDI_FAILURE); + } + nregs = regslen * sizeof (int) / sizeof (pci_regspec_t); + + if (smrt_locate_bar(regs, nregs, &smrt->smrt_i2o_bar) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "did not find any memory BARs"); + goto out; + } + + /* + * Map enough of the I2O memory space to enable us to talk to the + * device. + */ + if (ddi_regs_map_setup(dip, smrt->smrt_i2o_bar, &smrt->smrt_i2o_space, + CISS_I2O_MAP_BASE, CISS_I2O_MAP_LIMIT - CISS_I2O_MAP_BASE, + &smrt_dev_attributes, &smrt->smrt_i2o_handle) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "failed to map I2O registers"); + goto out; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_I2O_MAPPED; + + if (smrt_locate_cfgtbl(smrt, regs, nregs, &smrt->smrt_ct_bar, + &smrt->smrt_ct_baseaddr) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not find config table"); + goto out; + } + + /* + * Map the Configuration Table. + */ + if (ddi_regs_map_setup(dip, smrt->smrt_ct_bar, + (caddr_t *)&smrt->smrt_ct, smrt->smrt_ct_baseaddr, + sizeof (CfgTable_t), &smrt_dev_attributes, + &smrt->smrt_ct_handle) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not map config table"); + goto out; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_CFGTBL_MAPPED; + + r = DDI_SUCCESS; + +out: + ddi_prop_free(regs); + return (r); +} + +int +smrt_device_setup(smrt_t *smrt) +{ + /* + * Ensure that the controller is installed in such a fashion that it + * may become a DMA master. 
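+	 * Note that ddi_slaveonly(9F) returns DDI_SUCCESS when the device is
+	 * installed in a slave-only slot, which is why a "successful" return
+	 * is the failure case below.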
+ */ + if (ddi_slaveonly(smrt->smrt_dip) == DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "device cannot become DMA " + "master"); + return (DDI_FAILURE); + } + + if (smrt_identify_device(smrt) != DDI_SUCCESS) + goto fail; + + if (smrt_map_device(smrt) != DDI_SUCCESS) { + goto fail; + } + + return (DDI_SUCCESS); + +fail: + smrt_device_teardown(smrt); + return (DDI_FAILURE); +} + +void +smrt_device_teardown(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_CFGTBL_MAPPED) { + ddi_regs_map_free(&smrt->smrt_ct_handle); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_CFGTBL_MAPPED; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_I2O_MAPPED) { + ddi_regs_map_free(&smrt->smrt_i2o_handle); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_I2O_MAPPED; + } +} + +uint32_t +smrt_get32(smrt_t *smrt, offset_t off) +{ + VERIFY3S(off, >=, CISS_I2O_MAP_BASE); + VERIFY3S(off, <, CISS_I2O_MAP_BASE + CISS_I2O_MAP_LIMIT); + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space + + (off - CISS_I2O_MAP_BASE)); + + return (ddi_get32(smrt->smrt_i2o_handle, addr)); +} + +void +smrt_put32(smrt_t *smrt, offset_t off, uint32_t val) +{ + VERIFY3S(off, >=, CISS_I2O_MAP_BASE); + VERIFY3S(off, <, CISS_I2O_MAP_BASE + CISS_I2O_MAP_LIMIT); + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + uint32_t *addr = (uint32_t *)(smrt->smrt_i2o_space + + (off - CISS_I2O_MAP_BASE)); + + ddi_put32(smrt->smrt_i2o_handle, addr, val); +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c new file mode 100644 index 0000000000..8f082ffc9c --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_hba.c @@ -0,0 +1,1457 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * The controller is not allowed to attach. + */ +static int +smrt_ctrl_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + return (DDI_FAILURE); +} + +/* + * The controller is not allowed to send packets. + */ +static int +smrt_ctrl_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt) +{ + return (TRAN_BADPKT); +} + +static boolean_t +smrt_logvol_parse(const char *ua, uint_t *targp) +{ + long targ, lun; + const char *comma; + char *eptr; + + comma = strchr(ua, ','); + if (comma == NULL) { + return (B_FALSE); + } + + /* + * We expect the target number for a logical unit number to be zero for + * a logical volume. 
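+	 * Both fields are parsed as hexadecimal, so a unit address such as
+	 * "c,0" names logical volume 12 with the required LUN of zero; any
+	 * other LUN value is rejected.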
+ */ + if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' || + lun != 0) { + return (B_FALSE); + } + + if (ddi_strtol(ua, &eptr, 16, &targ) != 0 || eptr != comma || + targ < 0 || targ >= SMRT_MAX_LOGDRV) { + return (B_FALSE); + } + + *targp = (uint_t)targ; + + return (B_TRUE); +} + +static int +smrt_logvol_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip)) + + smrt_volume_t *smlv; + smrt_target_t *smtg; + const char *ua; + uint_t targ; + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + dev_info_t *dip = smrt->smrt_dip; + + /* + * The unit address comes in the form of 'target,lun'. We expect the + * lun to be zero. The target is what we set when we added it to the + * target map earlier. + */ + ua = scsi_device_unit_address(sd); + if (ua == NULL) { + return (DDI_FAILURE); + } + + if (!smrt_logvol_parse(ua, &targ)) { + return (DDI_FAILURE); + } + + if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) { + dev_err(dip, CE_WARN, "could not allocate target object " + "due to memory exhaustion"); + return (DDI_FAILURE); + } + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + /* + * We are detaching. Do not accept any more requests to + * attach targets from the framework. + */ + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + /* + * Look for a logical volume for the SCSI unit address of this target. + */ + if ((smlv = smrt_logvol_lookup_by_id(smrt, targ)) == NULL) { + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + smtg->smtg_lun.smtg_vol = smlv; + smtg->smtg_addr = &smlv->smlv_addr; + smtg->smtg_physical = B_FALSE; + list_insert_tail(&smlv->smlv_targets, smtg); + + /* + * Link this target object to the controller: + */ + smtg->smtg_ctlr = smrt; + list_insert_tail(&smrt->smrt_targets, smtg); + + smtg->smtg_scsi_dev = sd; + VERIFY(sd->sd_dev == tgt_dip); + + scsi_device_hba_private_set(sd, smtg); + + mutex_exit(&smrt->smrt_mutex); + return (DDI_SUCCESS); +} + +static void +smrt_logvol_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip, tgt_dip)) + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + smrt_target_t *smtg = scsi_device_hba_private_get(sd); + smrt_volume_t *smlv = smtg->smtg_lun.smtg_vol; + + VERIFY(smtg->smtg_scsi_dev == sd); + VERIFY(smtg->smtg_physical == B_FALSE); + + mutex_enter(&smrt->smrt_mutex); + list_remove(&smlv->smlv_targets, smtg); + list_remove(&smrt->smrt_targets, smtg); + + scsi_device_hba_private_set(sd, NULL); + + mutex_exit(&smrt->smrt_mutex); + + kmem_free(smtg, sizeof (*smtg)); +} + +static int +smrt_phys_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip)) + + smrt_target_t *smtg; + smrt_physical_t *smpt; + const char *ua, *comma; + char *eptr; + long lun; + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + dev_info_t *dip = smrt->smrt_dip; + + /* + * The unit address comes in the form of 'target,lun'. We expect the + * lun to be zero. The target is what we set when we added it to the + * target map earlier. + */ + ua = scsi_device_unit_address(sd); + if (ua == NULL) + return (DDI_FAILURE); + + comma = strchr(ua, ','); + if (comma == NULL) { + return (DDI_FAILURE); + } + + /* + * Confirm the LUN is zero. 
We may want to instead check the scsi + * 'lun'/'lun64' property or do so in addition to this logic. + */ + if (ddi_strtol(comma + 1, &eptr, 16, &lun) != 0 || *eptr != '\0' || + lun != 0) { + return (DDI_FAILURE); + } + + if ((smtg = kmem_zalloc(sizeof (*smtg), KM_NOSLEEP)) == NULL) { + dev_err(dip, CE_WARN, "could not allocate target object " + "due to memory exhaustion"); + return (DDI_FAILURE); + } + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_status & SMRT_CTLR_STATUS_DETACHING) { + /* + * We are detaching. Do not accept any more requests to + * attach targets from the framework. + */ + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + + /* + * Look for a physical target based on the unit address of the target + * (which will encode its WWN and LUN). + */ + smpt = smrt_phys_lookup_by_ua(smrt, ua); + if (smpt == NULL) { + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); + return (DDI_FAILURE); + } + + smtg->smtg_scsi_dev = sd; + smtg->smtg_physical = B_TRUE; + smtg->smtg_lun.smtg_phys = smpt; + list_insert_tail(&smpt->smpt_targets, smtg); + smtg->smtg_addr = &smpt->smpt_addr; + + /* + * Link this target object to the controller: + */ + smtg->smtg_ctlr = smrt; + list_insert_tail(&smrt->smrt_targets, smtg); + + VERIFY(sd->sd_dev == tgt_dip); + smtg->smtg_scsi_dev = sd; + + scsi_device_hba_private_set(sd, smtg); + mutex_exit(&smrt->smrt_mutex); + + return (DDI_SUCCESS); +} + +static void +smrt_phys_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + _NOTE(ARGUNUSED(hba_dip, tgt_dip)) + + smrt_t *smrt = (smrt_t *)hba_tran->tran_hba_private; + smrt_target_t *smtg = scsi_device_hba_private_get(sd); + smrt_physical_t *smpt = smtg->smtg_lun.smtg_phys; + + VERIFY(smtg->smtg_scsi_dev == sd); + VERIFY(smtg->smtg_physical == B_TRUE); + + mutex_enter(&smrt->smrt_mutex); + list_remove(&smpt->smpt_targets, smtg); + list_remove(&smrt->smrt_targets, smtg); + + scsi_device_hba_private_set(sd, NULL); + mutex_exit(&smrt->smrt_mutex); + kmem_free(smtg, sizeof (*smtg)); +} + +/* + * This function is called when the SCSI framework has allocated a packet and + * our private per-packet object. + * + * We choose not to have the framework pre-allocate memory for the CDB. + * Instead, we will make available the CDB area in the controller command block + * itself. + * + * Status block memory is allocated by the framework because we passed + * SCSI_HBA_TRAN_SCB to scsi_hba_attach_setup(9F). + */ +static int +smrt_tran_setup_pkt(struct scsi_pkt *pkt, int (*callback)(caddr_t), + caddr_t arg) +{ + _NOTE(ARGUNUSED(arg)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_t *smcm; + smrt_command_scsa_t *smcms; + int kmflags = callback == SLEEP_FUNC ? KM_SLEEP : KM_NOSLEEP; + + sd = scsi_address_device(&pkt->pkt_address); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private; + + /* + * Check that we have enough space in the command object for the + * request from the target driver: + */ + if (pkt->pkt_cdblen > CISS_CDBLEN) { + /* + * The CDB member of the Request Block of a controller + * command is fixed at 16 bytes. 
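+		 * (CISS_CDBLEN.)  Rather than copying CDBs around, the setup
+		 * below points pkt_cdbp directly at Request.CDB, so the
+		 * target driver builds its CDB in the command block itself;
+		 * anything longer than 16 bytes simply cannot be expressed.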
+ */ + dev_err(smrt->smrt_dip, CE_WARN, "oversize CDB: had %u, " + "needed %u", CISS_CDBLEN, pkt->pkt_cdblen); + return (-1); + } + + /* + * Allocate our command block: + */ + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_SCSA, + kmflags)) == NULL) { + return (-1); + } + smcm->smcm_scsa = smcms; + smcms->smcms_command = smcm; + smcms->smcms_pkt = pkt; + + pkt->pkt_cdbp = &smcm->smcm_va_cmd->Request.CDB[0]; + smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen; + + smcm->smcm_target = smtg; + + return (0); +} + +static void +smrt_tran_teardown_pkt(struct scsi_pkt *pkt) +{ + smrt_command_scsa_t *smcms = (smrt_command_scsa_t *) + pkt->pkt_ha_private; + smrt_command_t *smcm = smcms->smcms_command; + + smrt_command_free(smcm); + + pkt->pkt_cdbp = NULL; +} + +static void +smrt_set_arq_data(struct scsi_pkt *pkt, uchar_t key) +{ + struct scsi_arq_status *sts; + + VERIFY3U(pkt->pkt_scblen, >=, sizeof (struct scsi_arq_status)); + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + sts = (struct scsi_arq_status *)(pkt->pkt_scbp); + bzero(sts, sizeof (*sts)); + + /* + * Mock up a CHECK CONDITION SCSI status for the original command: + */ + sts->sts_status.sts_chk = 1; + + /* + * Pretend that we successfully performed REQUEST SENSE: + */ + sts->sts_rqpkt_reason = CMD_CMPLT; + sts->sts_rqpkt_resid = 0; + sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_XFERRED_DATA; + sts->sts_rqpkt_statistics = 0; + + /* + * Return the key value we were provided in the fake sense data: + */ + sts->sts_sensedata.es_valid = 1; + sts->sts_sensedata.es_class = CLASS_EXTENDED_SENSE; + sts->sts_sensedata.es_key = key; + + pkt->pkt_state |= STATE_ARQ_DONE; +} + +/* + * When faking up a REPORT LUNS data structure, we simply report one LUN, LUN 0. + * We need 16 bytes for this, 4 for the size, 4 reserved bytes, and the 8 for + * the actual LUN. + */ +static void +smrt_fake_report_lun(smrt_command_t *smcm, struct scsi_pkt *pkt) +{ + size_t sz; + char resp[16]; + struct buf *bp; + + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD | + STATE_GOT_STATUS; + + /* + * Check to make sure this is valid. If reserved bits are set or if the + * mode is one other than 0x00, 0x01, 0x02, then it's an illegal + * request. + */ + if (pkt->pkt_cdbp[1] != 0 || pkt->pkt_cdbp[3] != 0 || + pkt->pkt_cdbp[4] != 0 || pkt->pkt_cdbp[5] != 0 || + pkt->pkt_cdbp[10] != 0 || pkt->pkt_cdbp[11] != 0 || + pkt->pkt_cdbp[2] > 0x2) { + smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST); + return; + } + + /* + * Construct the actual REPORT LUNS reply. We need to indicate a single + * LUN of all zeros. This means that the length needs to be 8 bytes, + * the size of the lun. Otherwise, the rest of this structure can be + * zeros. 
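+	 * For reference, the 16-byte response we build is laid out as:
+	 *
+	 *	bytes 0-3	LUN list length in bytes, big-endian (8 here)
+	 *	bytes 4-7	reserved
+	 *	bytes 8-15	the single LUN entry (all zeros for LUN 0)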
+ */ + bzero(resp, sizeof (resp)); + resp[3] = sizeof (scsi_lun_t); + + bp = scsi_pkt2bp(pkt); + sz = MIN(sizeof (resp), bp->b_bcount); + + bp_mapin(bp); + bcopy(resp, bp->b_un.b_addr, sz); + bp_mapout(bp); + pkt->pkt_state |= STATE_XFERRED_DATA; + pkt->pkt_resid = bp->b_bcount - sz; + if (pkt->pkt_scblen >= 1) { + pkt->pkt_scbp[0] = STATUS_GOOD; + } +} + +static int +smrt_tran_start(struct scsi_address *sa, struct scsi_pkt *pkt) +{ + _NOTE(ARGUNUSED(sa)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_scsa_t *smcms; + smrt_command_t *smcm; + int r; + + sd = scsi_address_device(&pkt->pkt_address); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + smcms = (smrt_command_scsa_t *)pkt->pkt_ha_private; + VERIFY(smcms != NULL); + smcm = smcms->smcms_command; + VERIFY(smcm != NULL); + + if (smcm->smcm_status & SMRT_CMD_STATUS_TRAN_START) { + /* + * This is a retry of a command that has already been + * used once. Assign it a new tag number. + */ + smrt_command_reuse(smcm); + } + smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_START; + + /* + * The sophisticated firmware in this controller cannot possibly bear + * the following SCSI commands. It appears to return a response with + * the status STATUS_ACA_ACTIVE (0x30), which is not something we + * expect. Instead, fake up a failure response. + */ + switch (pkt->pkt_cdbp[0]) { + case SCMD_FORMAT: + case SCMD_LOG_SENSE_G1: + case SCMD_MODE_SELECT: + case SCMD_PERSISTENT_RESERVE_IN: + if (smtg->smtg_physical) { + break; + } + + smrt->smrt_stats.smrts_ignored_scsi_cmds++; + smcm->smcm_status |= SMRT_CMD_STATUS_TRAN_IGNORED; + + /* + * Mark the command as completed to the point where we + * received a SCSI status code: + */ + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_GOT_STATUS; + + /* + * Mock up sense data for an illegal request: + */ + smrt_set_arq_data(pkt, KEY_ILLEGAL_REQUEST); + + scsi_hba_pkt_comp(pkt); + return (TRAN_ACCEPT); + case SCMD_REPORT_LUNS: + /* + * The SMRT controller does not accept a REPORT LUNS command for + * logical volumes. As such, we need to fake up a REPORT LUNS + * response that has a single LUN, LUN 0. + */ + if (smtg->smtg_physical) { + break; + } + + smrt_fake_report_lun(smcm, pkt); + + scsi_hba_pkt_comp(pkt); + return (TRAN_ACCEPT); + default: + break; + } + + if (pkt->pkt_flags & FLAG_NOINTR) { + /* + * We must sleep and wait for the completion of this command. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + } + + /* + * Because we provide a tran_setup_pkt(9E) entrypoint, we must now + * set up the Scatter/Gather List in the Command to reflect any + * DMA resources passed to us by the framework. + */ + if (pkt->pkt_numcookies > smrt->smrt_sg_cnt) { + /* + * More DMA cookies than we are prepared to handle. 
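+		 * (smrt_sg_cnt is the number of scatter/gather entries we can
+		 * describe in a single command; each DMA cookie below is
+		 * turned into one SG entry, so anything beyond that limit
+		 * cannot be expressed to the controller.)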
+ */ + dev_err(smrt->smrt_dip, CE_WARN, "too many DMA cookies (got %u;" + " expected %u)", pkt->pkt_numcookies, smrt->smrt_sg_cnt); + return (TRAN_BADPKT); + } + smcm->smcm_va_cmd->Header.SGList = pkt->pkt_numcookies; + smcm->smcm_va_cmd->Header.SGTotal = pkt->pkt_numcookies; + for (unsigned i = 0; i < pkt->pkt_numcookies; i++) { + smcm->smcm_va_cmd->SG[i].Addr = + LE_64(pkt->pkt_cookies[i].dmac_laddress); + smcm->smcm_va_cmd->SG[i].Len = + LE_32(pkt->pkt_cookies[i].dmac_size); + } + + /* + * Copy logical volume address from the target object: + */ + smcm->smcm_va_cmd->Header.LUN = *smcm->smcm_target->smtg_addr; + + /* + * Initialise the command block. + */ + smcm->smcm_va_cmd->Request.CDBLen = pkt->pkt_cdblen; + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Timeout = LE_16(pkt->pkt_time); + if (pkt->pkt_numcookies > 0) { + /* + * There are DMA resources; set the transfer direction + * appropriately: + */ + if (pkt->pkt_dma_flags & DDI_DMA_READ) { + smcm->smcm_va_cmd->Request.Type.Direction = + CISS_XFER_READ; + } else if (pkt->pkt_dma_flags & DDI_DMA_WRITE) { + smcm->smcm_va_cmd->Request.Type.Direction = + CISS_XFER_WRITE; + } else { + smcm->smcm_va_cmd->Request.Type.Direction = + CISS_XFER_NONE; + } + } else { + /* + * No DMA resources means no transfer. + */ + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_NONE; + } + + /* + * Initialise the SCSI packet as described in tran_start(9E). We will + * progressively update these fields as the command moves through the + * submission and completion states. + */ + pkt->pkt_resid = 0; + pkt->pkt_reason = CMD_CMPLT; + pkt->pkt_statistics = 0; + pkt->pkt_state = 0; + + /* + * If this SCSI packet has a timeout, configure an appropriate + * expiry time: + */ + if (pkt->pkt_time != 0) { + smcm->smcm_expiry = gethrtime() + pkt->pkt_time * NANOSEC; + } + + /* + * Submit the command to the controller. + */ + mutex_enter(&smrt->smrt_mutex); + + /* + * If we're dumping, there's a chance that the target we're talking to + * could have ended up disappearing during the process of discovery. If + * this target is part of the dump device, we check here and return that + * we hit a fatal error. + */ + if (ddi_in_panic() && smtg->smtg_gone) { + mutex_exit(&smrt->smrt_mutex); + + dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed: target " + "%s is gone, it did not come back after post-panic reset " + "device discovery", scsi_device_unit_address(sd)); + + return (TRAN_FATAL_ERROR); + } + + smrt->smrt_stats.smrts_tran_starts++; + if ((r = smrt_submit(smrt, smcm)) != 0) { + mutex_exit(&smrt->smrt_mutex); + + dev_err(smrt->smrt_dip, CE_WARN, "smrt_submit failed %d", r); + + /* + * Inform the SCSI framework that we could not submit + * the command. + */ + return (r == EAGAIN ? TRAN_BUSY : TRAN_FATAL_ERROR); + } + + /* + * Update the SCSI packet to reflect submission of the command. + */ + pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET | STATE_SENT_CMD; + + if (pkt->pkt_flags & FLAG_NOINTR) { + /* + * Poll the controller for completion of the command we + * submitted. Once this routine has returned, the completion + * callback will have been fired with either an active response + * (success or error) or a timeout. The command is freed by + * the completion callback, so it may not be referenced again + * after this call returns. 
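+		 *
+		 * This is what satisfies the FLAG_NOINTR contract from
+		 * tran_start(9E): callers of polled I/O (e.g. crash dump)
+		 * cannot rely on interrupts, so the command must be complete
+		 * before we return.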
+ */ + smrt_poll_for(smrt, smcm); + } + + mutex_exit(&smrt->smrt_mutex); + return (TRAN_ACCEPT); +} + +static int +smrt_tran_reset(struct scsi_address *sa, int level) +{ + _NOTE(ARGUNUSED(level)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_t *smcm; + int r; + + sd = scsi_address_device(sa); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + + /* + * The framework has requested some kind of SCSI reset. A + * controller-level soft reset can take a very long time -- often on + * the order of 30-60 seconds -- but might well be our only option if + * the controller is non-responsive. + * + * First, check if the controller is responding to pings. + */ +again: + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + return (0); + } + + smrt_write_message_nop(smcm, SMRT_PING_CHECK_TIMEOUT); + + mutex_enter(&smrt->smrt_mutex); + smrt->smrt_stats.smrts_tran_resets++; + if (ddi_in_panic()) { + goto skip_check; + } + + if (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + /* + * The controller is already resetting. Wait for that + * to finish. + */ + while (smrt->smrt_status & SMRT_CTLR_STATUS_RESETTING) { + cv_wait(&smrt->smrt_cv_finishq, &smrt->smrt_mutex); + } + } + +skip_check: + /* + * Submit our ping to the controller. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + smcm->smcm_expiry = gethrtime() + SMRT_PING_CHECK_TIMEOUT * NANOSEC; + if (smrt_submit(smrt, smcm) != 0) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (0); + } + + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The ping command timed out. Abandon it now. + */ + dev_err(smrt->smrt_dip, CE_WARN, "controller ping timed out"); + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + + } else if ((smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) || + (smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) { + /* + * The command completed in error, or a controller reset + * was sent while we were trying to ping. + */ + dev_err(smrt->smrt_dip, CE_WARN, "controller ping error"); + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + mutex_enter(&smrt->smrt_mutex); + + } else { + VERIFY(smcm->smcm_status & SMRT_CMD_STATUS_COMPLETE); + + /* + * The controller is responsive, and a full soft reset would be + * extremely disruptive to the system. Given our spotty + * support for some SCSI commands (which can upset the target + * drivers) and the historically lax behaviour of the "smrt" + * driver, we grit our teeth and pretend we were able to + * perform a reset. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (1); + } + + /* + * If a reset has been initiated in the last 90 seconds, try + * another ping. + */ + if (gethrtime() < smrt->smrt_last_reset_start + 90 * NANOSEC) { + dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed, but " + "was recently reset; retrying ping"); + mutex_exit(&smrt->smrt_mutex); + + /* + * Sleep for a second first. 
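+		 * In panic context we cannot block, so drv_usecwait(9F) spins
+		 * for the interval; otherwise delay(9F) lets the thread
+		 * sleep.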
+ */ + if (ddi_in_panic()) { + drv_usecwait(1 * MICROSEC); + } else { + delay(drv_usectohz(1 * MICROSEC)); + } + goto again; + } + + dev_err(smrt->smrt_dip, CE_WARN, "controller ping failed; resetting " + "controller"); + if (smrt_ctlr_reset(smrt) != 0) { + dev_err(smrt->smrt_dip, CE_WARN, "controller reset failure"); + mutex_exit(&smrt->smrt_mutex); + return (0); + } + + mutex_exit(&smrt->smrt_mutex); + return (1); +} + +static int +smrt_tran_abort(struct scsi_address *sa, struct scsi_pkt *pkt) +{ + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + smrt_command_t *smcm = NULL; + smrt_command_t *abort_smcm; + + sd = scsi_address_device(sa); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + + + if ((abort_smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL) { + /* + * No resources available to send an abort message. + */ + return (0); + } + + mutex_enter(&smrt->smrt_mutex); + smrt->smrt_stats.smrts_tran_aborts++; + if (pkt != NULL) { + /* + * The framework wants us to abort a specific SCSI packet. + */ + smrt_command_scsa_t *smcms = (smrt_command_scsa_t *) + pkt->pkt_ha_private; + smcm = smcms->smcms_command; + + if (!(smcm->smcm_status & SMRT_CMD_STATUS_INFLIGHT)) { + /* + * This message is not currently in flight, so we + * cannot abort it. + */ + goto fail; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ABORT_SENT) { + /* + * An abort message for this command has already been + * sent to the controller. Return failure. + */ + goto fail; + } + + smrt_write_message_abort_one(abort_smcm, smcm->smcm_tag); + } else { + /* + * The framework wants us to abort every in flight command + * for the target with this address. + */ + smrt_write_message_abort_all(abort_smcm, smtg->smtg_addr); + } + + /* + * Submit the abort message to the controller. + */ + abort_smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if (smrt_submit(smrt, abort_smcm) != 0) { + goto fail; + } + + if (pkt != NULL) { + /* + * Record some debugging information about the abort we + * sent: + */ + smcm->smcm_abort_time = gethrtime(); + smcm->smcm_abort_tag = abort_smcm->smcm_tag; + + /* + * Mark the command as aborted so that we do not send + * a second abort message: + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABORT_SENT; + } + + /* + * Poll for completion of the abort message. Note that this function + * only fails if we set a timeout on the command, which we have not + * done. + */ + VERIFY0(smrt_poll_for(smrt, abort_smcm)); + + if ((abort_smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) || + (abort_smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) { + /* + * Either the controller was reset or the abort command + * failed. + */ + goto fail; + } + + /* + * The command was successfully aborted. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(abort_smcm); + return (1); + +fail: + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(abort_smcm); + return (0); +} + +static void +smrt_hba_complete_status(smrt_command_t *smcm) +{ + ErrorInfo_t *ei = smcm->smcm_va_err; + struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt; + + bzero(pkt->pkt_scbp, pkt->pkt_scblen); + + if (ei->ScsiStatus != STATUS_CHECK) { + /* + * If the SCSI status is not CHECK CONDITION, we don't want + * to try and read the sense data buffer. + */ + goto simple_status; + } + + if (pkt->pkt_scblen < sizeof (struct scsi_arq_status)) { + /* + * There is not enough room for a request sense structure. 
+ * Fall back to reporting just the SCSI status code. + */ + goto simple_status; + } + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + struct scsi_arq_status *sts = (struct scsi_arq_status *)pkt->pkt_scbp; + + /* + * Copy in the SCSI status from the original command. + */ + bcopy(&ei->ScsiStatus, &sts->sts_status, sizeof (sts->sts_status)); + + /* + * Mock up a successful REQUEST SENSE: + */ + sts->sts_rqpkt_reason = CMD_CMPLT; + sts->sts_rqpkt_resid = 0; + sts->sts_rqpkt_state = STATE_GOT_BUS | STATE_GOT_TARGET | + STATE_SENT_CMD | STATE_XFERRED_DATA | STATE_GOT_STATUS; + sts->sts_rqpkt_statistics = 0; + + /* + * The sense data from the controller should be copied into place + * starting at the "sts_sensedata" member of the auto request + * sense object. + */ + size_t sense_len = pkt->pkt_scblen - offsetof(struct scsi_arq_status, + sts_sensedata); + if (ei->SenseLen < sense_len) { + /* + * Only copy sense data bytes that are within the region + * the controller marked as valid. + */ + sense_len = ei->SenseLen; + } + bcopy(ei->SenseInfo, &sts->sts_sensedata, sense_len); + + pkt->pkt_state |= STATE_ARQ_DONE; + return; + +simple_status: + if (pkt->pkt_scblen < sizeof (struct scsi_status)) { + /* + * There is not even enough room for the SCSI status byte. + */ + return; + } + + bcopy(&ei->ScsiStatus, pkt->pkt_scbp, sizeof (struct scsi_status)); +} + +static void +smrt_hba_complete_log_error(smrt_command_t *smcm, const char *name) +{ + smrt_t *smrt = smcm->smcm_ctlr; + ErrorInfo_t *ei = smcm->smcm_va_err; + + dev_err(smrt->smrt_dip, CE_WARN, "!SCSI command failed: %s: " + "SCSI op %x, CISS status %x, SCSI status %x", name, + (unsigned)smcm->smcm_va_cmd->Request.CDB[0], + (unsigned)ei->CommandStatus, (unsigned)ei->ScsiStatus); +} + +/* + * Completion routine for commands submitted to the controller via the SCSI + * framework. + */ +void +smrt_hba_complete(smrt_command_t *smcm) +{ + smrt_t *smrt = smcm->smcm_ctlr; + ErrorInfo_t *ei = smcm->smcm_va_err; + struct scsi_pkt *pkt = smcm->smcm_scsa->smcms_pkt; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + pkt->pkt_resid = ei->ResidualCnt; + + /* + * Check if the controller was reset while this packet was in flight. + */ + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + if (pkt->pkt_reason != CMD_CMPLT) { + /* + * If another error status has already been written, + * do not overwrite it. + */ + pkt->pkt_reason = CMD_RESET; + } + pkt->pkt_statistics |= STAT_BUS_RESET | STAT_DEV_RESET; + goto finish; + } + + if (!(smcm->smcm_status & SMRT_CMD_STATUS_ERROR)) { + /* + * The command was completed without error by the controller. + * + * As per the specification, if an error was not signalled + * by the controller through the CISS transport method, + * the error information (including CommandStatus) has not + * been written and should not be checked. + */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + goto finish; + } + + /* + * Check the completion status to determine what befell this request. + */ + switch (ei->CommandStatus) { + case CISS_CMD_SUCCESS: + /* + * In a certain sense, the specification contradicts itself. + * On the one hand, it suggests that a successful command + * will not result in a controller write to the error + * information block; on the other hand, it makes room + * for a status code (0) which denotes a successful + * execution. + * + * To be on the safe side, we check for that condition here. 
+ */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_DATA_UNDERRUN: + /* + * A data underrun occurred. Ideally this will result in + * an appropriate SCSI status and sense data. + */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_TARGET_STATUS: + /* + * The command completed, but an error occurred. We need + * to provide the sense data to the SCSI framework. + */ + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_DATA_OVERRUN: + /* + * Data overrun has occurred. + */ + smrt_hba_complete_log_error(smcm, "data overrun"); + pkt->pkt_reason = CMD_DATA_OVR; + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + break; + + case CISS_CMD_INVALID: + /* + * One or more fields in the command has invalid data. + */ + smrt_hba_complete_log_error(smcm, "invalid command"); + pkt->pkt_reason = CMD_BADMSG; + pkt->pkt_state |= STATE_GOT_STATUS; + break; + + case CISS_CMD_PROTOCOL_ERR: + /* + * An error occurred in communication with the end device. + */ + smrt_hba_complete_log_error(smcm, "protocol error"); + pkt->pkt_reason = CMD_BADMSG; + pkt->pkt_state |= STATE_GOT_STATUS; + break; + + case CISS_CMD_HARDWARE_ERR: + /* + * A hardware error occurred. + */ + smrt_hba_complete_log_error(smcm, "hardware error"); + pkt->pkt_reason = CMD_INCOMPLETE; + break; + + case CISS_CMD_CONNECTION_LOST: + /* + * The connection with the end device cannot be + * re-established. + */ + smrt_hba_complete_log_error(smcm, "connection lost"); + pkt->pkt_reason = CMD_INCOMPLETE; + break; + + case CISS_CMD_ABORTED: + case CISS_CMD_UNSOLICITED_ABORT: + if (smcm->smcm_status & SMRT_CMD_STATUS_TIMEOUT) { + /* + * This abort was arranged by the periodic routine + * in response to an elapsed timeout. + */ + pkt->pkt_reason = CMD_TIMEOUT; + pkt->pkt_statistics |= STAT_TIMEOUT; + } else { + pkt->pkt_reason = CMD_ABORTED; + } + pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS; + pkt->pkt_statistics |= STAT_ABORTED; + break; + + case CISS_CMD_TIMEOUT: + smrt_hba_complete_log_error(smcm, "timeout"); + pkt->pkt_reason = CMD_TIMEOUT; + pkt->pkt_statistics |= STAT_TIMEOUT; + break; + + default: + /* + * This is an error that we were not prepared to handle. + * Signal a generic transport-level error to the framework. + */ + smrt_hba_complete_log_error(smcm, "unexpected error"); + pkt->pkt_reason = CMD_TRAN_ERR; + } + + /* + * Attempt to read a SCSI status code and any automatic + * request sense data that may exist: + */ + smrt_hba_complete_status(smcm); + +finish: + mutex_exit(&smrt->smrt_mutex); + scsi_hba_pkt_comp(pkt); + mutex_enter(&smrt->smrt_mutex); +} + +static int +smrt_getcap(struct scsi_address *sa, char *cap, int whom) +{ + _NOTE(ARGUNUSED(whom)) + + struct scsi_device *sd; + smrt_target_t *smtg; + smrt_t *smrt; + int index; + + sd = scsi_address_device(sa); + VERIFY(sd != NULL); + smtg = scsi_device_hba_private_get(sd); + VERIFY(smtg != NULL); + smrt = smtg->smtg_ctlr; + VERIFY(smrt != NULL); + + if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) { + /* + * This capability string could not be translated to an + * ID number, so it must not exist. + */ + return (-1); + } + + switch (index) { + case SCSI_CAP_CDB_LEN: + /* + * The CDB field in the CISS request block is fixed at 16 + * bytes. 
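+		 * A target driver querying this capability, e.g. via
+		 * scsi_ifgetcap(&pkt->pkt_address, "cdb-length", 1), should
+		 * therefore see 16.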
+ */ + return (CISS_CDBLEN); + + case SCSI_CAP_DMA_MAX: + if (smrt->smrt_dma_attr.dma_attr_maxxfer > INT_MAX) { + return (INT_MAX); + } + return ((int)smrt->smrt_dma_attr.dma_attr_maxxfer); + + case SCSI_CAP_SECTOR_SIZE: + if (smrt->smrt_dma_attr.dma_attr_granular > INT_MAX) { + return (-1); + } + return ((int)smrt->smrt_dma_attr.dma_attr_granular); + + /* + * If this target corresponds to a physical device, then we always + * indicate that we're on a SAS interconnect. Otherwise, we default to + * saying that we're on a parallel bus. We can't use SAS for + * everything, unfortunately. When you declare yourself to be a SAS + * interconnect, it's expected that you have a full 16-byte WWN as the + * target. If not, devfsadm will not be able to enumerate the device + * and create /dev/[r]dsk entries. + */ + case SCSI_CAP_INTERCONNECT_TYPE: + if (smtg->smtg_physical) { + return (INTERCONNECT_SAS); + } else { + return (INTERCONNECT_PARALLEL); + } + + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_ARQ: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_TAGGED_QING: + /* + * These capabilities are supported by the driver and the + * controller. See scsi_ifgetcap(9F) for more information. + */ + return (1); + + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_RESET_NOTIFICATION: + /* + * These capabilities are not supported. + */ + return (0); + + default: + /* + * The property in question is not known to this driver. + */ + return (-1); + } +} + +/* ARGSUSED */ +static int +smrt_setcap(struct scsi_address *sa, char *cap, int value, int whom) +{ + int index; + + if ((index = scsi_hba_lookup_capstr(cap)) == DDI_FAILURE) { + /* + * This capability string could not be translated to an + * ID number, so it must not exist. + */ + return (-1); + } + + if (whom == 0) { + /* + * When whom is 0, this is a request to set a capability for + * all targets. As per the recommendation in tran_setcap(9E), + * we do not support this mode of operation. + */ + return (-1); + } + + switch (index) { + case SCSI_CAP_CDB_LEN: + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_SECTOR_SIZE: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_ARQ: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_TAGGED_QING: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_INTERCONNECT_TYPE: + /* + * We do not support changing any capabilities at this time. + */ + return (0); + + default: + /* + * The capability in question is not known to this driver. 
+ */ + return (-1); + } +} + +int +smrt_ctrl_hba_setup(smrt_t *smrt) +{ + int flags; + dev_info_t *dip = smrt->smrt_dip; + scsi_hba_tran_t *tran; + + if ((tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP)) == NULL) { + dev_err(dip, CE_WARN, "could not allocate SCSA resources"); + return (DDI_FAILURE); + } + + smrt->smrt_hba_tran = tran; + tran->tran_hba_private = smrt; + + tran->tran_tgt_init = smrt_ctrl_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + + tran->tran_start = smrt_ctrl_tran_start; + + tran->tran_getcap = smrt_getcap; + tran->tran_setcap = smrt_setcap; + + tran->tran_setup_pkt = smrt_tran_setup_pkt; + tran->tran_teardown_pkt = smrt_tran_teardown_pkt; + tran->tran_hba_len = sizeof (smrt_command_scsa_t); + tran->tran_interconnect_type = INTERCONNECT_SAS; + + flags = SCSI_HBA_HBA | SCSI_HBA_TRAN_SCB | SCSI_HBA_ADDR_COMPLEX; + if (scsi_hba_attach_setup(dip, &smrt->smrt_dma_attr, tran, flags) != + DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not attach to SCSA framework"); + scsi_hba_tran_free(tran); + return (DDI_FAILURE); + } + + smrt->smrt_init_level |= SMRT_INITLEVEL_SCSA; + return (DDI_SUCCESS); +} + +void +smrt_ctrl_hba_teardown(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_SCSA) { + VERIFY(scsi_hba_detach(smrt->smrt_dip) != DDI_FAILURE); + scsi_hba_tran_free(smrt->smrt_hba_tran); + smrt->smrt_init_level &= ~SMRT_INITLEVEL_SCSA; + } +} + +int +smrt_logvol_hba_setup(smrt_t *smrt, dev_info_t *iport) +{ + scsi_hba_tran_t *tran; + + tran = ddi_get_driver_private(iport); + if (tran == NULL) + return (DDI_FAILURE); + + tran->tran_tgt_init = smrt_logvol_tran_tgt_init; + tran->tran_tgt_free = smrt_logvol_tran_tgt_free; + + tran->tran_start = smrt_tran_start; + tran->tran_reset = smrt_tran_reset; + tran->tran_abort = smrt_tran_abort; + + tran->tran_hba_private = smrt; + + mutex_enter(&smrt->smrt_mutex); + if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC, + 2 * MICROSEC, smrt, smrt_logvol_tgtmap_activate, + smrt_logvol_tgtmap_deactivate, &smrt->smrt_virt_tgtmap) != + DDI_SUCCESS) { + return (DDI_FAILURE); + } + + smrt_discover_request(smrt); + mutex_exit(&smrt->smrt_mutex); + + return (DDI_SUCCESS); +} + +void +smrt_logvol_hba_teardown(smrt_t *smrt, dev_info_t *iport) +{ + ASSERT(smrt->smrt_virt_iport == iport); + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_virt_tgtmap != NULL) { + scsi_hba_tgtmap_t *t; + + /* + * Ensure that we can't be racing with discovery. 
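+		 * Discovery runs asynchronously on smrt_discover_taskq, so we
+		 * drop smrt_mutex and wait for the taskq to drain; the loop
+		 * re-checks the flag in case another discovery request was
+		 * queued while we were waiting.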
+ */ + while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) { + mutex_exit(&smrt->smrt_mutex); + ddi_taskq_wait(smrt->smrt_discover_taskq); + mutex_enter(&smrt->smrt_mutex); + } + + t = smrt->smrt_virt_tgtmap; + smrt->smrt_virt_tgtmap = NULL; + mutex_exit(&smrt->smrt_mutex); + scsi_hba_tgtmap_destroy(t); + mutex_enter(&smrt->smrt_mutex); + } + + mutex_exit(&smrt->smrt_mutex); +} + +int +smrt_phys_hba_setup(smrt_t *smrt, dev_info_t *iport) +{ + scsi_hba_tran_t *tran; + + tran = ddi_get_driver_private(iport); + if (tran == NULL) + return (DDI_FAILURE); + + tran->tran_tgt_init = smrt_phys_tran_tgt_init; + tran->tran_tgt_free = smrt_phys_tran_tgt_free; + + tran->tran_start = smrt_tran_start; + tran->tran_reset = smrt_tran_reset; + tran->tran_abort = smrt_tran_abort; + + tran->tran_hba_private = smrt; + + mutex_enter(&smrt->smrt_mutex); + if (scsi_hba_tgtmap_create(iport, SCSI_TM_FULLSET, MICROSEC, + 2 * MICROSEC, smrt, smrt_phys_tgtmap_activate, + smrt_phys_tgtmap_deactivate, &smrt->smrt_phys_tgtmap) != + DDI_SUCCESS) { + return (DDI_FAILURE); + } + + smrt_discover_request(smrt); + mutex_exit(&smrt->smrt_mutex); + + return (DDI_SUCCESS); +} + +void +smrt_phys_hba_teardown(smrt_t *smrt, dev_info_t *iport) +{ + ASSERT(smrt->smrt_phys_iport == iport); + + mutex_enter(&smrt->smrt_mutex); + + if (smrt->smrt_phys_tgtmap != NULL) { + scsi_hba_tgtmap_t *t; + + /* + * Ensure that we can't be racing with discovery. + */ + while (smrt->smrt_status & SMRT_CTLR_DISCOVERY_RUNNING) { + mutex_exit(&smrt->smrt_mutex); + ddi_taskq_wait(smrt->smrt_discover_taskq); + mutex_enter(&smrt->smrt_mutex); + } + + t = smrt->smrt_phys_tgtmap; + smrt->smrt_phys_tgtmap = NULL; + mutex_exit(&smrt->smrt_mutex); + scsi_hba_tgtmap_destroy(t); + mutex_enter(&smrt->smrt_mutex); + } + + mutex_exit(&smrt->smrt_mutex); +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c new file mode 100644 index 0000000000..18d5b8e936 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_interrupts.c @@ -0,0 +1,286 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static char * +smrt_interrupt_type_name(int type) +{ + switch (type) { + case DDI_INTR_TYPE_MSIX: + return ("MSI-X"); + case DDI_INTR_TYPE_MSI: + return ("MSI"); + case DDI_INTR_TYPE_FIXED: + return ("fixed"); + default: + return ("?"); + } +} + +static boolean_t +smrt_try_msix(smrt_t *smrt) +{ + char *fwver = smrt->smrt_versions.smrtv_firmware_rev; + + /* + * Generation 9 controllers end up having a different firmware + * versioning scheme than others. If this is a generation 9 controller, + * which all share the same PCI device ID, then we default to MSI. + */ + if (smrt->smrt_pci_vendor == SMRT_VENDOR_HP && + smrt->smrt_pci_device == SMRT_DEVICE_GEN9) { + return (B_FALSE); + } + + if (fwver[0] == '8' && fwver[1] == '.' && isdigit(fwver[2]) && + isdigit(fwver[3])) { + /* + * Version 8.00 of the Smart Array firmware appears to have + * broken MSI support on at least one controller. 
We could + * blindly try MSI-X everywhere, except that on at least some + * 6.XX firmware versions, MSI-X interrupts do not appear + * to be triggered for Simple Transport Method command + * completions. + * + * For now, assume we should try for MSI-X with all 8.XX + * versions of the firmware. + */ + dev_err(smrt->smrt_dip, CE_NOTE, "!trying MSI-X interrupts " + "to work around 8.XX firmware defect"); + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +smrt_interrupts_disable(smrt_t *smrt) +{ + if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) { + return (ddi_intr_block_disable(smrt->smrt_interrupts, + smrt->smrt_ninterrupts)); + } else { + VERIFY3S(smrt->smrt_ninterrupts, ==, 1); + + return (ddi_intr_disable(smrt->smrt_interrupts[0])); + } +} + +int +smrt_interrupts_enable(smrt_t *smrt) +{ + int ret; + + VERIFY(!(smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED)); + + if (smrt->smrt_interrupt_cap & DDI_INTR_FLAG_BLOCK) { + ret = ddi_intr_block_enable(smrt->smrt_interrupts, + smrt->smrt_ninterrupts); + } else { + VERIFY3S(smrt->smrt_ninterrupts, ==, 1); + + ret = ddi_intr_enable(smrt->smrt_interrupts[0]); + } + + if (ret == DDI_SUCCESS) { + smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ENABLED; + } + + return (ret); +} + +static void +smrt_interrupts_free(smrt_t *smrt) +{ + for (int i = 0; i < smrt->smrt_ninterrupts; i++) { + (void) ddi_intr_free(smrt->smrt_interrupts[i]); + } + smrt->smrt_ninterrupts = 0; + smrt->smrt_interrupt_type = 0; + smrt->smrt_interrupt_cap = 0; + smrt->smrt_interrupt_pri = 0; +} + +static int +smrt_interrupts_alloc(smrt_t *smrt, int type) +{ + dev_info_t *dip = smrt->smrt_dip; + int nintrs = 0; + int navail = 0; + + if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not count %s interrupts", + smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + if (nintrs < 1) { + dev_err(dip, CE_WARN, "no %s interrupts supported", + smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + + if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not count available %s " + "interrupts", smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + if (navail < 1) { + dev_err(dip, CE_WARN, "no %s interrupts available", + smrt_interrupt_type_name(type)); + return (DDI_FAILURE); + } + + if (ddi_intr_alloc(dip, smrt->smrt_interrupts, type, 0, 1, + &smrt->smrt_ninterrupts, DDI_INTR_ALLOC_STRICT) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "%s interrupt allocation failed", + smrt_interrupt_type_name(type)); + smrt_interrupts_free(smrt); + return (DDI_FAILURE); + } + + smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ALLOC; + smrt->smrt_interrupt_type = type; + return (DDI_SUCCESS); +} + +int +smrt_interrupts_setup(smrt_t *smrt) +{ + int types; + unsigned ipri; + uint_t (*hw_isr)(caddr_t, caddr_t); + dev_info_t *dip = smrt->smrt_dip; + + /* + * Select the correct hardware interrupt service routine for the + * Transport Method we have configured: + */ + switch (smrt->smrt_ctlr_mode) { + case SMRT_CTLR_MODE_SIMPLE: + hw_isr = smrt_isr_hw_simple; + break; + default: + panic("unknown controller mode"); + } + + if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not get support interrupts"); + goto fail; + } + + /* + * At least one firmware version has been released for the Smart Array + * line with entirely defective MSI support. 
The specification is + * somewhat unclear on the precise nature of MSI-X support with Smart + * Array controllers, particularly with respect to the Simple Transport + * Method, but for those broken firmware versions we need to try + * anyway. + */ + if (smrt_try_msix(smrt) && (types & DDI_INTR_TYPE_MSIX)) { + if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSIX) == + DDI_SUCCESS) { + goto add_handler; + } + } + + /* + * If MSI-X is not available, or not expected to work, fall back to + * MSI. + */ + if (types & DDI_INTR_TYPE_MSI) { + if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_MSI) == + DDI_SUCCESS) { + goto add_handler; + } + } + + /* + * If neither MSI-X nor MSI is available, fall back to fixed + * interrupts. Note that the use of fixed interrupts has been + * observed, with some combination of controllers and systems, to + * result in interrupts stopping completely at random times. + */ + if (types & DDI_INTR_TYPE_FIXED) { + if (smrt_interrupts_alloc(smrt, DDI_INTR_TYPE_FIXED) == + DDI_SUCCESS) { + goto add_handler; + } + } + + /* + * We were unable to allocate any interrupts. + */ + dev_err(dip, CE_WARN, "interrupt allocation failed"); + goto fail; + +add_handler: + /* + * Ensure that we have not been given a high-level interrupt, as our + * interrupt handlers do not support them. + */ + if (ddi_intr_get_pri(smrt->smrt_interrupts[0], &ipri) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not determine interrupt priority"); + goto fail; + } + if (ipri >= ddi_intr_get_hilevel_pri()) { + dev_err(dip, CE_WARN, "high level interrupts not supported"); + goto fail; + } + smrt->smrt_interrupt_pri = ipri; + + if (ddi_intr_get_cap(smrt->smrt_interrupts[0], + &smrt->smrt_interrupt_cap) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "could not get %s interrupt cap", + smrt_interrupt_type_name(smrt->smrt_interrupt_type)); + goto fail; + } + + if (ddi_intr_add_handler(smrt->smrt_interrupts[0], hw_isr, + (caddr_t)smrt, NULL) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "adding %s interrupt failed", + smrt_interrupt_type_name(smrt->smrt_interrupt_type)); + goto fail; + } + smrt->smrt_init_level |= SMRT_INITLEVEL_INT_ADDED; + + return (DDI_SUCCESS); + +fail: + smrt_interrupts_teardown(smrt); + return (DDI_FAILURE); +} + +void +smrt_interrupts_teardown(smrt_t *smrt) +{ + if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ENABLED) { + (void) smrt_interrupts_disable(smrt); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ENABLED; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ADDED) { + (void) ddi_intr_remove_handler(smrt->smrt_interrupts[0]); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ADDED; + } + + if (smrt->smrt_init_level & SMRT_INITLEVEL_INT_ALLOC) { + smrt_interrupts_free(smrt); + + smrt->smrt_init_level &= ~SMRT_INITLEVEL_INT_ALLOC; + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c new file mode 100644 index 0000000000..05963ac2e2 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_logvol.c @@ -0,0 +1,367 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. 
+ */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static void +smrt_logvol_free(smrt_volume_t *smlv) +{ + /* + * By this stage of teardown, all of the SCSI target drivers + * must have been detached from this logical volume. + */ + VERIFY(list_is_empty(&smlv->smlv_targets)); + list_destroy(&smlv->smlv_targets); + + kmem_free(smlv, sizeof (*smlv)); +} + +smrt_volume_t * +smrt_logvol_lookup_by_id(smrt_t *smrt, unsigned long id) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + for (smrt_volume_t *smlv = list_head(&smrt->smrt_volumes); + smlv != NULL; smlv = list_next(&smrt->smrt_volumes, smlv)) { + if (smlv->smlv_addr.LogDev.VolId == id) { + return (smlv); + } + } + + return (NULL); +} + +static int +smrt_read_logvols(smrt_t *smrt, smrt_report_logical_lun_t *smrll, uint64_t gen) +{ + smrt_report_logical_lun_ent_t *ents = smrll->smrll_data.ents; + uint32_t count = BE_32(smrll->smrll_datasize) / + sizeof (smrt_report_logical_lun_ent_t); + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (count > SMRT_MAX_LOGDRV) { + count = SMRT_MAX_LOGDRV; + } + + for (unsigned i = 0; i < count; i++) { + smrt_volume_t *smlv; + char id[SCSI_MAXNAMELEN]; + + DTRACE_PROBE2(read_logvol, unsigned, i, + smrt_report_logical_lun_ent_t *, &ents[i]); + + if ((smlv = smrt_logvol_lookup_by_id(smrt, + ents[i].smrle_addr.VolId)) == NULL) { + + /* + * This is a new Logical Volume, so add it the the list. + */ + if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) == + NULL) { + return (ENOMEM); + } + + list_create(&smlv->smlv_targets, + sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_lun)); + + smlv->smlv_ctlr = smrt; + list_insert_tail(&smrt->smrt_volumes, smlv); + } + + /* + * Always make sure that the address and the generation are up + * to date, regardless of where this came from. + */ + smlv->smlv_addr.LogDev = ents[i].smrle_addr; + smlv->smlv_gen = gen; + (void) snprintf(id, sizeof (id), "%x", + smlv->smlv_addr.LogDev.VolId); + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap, + SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) { + return (EIO); + } + } + + return (0); +} + +static int +smrt_read_logvols_ext(smrt_t *smrt, smrt_report_logical_lun_t *smrll, + uint64_t gen) +{ + smrt_report_logical_lun_extent_t *extents = + smrll->smrll_data.extents; + uint32_t count = BE_32(smrll->smrll_datasize) / + sizeof (smrt_report_logical_lun_extent_t); + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (count > SMRT_MAX_LOGDRV) { + count = SMRT_MAX_LOGDRV; + } + + for (unsigned i = 0; i < count; i++) { + smrt_volume_t *smlv; + char id[SCSI_MAXNAMELEN]; + + DTRACE_PROBE2(read_logvol_ext, unsigned, i, + smrt_report_logical_lun_extent_t *, &extents[i]); + + if ((smlv = smrt_logvol_lookup_by_id(smrt, + extents[i].smrle_addr.VolId)) != NULL) { + if ((smlv->smlv_flags & SMRT_VOL_FLAG_WWN) && + bcmp(extents[i].smrle_wwn, smlv->smlv_wwn, + 16) != 0) { + dev_err(smrt->smrt_dip, CE_PANIC, "logical " + "volume %u WWN changed unexpectedly", i); + } + } else { + /* + * This is a new Logical Volume, so add it the the list. + */ + if ((smlv = kmem_zalloc(sizeof (*smlv), KM_NOSLEEP)) == + NULL) { + return (ENOMEM); + } + + bcopy(extents[i].smrle_wwn, smlv->smlv_wwn, 16); + smlv->smlv_flags |= SMRT_VOL_FLAG_WWN; + + list_create(&smlv->smlv_targets, + sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_lun)); + + smlv->smlv_ctlr = smrt; + list_insert_tail(&smrt->smrt_volumes, smlv); + } + + /* + * Always make sure that the address and the generation are up + * to date. 
The address may have changed on a reset. + */ + smlv->smlv_addr.LogDev = extents[i].smrle_addr; + smlv->smlv_gen = gen; + (void) snprintf(id, sizeof (id), "%x", + smlv->smlv_addr.LogDev.VolId); + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_add(smrt->smrt_virt_tgtmap, + SCSI_TGT_SCSI_DEVICE, id, NULL) != DDI_SUCCESS) { + return (EIO); + } + } + + return (0); +} + +/* + * Discover the currently visible set of Logical Volumes exposed by the + * controller. + */ +int +smrt_logvol_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen) +{ + smrt_command_t *smcm; + smrt_report_logical_lun_t *smrll; + smrt_report_logical_lun_req_t smrllr = { 0 }; + int r; + + /* + * Allocate the command to send to the device, including buffer space + * for the returned list of Logical Volumes. + */ + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + sizeof (smrt_report_logical_lun_t), KM_NOSLEEP) != 0) { + r = ENOMEM; + mutex_enter(&smrt->smrt_mutex); + goto out; + } + + smrll = smcm->smcm_internal->smcmi_va; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrllr); + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * The Report Logical LUNs command is essentially a vendor-specific + * SCSI command, which we assemble into the CDB region of the command + * block. + */ + bzero(&smrllr, sizeof (smrllr)); + smrllr.smrllr_opcode = CISS_SCMD_REPORT_LOGICAL_LUNS; + smrllr.smrllr_extflag = 1; + smrllr.smrllr_datasize = htonl(sizeof (smrt_report_logical_lun_t)); + bcopy(&smrllr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (smrllr))); + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + goto out; + } + + /* + * Poll for completion. + */ + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + smcm = NULL; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to discover + * logical volumes. Report failure. 
+ */ + r = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "logical volume " + "discovery error: status 0x%x", ei->CommandStatus); + r = EIO; + goto out; + } + } + + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_begin(smrt->smrt_virt_tgtmap) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map " + "observation on %s", SMRT_IPORT_VIRT); + r = EIO; + goto out; + } + + if ((smrll->smrll_extflag & 0x1) != 0) { + r = smrt_read_logvols_ext(smrt, smrll, gen); + } else { + r = smrt_read_logvols(smrt, smrll, gen); + } + + if (r == 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_end(smrt->smrt_virt_tgtmap, 0) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_VIRT); + r = EIO; + } + } else if (r != 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_flush(smrt->smrt_virt_tgtmap) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_VIRT); + r = EIO; + } + } + + if (r == 0) { + /* + * Update the time of the last successful Logical Volume + * discovery: + */ + smrt->smrt_last_log_discovery = gethrtime(); + } + +out: + mutex_exit(&smrt->smrt_mutex); + + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (r); +} + +void +smrt_logvol_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type, + void **privpp) +{ + smrt_t *smrt = arg; + unsigned long volume; + char *eptr; + + VERIFY(type == SCSI_TGT_SCSI_DEVICE); + VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume)); + VERIFY3S(*eptr, ==, '\0'); + VERIFY3S(volume, >=, 0); + VERIFY3S(volume, <, SMRT_MAX_LOGDRV); + mutex_enter(&smrt->smrt_mutex); + VERIFY(smrt_logvol_lookup_by_id(smrt, volume) != NULL); + mutex_exit(&smrt->smrt_mutex); + *privpp = NULL; +} + +boolean_t +smrt_logvol_tgtmap_deactivate(void *arg, char *addr, + scsi_tgtmap_tgt_type_t type, void *priv, scsi_tgtmap_deact_rsn_t reason) +{ + smrt_t *smrt = arg; + smrt_volume_t *smlv; + unsigned long volume; + char *eptr; + + VERIFY(type == SCSI_TGT_SCSI_DEVICE); + VERIFY(priv == NULL); + VERIFY0(ddi_strtoul(addr, &eptr, 16, &volume)); + VERIFY3S(*eptr, ==, '\0'); + VERIFY3S(volume, >=, 0); + VERIFY3S(volume, <, SMRT_MAX_LOGDRV); + + mutex_enter(&smrt->smrt_mutex); + smlv = smrt_logvol_lookup_by_id(smrt, volume); + VERIFY(smlv != NULL); + + list_remove(&smrt->smrt_volumes, smlv); + smrt_logvol_free(smlv); + mutex_exit(&smrt->smrt_mutex); + + return (B_FALSE); +} + +void +smrt_logvol_teardown(smrt_t *smrt) +{ + smrt_volume_t *smlv; + + while ((smlv = list_remove_head(&smrt->smrt_volumes)) != NULL) { + smrt_logvol_free(smlv); + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c new file mode 100644 index 0000000000..8ab3927673 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_physical.c @@ -0,0 +1,613 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017 Joyent, Inc. 
+ */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +static void +smrt_physical_free(smrt_physical_t *smpt) +{ + VERIFY(list_is_empty(&smpt->smpt_targets)); + VERIFY(smpt->smpt_info != NULL); + + kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info)); + list_destroy(&smpt->smpt_targets); + kmem_free(smpt, sizeof (*smpt)); +} + +/* + * Determine if a physical device enumerated should be shown to the world. There + * are three conditions to satisfy for this to be true. + * + * 1. The device (SAS, SATA, SES, etc.) must not have a masked CISS address. A + * masked CISS address indicates a device that we should not be performing I/O + * to. + * 2. The drive (SAS or SATA device) must not be marked as a member of a logical + * volume. + * 3. The drive (SAS or SATA device) must not be marked as a spare. + */ +static boolean_t +smrt_physical_visible(PhysDevAddr_t *addr, smrt_identify_physical_drive_t *info) +{ + if (addr->Mode == SMRT_CISS_MODE_MASKED) { + return (B_FALSE); + } + + if ((info->sipd_more_flags & (SMRT_MORE_FLAGS_LOGVOL | + SMRT_MORE_FLAGS_SPARE)) != 0) { + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Note, the caller is responsible for making sure that the unit-address form of + * the WWN is pased in. Any additional information to target a specific LUN + * will be ignored. + */ +smrt_physical_t * +smrt_phys_lookup_by_ua(smrt_t *smrt, const char *ua) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + /* + * Sanity check that the caller has provided us enough bytes for a + * properly formed unit-address form of a WWN. + */ + if (strlen(ua) < SCSI_WWN_UA_STRLEN) + return (NULL); + + for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals); + smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) { + char wwnstr[SCSI_WWN_BUFLEN]; + + (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, wwnstr); + if (strncmp(wwnstr, ua, SCSI_WWN_UA_STRLEN) != 0) + continue; + + /* + * Verify that the UA string is either a comma or null there. + * We accept the comma in case it's being used as part of a + * normal UA with a LUN. + */ + if (ua[SCSI_WWN_UA_STRLEN] != '\0' && + ua[SCSI_WWN_UA_STRLEN] != ',') { + continue; + } + + return (smpt); + } + + return (NULL); +} + +static smrt_physical_t * +smrt_phys_lookup_by_wwn(smrt_t *smrt, uint64_t wwn) +{ + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + for (smrt_physical_t *smpt = list_head(&smrt->smrt_physicals); + smpt != NULL; smpt = list_next(&smrt->smrt_physicals, smpt)) { + if (wwn == smpt->smpt_wwn) + return (smpt); + } + + return (NULL); +} + +static int +smrt_phys_identify(smrt_t *smrt, smrt_identify_physical_drive_t *info, + uint16_t bmic, uint16_t timeout) +{ + smrt_command_t *smcm = NULL; + smrt_identify_physical_drive_t *sipd; + smrt_identify_physical_drive_req_t sipdr; + int ret; + size_t sz, copysz; + + sz = sizeof (smrt_identify_physical_drive_t); + sz = P2ROUNDUP_TYPED(sz, 512, size_t); + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + sizeof (*sipd), KM_NOSLEEP) != 0) { + ret = ENOMEM; + goto out; + } + + sipd = smcm->smcm_internal->smcmi_va; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (sipdr); + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * Construct the IDENTIFY PHYSICAL DEVICE request CDB. 
Note that any + * reserved fields in the request must be filled with zeroes. + */ + bzero(&sipdr, sizeof (sipdr)); + sipdr.sipdr_opcode = CISS_SCMD_BMIC_READ; + sipdr.sipdr_lun = 0; + sipdr.sipdr_bmic_index1 = bmic & 0x00ff; + sipdr.sipdr_command = CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE; + sipdr.sipdr_bmic_index2 = (bmic & 0xff00) >> 8; + bcopy(&sipdr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (sipdr))); + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((ret = smrt_submit(smrt, smcm)) != 0) { + mutex_exit(&smrt->smrt_mutex); + goto out; + } + + /* + * Poll for completion. + */ + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((ret = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(ret, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + smcm = NULL; + mutex_exit(&smrt->smrt_mutex); + goto out; + } + mutex_exit(&smrt->smrt_mutex); + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to discover + * physical volumes. Report failure. + */ + ret = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "identify physical " + "device error: status 0x%x", ei->CommandStatus); + ret = EIO; + goto out; + } + + copysz = MIN(sizeof (*sipd), sz - ei->ResidualCnt); + } else { + copysz = sizeof (*sipd); + } + + + sz = MIN(sizeof (*sipd), copysz); + bcopy(sipd, info, sizeof (*sipd)); + + ret = 0; +out: + if (smcm != NULL) { + smrt_command_free(smcm); + } + + return (ret); +} + +static int +smrt_read_phys_ext(smrt_t *smrt, smrt_report_physical_lun_t *smrpl, + uint16_t timeout, uint64_t gen) +{ + smrt_report_physical_lun_extent_t *extents = smrpl->smrpl_data.extents; + uint32_t count = BE_32(smrpl->smrpl_datasize) / + sizeof (smrt_report_physical_lun_extent_t); + uint32_t i; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + + if (count > SMRT_MAX_PHYSDEV) { + count = SMRT_MAX_PHYSDEV; + } + + for (i = 0; i < count; i++) { + int ret; + smrt_physical_t *smpt; + smrt_identify_physical_drive_t *info; + smrt_report_physical_opdi_t *opdi; + uint16_t bmic; + uint64_t wwn, satawwn; + char name[SCSI_MAXNAMELEN]; + + opdi = &extents[i].srple_extdata.srple_opdi; + + mutex_exit(&smrt->smrt_mutex); + + /* + * Get the extended information about this device. + */ + info = kmem_zalloc(sizeof (*info), KM_NOSLEEP); + if (info == NULL) { + mutex_enter(&smrt->smrt_mutex); + return (ENOMEM); + } + + bmic = smrt_lun_addr_to_bmic(&extents[i].srple_addr); + ret = smrt_phys_identify(smrt, info, bmic, timeout); + if (ret != 0) { + mutex_enter(&smrt->smrt_mutex); + kmem_free(info, sizeof (*info)); + return (ret); + } + + wwn = *(uint64_t *)opdi->srpo_wwid; + wwn = BE_64(wwn); + + /* + * SATA devices may not have a proper WWN returned from firmware + * based on the SATL specification. Try to fetch the proper id + * for SATA devices, if the drive has one. If the drive doesn't + * have one or the SATL refuses to give us one, we use whatever + * the controller told us. 
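+		 * The fallback value is the big-endian srpo_wwid decoded
+		 * from the OPDI extent above; smrt_sata_determine_wwn()
+		 * only overrides it when the device returns a usable NAA
+		 * identifier on INQUIRY page 0x83.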
+ */ + if (opdi->srpo_dtype == SMRT_DTYPE_SATA && + smrt_sata_determine_wwn(smrt, &extents[i].srple_addr, + &satawwn, timeout) == 0) { + wwn = satawwn; + } + + mutex_enter(&smrt->smrt_mutex); + smpt = smrt_phys_lookup_by_wwn(smrt, wwn); + if (smpt != NULL) { + /* + * Sanity check that the model and serial number of this + * device is the same for this WWN. If it's not, the + * controller is probably lying about something. + */ + if (bcmp(smpt->smpt_info->sipd_model, info->sipd_model, + sizeof (info->sipd_model)) != 0 || + bcmp(smpt->smpt_info->sipd_serial, + info->sipd_serial, sizeof (info->sipd_serial)) != + 0 || smpt->smpt_dtype != opdi->srpo_dtype) { + dev_err(smrt->smrt_dip, CE_PANIC, "physical " + "target with wwn 0x%" PRIx64 " changed " + "model, serial, or type unexpectedly: " + "smrt_physical_t %p, phys info: %p", wwn, + smpt, info); + } + + /* + * When panicking, we don't allow a device's visibility + * to change to being invisible and be able to actually + * panic. We only worry about devices which are used + * for I/O. We purposefully ignore SES devices. + */ + if (ddi_in_panic() && + (opdi->srpo_dtype == SMRT_DTYPE_SATA || + opdi->srpo_dtype == SMRT_DTYPE_SAS)) { + boolean_t visible; + + visible = smrt_physical_visible( + &smpt->smpt_addr.PhysDev, smpt->smpt_info); + + if (visible != smpt->smpt_visible) { + dev_err(smrt->smrt_dip, CE_PANIC, + "physical target with wwn 0x%" + PRIx64 " changed visibility status " + "unexpectedly", wwn); + } + } + + kmem_free(smpt->smpt_info, sizeof (*smpt->smpt_info)); + smpt->smpt_info = NULL; + } else { + smpt = kmem_zalloc(sizeof (smrt_physical_t), + KM_NOSLEEP); + if (smpt == NULL) { + kmem_free(info, sizeof (*info)); + return (ENOMEM); + } + + smpt->smpt_wwn = wwn; + smpt->smpt_dtype = opdi->srpo_dtype; + list_create(&smpt->smpt_targets, sizeof (smrt_target_t), + offsetof(smrt_target_t, smtg_link_lun)); + smpt->smpt_ctlr = smrt; + list_insert_tail(&smrt->smrt_physicals, smpt); + } + + VERIFY3P(smpt->smpt_info, ==, NULL); + + /* + * Determine if this device is supported and if it's visible to + * the system. Some devices may not be visible to the system + * because they're used in logical volumes or spares. + * Unsupported devices are also not visible. + */ + switch (smpt->smpt_dtype) { + case SMRT_DTYPE_SATA: + case SMRT_DTYPE_SAS: + smpt->smpt_supported = B_TRUE; + smpt->smpt_visible = + smrt_physical_visible(&extents[i].srple_addr, info); + break; + case SMRT_DTYPE_SES: + smpt->smpt_supported = B_TRUE; + smpt->smpt_visible = + smrt_physical_visible(&extents[i].srple_addr, info); + break; + default: + smpt->smpt_visible = B_FALSE; + smpt->smpt_supported = B_FALSE; + } + + smpt->smpt_info = info; + smpt->smpt_addr.PhysDev = extents[i].srple_addr; + smpt->smpt_bmic = bmic; + smpt->smpt_gen = gen; + (void) scsi_wwn_to_wwnstr(smpt->smpt_wwn, 1, name); + if (!ddi_in_panic() && smpt->smpt_visible && + scsi_hba_tgtmap_set_add(smrt->smrt_phys_tgtmap, + SCSI_TGT_SCSI_DEVICE, name, NULL) != DDI_SUCCESS) { + return (EIO); + } + } + + return (0); +} + +int +smrt_phys_discover(smrt_t *smrt, uint16_t timeout, uint64_t gen) +{ + smrt_command_t *smcm; + smrt_report_physical_lun_t *smrpl; + smrt_report_physical_lun_req_t smrplr; + int r; + + /* + * Allocate the command to send to the device, including buffer space + * for the returned list of Physical Volumes. 
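+	 *
+	 * As with logical volume discovery, this is an internal, polled
+	 * command. The overall sequence, in rough sketch form (error
+	 * handling and locking detail omitted; sizes are illustrative), is:
+	 *
+	 *	smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, KM_NOSLEEP);
+	 *	smrt_command_attach_internal(smrt, smcm, size, KM_NOSLEEP);
+	 *	(build the CDB, set SMRT_CMD_STATUS_POLLED)
+	 *	smrt_submit(smrt, smcm);
+	 *	smrt_poll_for(smrt, smcm);
+	 *	smrt_command_free(smcm);	(unless abandoned on timeout)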
+ */ + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + sizeof (*smrpl), KM_NOSLEEP) != 0) { + r = ENOMEM; + mutex_enter(&smrt->smrt_mutex); + goto out; + } + + smrpl = smcm->smcm_internal->smcmi_va; + + smrt_write_controller_lun_addr(&smcm->smcm_va_cmd->Header.LUN); + + smcm->smcm_va_cmd->Request.CDBLen = sizeof (smrplr); + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + + /* + * The Report Physical LUNs command is essentially a vendor-specific + * SCSI command, which we assemble into the CDB region of the command + * block. + */ + bzero(&smrplr, sizeof (smrplr)); + smrplr.smrplr_opcode = CISS_SCMD_REPORT_PHYSICAL_LUNS; + smrplr.smrplr_extflag = SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI; + smrplr.smrplr_datasize = BE_32(sizeof (smrt_report_physical_lun_t)); + bcopy(&smrplr, &smcm->smcm_va_cmd->Request.CDB[0], + MIN(CISS_CDBLEN, sizeof (smrplr))); + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + goto out; + } + + /* + * Poll for completion. + */ + smcm->smcm_expiry = gethrtime() + timeout * NANOSEC; + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + smcm = NULL; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * + * The controller was reset while we were trying to discover + * logical volumes. Report failure. + */ + r = EIO; + goto out; + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "physical target " + "discovery error: status 0x%x", ei->CommandStatus); + r = EIO; + goto out; + } + } + + /* + * If the controller doesn't support extended physical reporting, it + * likely doesn't even support physical devices that we'd care about + * exposing. As such, we treat this as an OK case. 
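+	 * Returning success here also skips the target map update and the
+	 * pruning pass below, so any previously discovered physical
+	 * targets are left untouched.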
+ */ + if ((smrpl->smrpl_extflag & SMRT_REPORT_PHYSICAL_LUN_EXT_MASK) != + SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI) { + r = 0; + goto out; + } + + if (!ddi_in_panic() && + scsi_hba_tgtmap_set_begin(smrt->smrt_phys_tgtmap) != DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to begin target map " + "observation on %s", SMRT_IPORT_PHYS); + r = EIO; + goto out; + } + + r = smrt_read_phys_ext(smrt, smrpl, timeout, gen); + + if (r == 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_end(smrt->smrt_phys_tgtmap, 0) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_PHYS); + r = EIO; + } + } else if (r != 0 && !ddi_in_panic()) { + if (scsi_hba_tgtmap_set_flush(smrt->smrt_phys_tgtmap) != + DDI_SUCCESS) { + dev_err(smrt->smrt_dip, CE_WARN, "failed to end target " + "map observation on %s", SMRT_IPORT_PHYS); + r = EIO; + } + } + + if (r == 0) { + smrt_physical_t *smpt, *next; + + /* + * Prune physical devices that do not match the current + * generation and are not marked as visible devices. Visible + * devices will be dealt with as part of the target map work. + */ + for (smpt = list_head(&smrt->smrt_physicals), next = NULL; + smpt != NULL; smpt = next) { + next = list_next(&smrt->smrt_physicals, smpt); + if (smpt->smpt_visible || smpt->smpt_gen == gen) + continue; + list_remove(&smrt->smrt_physicals, smpt); + smrt_physical_free(smpt); + } + + /* + * Update the time of the last successful Physical Volume + * discovery: + */ + smrt->smrt_last_phys_discovery = gethrtime(); + + /* + * Now, for each unsupported device that we haven't warned about + * encountering, try and give the administrator some hope of + * knowing about this. + */ + for (smpt = list_head(&smrt->smrt_physicals), next = NULL; + smpt != NULL; smpt = next) { + if (smpt->smpt_supported || smpt->smpt_unsup_warn) + continue; + smpt->smpt_unsup_warn = B_TRUE; + dev_err(smrt->smrt_dip, CE_WARN, "encountered " + "unsupported device with device type %d", + smpt->smpt_dtype); + } + } + +out: + mutex_exit(&smrt->smrt_mutex); + + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (r); +} + +void +smrt_phys_tgtmap_activate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type, + void **privpp) +{ + smrt_t *smrt = arg; + smrt_physical_t *smpt; + + VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE); + mutex_enter(&smrt->smrt_mutex); + smpt = smrt_phys_lookup_by_ua(smrt, addr); + VERIFY(smpt != NULL); + VERIFY(smpt->smpt_supported); + VERIFY(smpt->smpt_visible); + *privpp = NULL; + mutex_exit(&smrt->smrt_mutex); +} + +boolean_t +smrt_phys_tgtmap_deactivate(void *arg, char *addr, scsi_tgtmap_tgt_type_t type, + void *priv, scsi_tgtmap_deact_rsn_t reason) +{ + smrt_t *smrt = arg; + smrt_physical_t *smpt; + + VERIFY3S(type, ==, SCSI_TGT_SCSI_DEVICE); + VERIFY3P(priv, ==, NULL); + + mutex_enter(&smrt->smrt_mutex); + smpt = smrt_phys_lookup_by_ua(smrt, addr); + + /* + * If the device disappeared or became invisible, then it may have + * already been removed. 
+ */ + if (smpt == NULL || !smpt->smpt_visible) { + mutex_exit(&smrt->smrt_mutex); + return (B_FALSE); + } + + list_remove(&smrt->smrt_physicals, smpt); + smrt_physical_free(smpt); + mutex_exit(&smrt->smrt_mutex); + return (B_FALSE); +} + +void +smrt_phys_teardown(smrt_t *smrt) +{ + smrt_physical_t *smpt; + + VERIFY(MUTEX_HELD(&smrt->smrt_mutex)); + while ((smpt = list_remove_head(&smrt->smrt_physicals)) != NULL) { + smrt_physical_free(smpt); + } +} diff --git a/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c new file mode 100644 index 0000000000..6224b97732 --- /dev/null +++ b/usr/src/uts/common/io/scsi/adapters/smrt/smrt_sata.c @@ -0,0 +1,160 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * Collection of routines specific to SATA devices and attempting to make them + * work. + */ + +#include <sys/scsi/adapters/smrt/smrt.h> + +/* + * This is a buffer size that should easily cover all of the data that we need + * to properly determine the buffer allocation. + */ +#define SMRT_SATA_INQ83_LEN 256 + +/* + * We need to try and determine if a SATA WWN exists on the device. SAT-2 + * defines that the response to the inquiry page 0x83. + */ +int +smrt_sata_determine_wwn(smrt_t *smrt, PhysDevAddr_t *addr, uint64_t *wwnp, + uint16_t timeout) +{ + smrt_command_t *smcm; + int r; + uint8_t *inq; + uint64_t wwn; + size_t resid; + + VERIFY3P(wwnp, !=, NULL); + + if ((smcm = smrt_command_alloc(smrt, SMRT_CMDTYPE_INTERNAL, + KM_NOSLEEP)) == NULL || smrt_command_attach_internal(smrt, smcm, + SMRT_SATA_INQ83_LEN, KM_NOSLEEP) != 0) { + if (smcm != NULL) { + smrt_command_free(smcm); + } + return (ENOMEM); + } + + smcm->smcm_va_cmd->Header.LUN.PhysDev = *addr; + smcm->smcm_va_cmd->Request.CDBLen = CDB_GROUP0; + smcm->smcm_va_cmd->Request.Type.Type = CISS_TYPE_CMD; + smcm->smcm_va_cmd->Request.Type.Attribute = CISS_ATTR_SIMPLE; + smcm->smcm_va_cmd->Request.Type.Direction = CISS_XFER_READ; + smcm->smcm_va_cmd->Request.Timeout = LE_16(timeout); + + smcm->smcm_va_cmd->Request.CDB[0] = SCMD_INQUIRY; + smcm->smcm_va_cmd->Request.CDB[1] = 1; + smcm->smcm_va_cmd->Request.CDB[2] = 0x83; + smcm->smcm_va_cmd->Request.CDB[3] = (SMRT_SATA_INQ83_LEN & 0xff00) >> 8; + smcm->smcm_va_cmd->Request.CDB[4] = SMRT_SATA_INQ83_LEN & 0x00ff; + smcm->smcm_va_cmd->Request.CDB[5] = 0; + + mutex_enter(&smrt->smrt_mutex); + + /* + * Send the command to the device. + */ + smcm->smcm_status |= SMRT_CMD_STATUS_POLLED; + if ((r = smrt_submit(smrt, smcm)) != 0) { + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (r); + } + + if ((r = smrt_poll_for(smrt, smcm)) != 0) { + VERIFY3S(r, ==, ETIMEDOUT); + VERIFY0(smcm->smcm_status & SMRT_CMD_STATUS_POLL_COMPLETE); + + /* + * The command timed out; abandon it now. Remove the POLLED + * flag so that the periodic routine will send an abort to + * clean it up next time around. 
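+		 * As in the discovery routines, the command is deliberately
+		 * not freed on this path; it remains outstanding until that
+		 * cleanup occurs.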
+ */ + smcm->smcm_status |= SMRT_CMD_STATUS_ABANDONED; + smcm->smcm_status &= ~SMRT_CMD_STATUS_POLLED; + mutex_exit(&smrt->smrt_mutex); + return (r); + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_RESET_SENT) { + /* + * The controller was reset while we were trying to discover + * logical volumes. Report failure. + */ + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (EIO); + } + + if (smcm->smcm_status & SMRT_CMD_STATUS_ERROR) { + ErrorInfo_t *ei = smcm->smcm_va_err; + + if (ei->CommandStatus != CISS_CMD_DATA_UNDERRUN) { + dev_err(smrt->smrt_dip, CE_WARN, "physical target " + "SATA WWN error: status 0x%x", ei->CommandStatus); + mutex_exit(&smrt->smrt_mutex); + smrt_command_free(smcm); + return (EIO); + } + resid = ei->ResidualCnt; + } else { + resid = 0; + } + + mutex_exit(&smrt->smrt_mutex); + + /* + * We must have at least 12 bytes. The first four bytes are the header, + * the next four are for the LUN header, and the last 8 are for the + * actual WWN, which according to SAT-2 will always be first. + */ + if (SMRT_SATA_INQ83_LEN - resid < 16) { + smrt_command_free(smcm); + return (EINVAL); + } + inq = smcm->smcm_internal->smcmi_va; + + /* + * Sanity check we have the right page. + */ + if (inq[1] != 0x83) { + smrt_command_free(smcm); + return (EINVAL); + } + + /* + * Check to see if we have a proper Network Address Authority (NAA) + * based world wide number for this LUN. It is possible that firmware + * interposes on this and constructs a fake world wide number (WWN). If + * this is the case, we don't want to actually use it. We need to + * verify that the WWN declares the correct naming authority and is of + * the proper length. + */ + if ((inq[5] & 0x30) != 0 || (inq[5] & 0x0f) != 3 || inq[7] != 8) { + smrt_command_free(smcm); + return (ENOTSUP); + } + + bcopy(&inq[8], &wwn, sizeof (uint64_t)); + *wwnp = BE_64(wwn); + + smrt_command_free(smcm); + + return (0); +} diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index 769592a3e2..95f0b3b82a 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -26,6 +26,7 @@ * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2017 Nexenta Systems, Inc. */ /* @@ -3497,9 +3498,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc) * according to the successful response to the page * 0x2A mode sense request. */ - scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, - "sd_set_mmc_caps: Mode Sense returned " - "invalid block descriptor length\n"); + /* + * The following warning occurs due to the KVM CD-ROM + * mishandling the multi-media commands. Ignore it. + * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, + * "sd_set_mmc_caps: Mode Sense returned " + * "invalid block descriptor length\n"); + */ kmem_free(buf, BUFLEN_MODE_CDROM_CAP); return; } @@ -4444,18 +4449,77 @@ sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen) { struct scsi_inquiry *sd_inq; int rval = SD_SUCCESS; + char *p; + int chk_vidlen = 0, chk_pidlen = 0; + int has_tail = 0; + static const int VSZ = sizeof (sd_inq->inq_vid); + static const int PSZ = sizeof (sd_inq->inq_pid); ASSERT(un != NULL); sd_inq = un->un_sd->sd_inq; ASSERT(id != NULL); /* - * We use the inq_vid as a pointer to a buffer containing the - * vid and pid and use the entire vid/pid length of the table - * entry for the comparison. 
This works because the inq_pid - * data member follows inq_vid in the scsi_inquiry structure. + * We would like to use the inq_vid as a pointer to a buffer + * containing the vid and pid and use the entire vid/pid length of + * the table entry for the comparison. However, this does not work + * because, while the inq_pid data member follows inq_vid in the + * scsi_inquiry structure, we do not control the contents of this + * buffer, and some broken devices violate SPC 4.3.1 and return + * fields with null bytes in them. */ - if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) { + chk_vidlen = MIN(VSZ, idlen); + p = id + chk_vidlen - 1; + while (*p == ' ' && chk_vidlen > 0) { + --p; + --chk_vidlen; + } + + /* + * If it's all spaces, check the whole thing. + */ + if (chk_vidlen == 0) + chk_vidlen = MIN(VSZ, idlen); + + if (idlen > VSZ) { + chk_pidlen = idlen - VSZ; + p = id + idlen - 1; + while (*p == ' ' && chk_pidlen > 0) { + --p; + --chk_pidlen; + } + if (chk_pidlen == 0) + chk_pidlen = MIN(PSZ, idlen - VSZ); + } + + /* + * There's one more thing we need to do here. If the user specified + * an ID with trailing spaces, we need to make sure the inquiry + * vid/pid has only spaces or NULs after the check length; otherwise, it + * can't match. + */ + if (idlen > chk_vidlen && chk_vidlen < VSZ) { + for (p = sd_inq->inq_vid + chk_vidlen; + p < sd_inq->inq_vid + VSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) { + for (p = sd_inq->inq_pid + chk_pidlen; + p < sd_inq->inq_pid + PSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + + if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 || + (idlen > VSZ && + strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) { /* * The user id string is compared to the inquiry vid/pid * using a case insensitive comparison and ignoring @@ -6722,7 +6786,7 @@ sdpower(dev_info_t *devi, int component, int level) time_t intvlp; struct pm_trans_data sd_pm_tran_data; uchar_t save_state = SD_STATE_NORMAL; - int sval; + int sval, tursval = 0; uchar_t state_before_pm; int got_semaphore_here; sd_ssc_t *ssc; @@ -7039,13 +7103,26 @@ sdpower(dev_info_t *devi, int component, int level) * a deadlock on un_pm_busy_cv will occur. */ if (SD_PM_IS_IO_CAPABLE(un, level)) { - sval = sd_send_scsi_TEST_UNIT_READY(ssc, + tursval = sd_send_scsi_TEST_UNIT_READY(ssc, SD_DONT_RETRY_TUR | SD_BYPASS_PM); - if (sval != 0) + if (tursval != 0) sd_ssc_assessment(ssc, SD_FMT_IGNORE); } - if (un->un_f_power_condition_supported) { + /* + * We've encountered certain classes of drives that pass a TUR, but fail + * the START STOP UNIT when using power conditions, or worse leave the + * drive in an unusable state despite passing SSU. Strictly speaking, + * for SPC-4 or greater, no additional actions are required to make the + * drive operational when a TUR passes. If we have something that + * matches this condition, we continue on and presume the drive is + * successfully powered on. 
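+	 * Note that tursval is only meaningful when SD_PM_IS_IO_CAPABLE()
+	 * allowed the TEST UNIT READY above to be issued at all, which is
+	 * why that check is repeated in the condition below.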
+ */ + if (un->un_f_power_condition_supported && + SD_SCSI_VERS_IS_GE_SPC_4(un) && SD_PM_IS_IO_CAPABLE(un, level) && + level == SD_SPINDLE_ACTIVE && tursval == 0) { + sval = 0; + } else if (un->un_f_power_condition_supported) { char *pm_condition_name[] = {"STOPPED", "STANDBY", "IDLE", "ACTIVE"}; SD_TRACE(SD_LOG_IO_PM, un, @@ -7065,6 +7142,7 @@ sdpower(dev_info_t *devi, int component, int level) sd_ssc_assessment(ssc, SD_FMT_STATUS_CHECK); else sd_ssc_assessment(ssc, SD_FMT_IGNORE); + } /* Command failed, check for media present. */ @@ -14213,15 +14291,21 @@ sd_initpkt_for_uscsi(struct buf *bp, struct scsi_pkt **pktpp) /* * Allocate the scsi_pkt for the command. + * * Note: If PKT_DMA_PARTIAL flag is set, scsi_vhci binds a path * during scsi_init_pkt time and will continue to use the * same path as long as the same scsi_pkt is used without - * intervening scsi_dma_free(). Since uscsi command does + * intervening scsi_dmafree(). Since uscsi command does * not call scsi_dmafree() before retry failed command, it * is necessary to make sure PKT_DMA_PARTIAL flag is NOT * set such that scsi_vhci can use other available path for * retry. Besides, ucsci command does not allow DMA breakup, * so there is no need to set PKT_DMA_PARTIAL flag. + * + * More fundamentally, we can't support breaking up this DMA into + * multiple windows on x86. There is, in general, no guarantee + * that arbitrary SCSI commands are idempotent, which is required + * if we want to use multiple windows for a given command. */ if (uscmd->uscsi_rqlen > SENSE_LENGTH) { pktp = scsi_init_pkt(SD_ADDRESS(un), NULL, @@ -22680,6 +22764,7 @@ sdioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cred_p, int *rval_p) case MHIOCGRP_REGISTERANDIGNOREKEY: case CDROMCLOSETRAY: case USCSICMD: + case USCSIMAXXFER: goto skip_ready_valid; default: break; @@ -23116,6 +23201,23 @@ skip_ready_valid: } break; + case USCSIMAXXFER: + SD_TRACE(SD_LOG_IOCTL, un, "USCSIMAXXFER\n"); + cr = ddi_get_cred(); + if ((drv_priv(cred_p) != 0) && (drv_priv(cr) != 0)) { + err = EPERM; + } else { + const uscsi_xfer_t xfer = un->un_max_xfer_size; + + if (ddi_copyout(&xfer, (void *)arg, sizeof (xfer), + flag) != 0) { + err = EFAULT; + } else { + err = 0; + } + } + break; + case CDROMPAUSE: case CDROMRESUME: SD_TRACE(SD_LOG_IOCTL, un, "PAUSE-RESUME\n"); @@ -31297,7 +31399,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) if (SD_PM_CAPABLE_IS_UNDEFINED(pm_cap)) { un->un_f_log_sense_supported = TRUE; if (!un->un_f_power_condition_disabled && - SD_INQUIRY(un)->inq_ansi == 6) { + SD_SCSI_VERS_IS_GE_SPC_4(un)) { un->un_f_power_condition_supported = TRUE; } } else { @@ -31315,7 +31417,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) /* SD_PM_CAPABLE_IS_TRUE case */ un->un_f_pm_supported = TRUE; if (!un->un_f_power_condition_disabled && - SD_PM_CAPABLE_IS_SPC_4(pm_cap)) { + (SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap))) { un->un_f_power_condition_supported = TRUE; } diff --git a/usr/src/uts/common/io/signalfd.c b/usr/src/uts/common/io/signalfd.c index 6416d6d45a..2a1cf25917 100644 --- a/usr/src/uts/common/io/signalfd.c +++ b/usr/src/uts/common/io/signalfd.c @@ -107,6 +107,7 @@ #include <sys/schedctl.h> #include <sys/id_space.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/disp.h> #include <sys/taskq_impl.h> @@ -459,6 +460,9 @@ consume_signal(k_sigset_t set, uio_t *uio, boolean_t block) lwp->lwp_extsig = 0; mutex_exit(&p->p_lock); + if (PROC_IS_BRANDED(p) && BROP(p)->b_sigfd_translate) + BROP(p)->b_sigfd_translate(infop); + /* Convert 
k_siginfo into external, datamodel independent, struct. */ bzero(ssp, sizeof (*ssp)); ssp->ssi_signo = infop->si_signo; diff --git a/usr/src/uts/common/io/simnet/simnet.c b/usr/src/uts/common/io/simnet/simnet.c index 727fbbad8e..9bfe2fe7cf 100644 --- a/usr/src/uts/common/io/simnet/simnet.c +++ b/usr/src/uts/common/io/simnet/simnet.c @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ /* @@ -795,12 +797,6 @@ simnet_m_tx(void *arg, mblk_t *mp_chain) continue; } - /* Fix mblk checksum as the pkt dest is local */ - if ((mp = mac_fix_cksum(mp)) == NULL) { - sdev->sd_stats.xmit_errors++; - continue; - } - /* Hold reference for taskq receive processing per-pkt */ if (!simnet_thread_ref(sdev_rx)) { freemsg(mp); diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index e9af19ca18..994ca8baa8 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -1451,6 +1451,16 @@ copyb(mblk_t *bp) ndp = nbp->b_datap; /* + * Copy the various checksum information that came in + * originally. + */ + ndp->db_cksumstart = dp->db_cksumstart; + ndp->db_cksumend = dp->db_cksumend; + ndp->db_cksumstuff = dp->db_cksumstuff; + bcopy(dp->db_struioun.data, ndp->db_struioun.data, + sizeof (dp->db_struioun.data)); + + /* * Well, here is a potential issue. If we are trying to * trace a flow, and we copy the message, we might lose * information about where this message might have been. diff --git a/usr/src/uts/common/io/tl.c b/usr/src/uts/common/io/tl.c index 3e2acab1ec..81a0cac18c 100644 --- a/usr/src/uts/common/io/tl.c +++ b/usr/src/uts/common/io/tl.c @@ -1419,8 +1419,9 @@ tl_closeok(tl_endpt_t *tep) static int tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp) { - tl_endpt_t *tep; - minor_t minor = getminor(*devp); + tl_endpt_t *tep; + minor_t minor = getminor(*devp); + id_t inst_minor; /* * Driver is called directly. Both CLONEOPEN and MODOPEN @@ -1440,6 +1441,14 @@ tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp) minor |= TL_SOCKET; } + /* + * Attempt to allocate a unique minor number for this instance. + * Avoid an uninterruptable sleep if none are available. + */ + if ((inst_minor = id_alloc_nosleep(tl_minors)) == -1) { + return (ENOMEM); + } + tep = kmem_cache_alloc(tl_cache, KM_SLEEP); tep->te_refcnt = 1; tep->te_cpid = curproc->p_pid; @@ -1451,9 +1460,7 @@ tl_open(queue_t *rq, dev_t *devp, int oflag, int sflag, cred_t *credp) tep->te_flag = minor & TL_MINOR_MASK; tep->te_transport = &tl_transports[minor]; - - /* Allocate a unique minor number for this instance. */ - tep->te_minor = (minor_t)id_alloc(tl_minors); + tep->te_minor = (minor_t)inst_minor; /* Reserve hash handle for bind(). */ (void) mod_hash_reserve(tep->te_addrhash, &tep->te_hash_hndl); diff --git a/usr/src/uts/common/io/usb/clients/hid/hid.c b/usr/src/uts/common/io/usb/clients/hid/hid.c index 084fa7fedc..eccd48bf08 100644 --- a/usr/src/uts/common/io/usb/clients/hid/hid.c +++ b/usr/src/uts/common/io/usb/clients/hid/hid.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. 
*/ @@ -139,6 +139,12 @@ static int hid_info(dev_info_t *, ddi_info_cmd_t, void *, void **); static int hid_attach(dev_info_t *, ddi_attach_cmd_t); static int hid_detach(dev_info_t *, ddi_detach_cmd_t); static int hid_power(dev_info_t *, int, int); +/* These are to enable ugen support: */ +static int hid_chropen(dev_t *, int, int, cred_t *); +static int hid_chrclose(dev_t, int, int, cred_t *); +static int hid_read(dev_t, struct uio *, cred_t *); +static int hid_write(dev_t, struct uio *, cred_t *); +static int hid_poll(dev_t, short, int, short *, struct pollhead **); /* * Warlock is not aware of the automatic locking mechanisms for @@ -198,18 +204,18 @@ struct streamtab hid_streamtab = { }; struct cb_ops hid_cb_ops = { - nulldev, /* open */ - nulldev, /* close */ + hid_chropen, /* open */ + hid_chrclose, /* close */ nulldev, /* strategy */ nulldev, /* print */ nulldev, /* dump */ - nulldev, /* read */ - nulldev, /* write */ + hid_read, /* read */ + hid_write, /* write */ nulldev, /* ioctl */ nulldev, /* devmap */ nulldev, /* mmap */ nulldev, /* segmap */ - nochpoll, /* poll */ + hid_poll, /* poll */ ddi_prop_op, /* cb_prop_op */ &hid_streamtab, /* streamtab */ D_MP | D_MTPERQ @@ -349,6 +355,7 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) usb_alt_if_data_t *altif_data; char minor_name[HID_MINOR_NAME_LEN]; usb_ep_data_t *ep_data; + usb_ugen_info_t usb_ugen_info; switch (cmd) { case DDI_ATTACH: @@ -491,6 +498,28 @@ hid_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) usb_free_dev_data(dip, dev_data); hidp->hid_dev_data = NULL; + if (usb_owns_device(dip)) { + /* Get a ugen handle. */ + bzero(&usb_ugen_info, sizeof (usb_ugen_info)); + + usb_ugen_info.usb_ugen_flags = 0; + usb_ugen_info.usb_ugen_minor_node_ugen_bits_mask = + (dev_t)HID_MINOR_UGEN_BITS_MASK; + usb_ugen_info.usb_ugen_minor_node_instance_mask = + (dev_t)HID_MINOR_INSTANCE_MASK; + hidp->hid_ugen_hdl = usb_ugen_get_hdl(dip, &usb_ugen_info); + + if (usb_ugen_attach(hidp->hid_ugen_hdl, cmd) != + USB_SUCCESS) { + USB_DPRINTF_L2(PRINT_MASK_ATTA, + hidp->hid_log_handle, + "usb_ugen_attach failed"); + + usb_ugen_release_hdl(hidp->hid_ugen_hdl); + hidp->hid_ugen_hdl = NULL; + } + } + /* * Don't get the report descriptor if parsing hid descriptor earlier * failed since device probably won't return valid report descriptor @@ -769,6 +798,149 @@ hid_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) return (rval); } +static int +hid_chropen(dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int rval; + minor_t minor = getminor(*devp); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + hid_pm_busy_component(hidp); + (void) pm_raise_power(hidp->hid_dip, 0, USB_DEV_OS_FULL_PWR); + + mutex_enter(&hidp->hid_mutex); + + rval = usb_ugen_open(hidp->hid_ugen_hdl, devp, flag, + sflag, credp); + + mutex_exit(&hidp->hid_mutex); + + if (rval != 0) { + hid_pm_idle_component(hidp); + } + + return (rval); +} + +static int +hid_chrclose(dev_t dev, int flag, int otyp, cred_t *credp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + mutex_enter(&hidp->hid_mutex); + + rval = usb_ugen_close(hidp->hid_ugen_hdl, dev, flag, + otyp, credp); + + 
mutex_exit(&hidp->hid_mutex); + + if (rval == 0) { + hid_pm_idle_component(hidp); + } + + return (rval); +} + +static int +hid_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + rval = usb_ugen_read(hidp->hid_ugen_hdl, dev, uiop, credp); + + return (rval); +} + +static int +hid_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + rval = usb_ugen_write(hidp->hid_ugen_hdl, dev, uiop, credp); + + return (rval); +} + +static int +hid_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int rval; + minor_t minor = getminor(dev); + int instance; + hid_state_t *hidp; + + instance = HID_MINOR_TO_INSTANCE(minor); + + hidp = ddi_get_soft_state(hid_statep, instance); + if (hidp == NULL) { + return (ENXIO); + } + + if (!HID_IS_UGEN_OPEN(minor)) { + return (ENXIO); + } + + rval = usb_ugen_poll(hidp->hid_ugen_hdl, dev, events, anyyet, + reventsp, phpp); + + return (rval); +} + /* * hid_open : * Open entry point: Opens the interrupt pipe. Sets up queues. @@ -787,13 +959,21 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) hidp = ddi_get_soft_state(hid_statep, instance); if (hidp == NULL) { - return (ENXIO); } USB_DPRINTF_L4(PRINT_MASK_OPEN, hidp->hid_log_handle, "hid_open: Begin"); + /* + * If this is a ugen device, return ENOSTR (no streams). This will + * cause spec_open to try hid_chropen from our regular ops_cb instead + * (and thus treat us as a plain character device). + */ + if (HID_IS_UGEN_OPEN(minor)) { + return (ENOSTR); + } + if (sflag) { /* clone open NOT supported here */ return (ENXIO); @@ -803,6 +983,8 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (EIO); } + mutex_enter(&hidp->hid_mutex); + /* * This is a workaround: * Currently, if we open an already disconnected device, and send @@ -812,7 +994,6 @@ hid_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * The consconfig_dacf module need this interface to detect if the * device is already disconnnected. 
*/ - mutex_enter(&hidp->hid_mutex); if (HID_IS_INTERNAL_OPEN(minor) && (hidp->hid_dev_state == USB_DEV_DISCONNECTED)) { mutex_exit(&hidp->hid_mutex); @@ -1688,6 +1869,11 @@ hid_cpr_suspend(hid_state_t *hidp) } mutex_exit(&hidp->hid_mutex); + if ((retval == USB_SUCCESS) && hidp->hid_ugen_hdl != NULL) { + retval = usb_ugen_detach(hidp->hid_ugen_hdl, + DDI_SUSPEND); + } + return (retval); } @@ -1699,6 +1885,10 @@ hid_cpr_resume(hid_state_t *hidp) "hid_cpr_resume: dip=0x%p", (void *)hidp->hid_dip); hid_restore_device_state(hidp->hid_dip, hidp); + + if (hidp->hid_ugen_hdl != NULL) { + (void) usb_ugen_attach(hidp->hid_ugen_hdl, DDI_RESUME); + } } @@ -2136,6 +2326,12 @@ hid_detach_cleanup(dev_info_t *dip, hid_state_t *hidp) hidp->hid_pm = NULL; } + if (hidp->hid_ugen_hdl != NULL) { + rval = usb_ugen_detach(hidp->hid_ugen_hdl, DDI_DETACH); + VERIFY0(rval); + usb_ugen_release_hdl(hidp->hid_ugen_hdl); + } + mutex_exit(&hidp->hid_mutex); if (hidp->hid_report_descr != NULL) { diff --git a/usr/src/uts/common/io/usb/hcd/xhci/xhci.c b/usr/src/uts/common/io/usb/hcd/xhci/xhci.c index 84bd1a13c9..d404ea1d18 100644 --- a/usr/src/uts/common/io/usb/hcd/xhci/xhci.c +++ b/usr/src/uts/common/io/usb/hcd/xhci/xhci.c @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2019, Joyent, Inc. */ /* @@ -86,39 +86,39 @@ * * There are four different kinds of endpoints: * - * BULK These transfers are large transfers of data to or from - * a device. The most common use for bulk transfers is for - * mass storage devices. Though they are often also used by - * network devices and more. Bulk endpoints do not have an - * explicit time component to them. They are always used - * for one-shot transfers. - * - * CONTROL These transfers are used to manipulate devices - * themselves and are used for USB protocol level - * operations (whether device-specific, class-specific, or - * generic across all of USB). Unlike other transfers, - * control transfers are always bi-directional and use - * different kinds of transfers. - * - * INTERRUPT Interrupt transfers are used for small transfers that - * happen infrequently, but need reasonable latency. A good - * example of interrupt transfers is to receive input from - * a USB keyboard. Interrupt-IN transfers are generally - * polled. Meaning that a client (device driver) opens up - * an interrupt-IN pipe to poll on it, and receives - * periodic updates whenever there is information - * available. However, Interrupt transfers can be used - * as one-shot transfers both going IN and OUT. - * - * ISOCHRONOUS These transfers are things that happen once per - * time-interval at a very regular rate. A good example of - * these transfers are for audio and video. A device may - * describe an interval as 10ms at which point it will read - * or write the next batch of data every 10ms and transform - * it for the user. There are no one-shot Isochronous-IN - * transfers. There are one-shot Isochronous-OUT transfers, - * but these are used by device drivers to always provide - * the system with sufficient data. + * BULK These transfers are large transfers of data to or from + * a device. The most common use for bulk transfers is for + * mass storage devices. Though they are often also used by + * network devices and more. Bulk endpoints do not have an + * explicit time component to them. They are always used + * for one-shot transfers. 
+ * + * CONTROL These transfers are used to manipulate devices + * themselves and are used for USB protocol level + * operations (whether device-specific, class-specific, or + * generic across all of USB). Unlike other transfers, + * control transfers are always bi-directional and use + * different kinds of transfers. + * + * INTERRUPT Interrupt transfers are used for small transfers that + * happen infrequently, but need reasonable latency. A good + * example of interrupt transfers is to receive input from + * a USB keyboard. Interrupt-IN transfers are generally + * polled. Meaning that a client (device driver) opens up + * an interrupt-IN pipe to poll on it, and receives + * periodic updates whenever there is information + * available. However, Interrupt transfers can be used + * as one-shot transfers both going IN and OUT. + * + * ISOCHRONOUS These transfers are things that happen once per + * time-interval at a very regular rate. A good example of + * these transfers are for audio and video. A device may + * describe an interval as 10ms at which point it will read + * or write the next batch of data every 10ms and transform + * it for the user. There are no one-shot Isochronous-IN + * transfers. There are one-shot Isochronous-OUT transfers, + * but these are used by device drivers to always provide + * the system with sufficient data. * * To find out information about the endpoints, USB devices have a series of * descriptors that cover different aspects of the device. For example, there @@ -220,67 +220,67 @@ * purpose of each of these files. * * xhci_command.c: This file contains the logic to issue commands to the - * controller as well as the actual functions that the - * other parts of the driver use to cause those commands. + * controller as well as the actual functions that the + * other parts of the driver use to cause those commands. * * xhci_context.c: This file manages various data structures used by the - * controller to manage the controller's and device's - * context data structures. See more in the xHCI Overview - * and General Design for more information. + * controller to manage the controller's and device's + * context data structures. See more in the xHCI Overview + * and General Design for more information. * * xhci_dma.c: This manages the allocation of DMA memory and DMA - * attributes for controller, whether memory is for a - * transfer or something else. This file also deals with - * all the logic of getting data in and out of DMA buffers. + * attributes for controller, whether memory is for a + * transfer or something else. This file also deals with + * all the logic of getting data in and out of DMA buffers. * * xhci_endpoint.c: This manages all of the logic of handling endpoints or - * pipes. It deals with endpoint configuration, I/O - * scheduling, timeouts, and callbacks to USBA. + * pipes. It deals with endpoint configuration, I/O + * scheduling, timeouts, and callbacks to USBA. * * xhci_event.c: This manages callbacks from the hardware to the driver. - * This covers command completion notifications and I/O - * notifications. + * This covers command completion notifications and I/O + * notifications. * * xhci_hub.c: This manages the virtual root-hub. It basically - * implements and translates all of the USB level requests - * into xhci specific implements. It also contains the - * functions to register this hub with USBA. + * implements and translates all of the USB level requests + * into xhci specific implements. 
It also contains the + * functions to register this hub with USBA. * * xhci_intr.c: This manages the underlying interrupt allocation, - * interrupt moderation, and interrupt routines. + * interrupt moderation, and interrupt routines. * * xhci_quirks.c: This manages information about buggy hardware that's - * been collected and experienced primarily from other - * systems. + * been collected and experienced primarily from other + * systems. * * xhci_ring.c: This manages the abstraction of a ring in xhci, which is - * the primary of communication between the driver and the - * hardware, whether for the controller or a device. + * the primary of communication between the driver and the + * hardware, whether for the controller or a device. * * xhci_usba.c: This implements all of the HCDI functions required by - * USBA. This is the main entry point that drivers and the - * kernel frameworks will reach to start any operation. - * Many functions here will end up in the command and - * endpoint code. + * USBA. This is the main entry point that drivers and the + * kernel frameworks will reach to start any operation. + * Many functions here will end up in the command and + * endpoint code. * * xhci.c: This provides the main kernel DDI interfaces and - * performs device initialization. + * performs device initialization. * * xhci.h: This is the primary header file which defines - * illumos-specific data structures and constants to manage - * the system. + * illumos-specific data structures and constants to manage + * the system. * * xhcireg.h: This header file defines all of the register offsets, - * masks, and related macros. It also contains all of the - * constants that are used in various structures as defined - * by the specification, such as command offsets, etc. + * masks, and related macros. It also contains all of the + * constants that are used in various structures as defined + * by the specification, such as command offsets, etc. * * xhci_ioctl.h: This contains a few private ioctls that are used by a - * private debugging command. These are private. + * private debugging command. These are private. * * cmd/xhci/xhci_portsc: This is a private utility that can be useful for - * debugging xhci state. It is the only consumer of - * xhci_ioctl.h and the private ioctls. + * debugging xhci state. It is the only consumer of + * xhci_ioctl.h and the private ioctls. * * ---------------------------------- * xHCI Overview and Structure Layout @@ -1444,12 +1444,15 @@ xhci_find_ext_cap(xhci_t *xhcip, uint32_t id, uint32_t init, uint32_t *outp) static boolean_t xhci_port_count(xhci_t *xhcip) { - uint_t nusb2 = 0, nusb3 = 0; + uint_t nusb2 = 0, fusb2 = 0; + uint_t nusb30 = 0, fusb30 = 0; + uint_t nusb31 = 0, fusb31 = 0; uint32_t off = UINT32_MAX; while (xhci_find_ext_cap(xhcip, XHCI_ID_PROTOCOLS, off, &off) == B_TRUE) { uint32_t rvers, rport; + uint8_t maj, min, count, first; /* * See xHCI 1.1 / 7.2 for the format of this. 
The first uint32_t @@ -1466,23 +1469,48 @@ xhci_port_count(xhci_t *xhcip) return (B_FALSE); } - rvers = XHCI_XECP_PROT_MAJOR(rvers); - rport = XHCI_XECP_PROT_PCOUNT(rport); - - if (rvers == 3) { - nusb3 += rport; - } else if (rvers <= 2) { - nusb2 += rport; + maj = XHCI_XECP_PROT_MAJOR(rvers); + min = XHCI_XECP_PROT_MINOR(rvers); + count = XHCI_XECP_PROT_PCOUNT(rport); + first = XHCI_XECP_PROT_FPORT(rport); + + if (maj == 3 && min == 1) { + nusb31 = count; + fusb31 = first; + } else if (maj == 3 && min == 0) { + nusb30 = count; + fusb30 = first; + } else if (maj <= 2) { + nusb2 = count; + fusb2 = first; } else { xhci_error(xhcip, "encountered port capabilities with " - "unknown major USB version: %d\n", rvers); + "unknown USB version: %x.%x\n", maj, min); } } - (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, - "usb2-capable-ports", nusb2); - (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, - "usb3-capable-ports", nusb3); + /* + * These properties are used by FMA and the USB topo module. + */ + if (nusb2 > 0) { + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb2.0-port-count", nusb2); + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb2.0-first-port", fusb2); + } + if (nusb30 > 0) { + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb3.0-port-count", nusb30); + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb3.0-first-port", fusb30); + } + + if (nusb31 > 0) { + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb3.1-port-count", nusb30); + (void) ddi_prop_update_int(DDI_DEV_T_NONE, xhcip->xhci_dip, + "usb3.1-first-port", fusb31); + } return (B_TRUE); } @@ -2183,7 +2211,7 @@ static struct dev_ops xhci_dev_ops = { &xhci_cb_ops, /* devo_cb_ops */ &usba_hubdi_busops, /* devo_bus_ops */ usba_hubdi_root_hub_power, /* devo_power */ - ddi_quiesce_not_supported /* devo_quiesce */ + ddi_quiesce_not_supported /* devo_quiesce */ }; static struct modldrv xhci_modldrv = { diff --git a/usr/src/uts/common/io/vioif/vioif.c b/usr/src/uts/common/io/vioif/vioif.c index d5dd1e8e39..3b6660c542 100644 --- a/usr/src/uts/common/io/vioif/vioif.c +++ b/usr/src/uts/common/io/vioif/vioif.c @@ -11,6 +11,7 @@ /* * Copyright 2013 Nexenta Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2014, 2016 by Delphix. All rights reserved. * Copyright 2015 Joyent, Inc. */ @@ -909,16 +910,17 @@ vioif_reclaim_used_tx(struct vioif_softc *sc) while ((ve = virtio_pull_chain(sc->sc_tx_vq, &len))) { /* We don't chain descriptors for tx, so don't expect any. */ - ASSERT(!ve->qe_next); + ASSERT(ve->qe_next == NULL); buf = &sc->sc_txbufs[ve->qe_index]; mp = buf->tb_mp; buf->tb_mp = NULL; if (mp != NULL) { - for (int i = 0; i < buf->tb_external_num; i++) + for (int i = 0; i < buf->tb_external_num; i++) { (void) ddi_dma_unbind_handle( buf->tb_external_mapping[i].vbm_dmah); + } } virtio_free_chain(ve); diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c new file mode 100644 index 0000000000..198c14d4be --- /dev/null +++ b/usr/src/uts/common/io/vnd/frameio.c @@ -0,0 +1,465 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * Frame I/O utility functions + */ + +#include <sys/frameio.h> + +#include <sys/file.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/inttypes.h> + +static kmem_cache_t *frameio_cache; + +int +frameio_init(void) +{ + frameio_cache = kmem_cache_create("frameio_cache", + sizeof (frameio_t) + sizeof (framevec_t) * FRAMEIO_NVECS_MAX, + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (frameio_cache == NULL) + return (1); + + return (0); +} + +void +frameio_fini(void) +{ + if (frameio_cache != NULL) + kmem_cache_destroy(frameio_cache); +} + +frameio_t * +frameio_alloc(int kmflags) +{ + return (kmem_cache_alloc(frameio_cache, kmflags)); +} + +void +frameio_free(frameio_t *fio) +{ + kmem_cache_free(frameio_cache, fio); +} + +/* + * Ensure that we don't see any garbage in the framevecs that we're nominally + * supposed to work with. Specifically we want to make sure that the buflen and + * the address are not zero. + */ +static int +frameio_hdr_check_vecs(frameio_t *fio) +{ + int i; + for (i = 0; i < fio->fio_nvecs; i++) + if (fio->fio_vecs[i].fv_buf == NULL || + fio->fio_vecs[i].fv_buflen == 0) + return (EINVAL); + + return (0); +} + +/* + * We have to copy in framevec32_t's. To work around the data model issues and + * trying not to copy memory we first copy in the framevec32_t data into the + * standard fio_vec space. Next we work backwards copying a given framevec32_t + * to a temporaory framevec_t and then overwrite the frameio_t's data. Note that + * it is important that we do this in reverse so as to ensure that we don't + * clobber data as the framevec_t is larger than the framevec32_t. + */ +static int +frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr) +{ + framevec32_t *vec32p; + framevec_t fv; + int i; + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + + if (ddi_copyin(addr, vec32p, sizeof (framevec32_t) * fio->fio_nvecs, + 0) != 0) + return (EFAULT); + + for (i = fio->fio_nvecs - 1; i >= 0; i--) { + fv.fv_buf = (void *)(uintptr_t)vec32p[i].fv_buf; + fv.fv_buflen = vec32p[i].fv_buflen; + fv.fv_actlen = vec32p[i].fv_actlen; + fio->fio_vecs[i].fv_buf = fv.fv_buf; + fio->fio_vecs[i].fv_buflen = fv.fv_buflen; + fio->fio_vecs[i].fv_actlen = fv.fv_actlen; + } + + return (frameio_hdr_check_vecs(fio)); +} + +/* + * Copy in a frame io header into fio with space for up to nvecs. If the frameio + * contains more vectors than specified it will be ignored. mode should contain + * information about the datamodel. + */ +int +frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode) +{ + int model = ddi_model_convert_from(mode & FMODELS); + int cpf = mode & FKIOCTL ? FKIOCTL : 0; + size_t fsize = model == DDI_MODEL_ILP32 ? + sizeof (frameio32_t) : sizeof (frameio_t); + + /* + * The start of the header is the same in all data models for the + * current verison. 
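+	 * Its overall size differs (sizeof (frameio32_t) vs.
+	 * sizeof (frameio_t)), which is why fsize is chosen per model
+	 * above; the vectors that follow are copied separately, either
+	 * directly for the native model or widened in place by
+	 * frameio_hdr_copyin_ilp32() for ILP32 consumers.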
+ */ + if (ddi_copyin(addr, fio, fsize, cpf) != 0) + return (EFAULT); + + if (fio->fio_version != FRAMEIO_VERSION_ONE) + return (EINVAL); + + if (fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0) + return (EINVAL); + + if (fio->fio_nvpf == 0) + return (EINVAL); + + if (fio->fio_nvecs % fio->fio_nvpf != 0) + return (EINVAL); + + if (fio->fio_nvecs > max_vecs) + return (EOVERFLOW); + + addr = (void *)((uintptr_t)addr + fsize); + if (model == DDI_MODEL_ILP32) { + if (cpf != 0) + return (EINVAL); + return (frameio_hdr_copyin_ilp32(fio, addr)); + } + + if (ddi_copyin(addr, &fio->fio_vecs[0], + sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0) + return (EFAULT); + + return (frameio_hdr_check_vecs(fio)); +} + +static mblk_t * +frameio_allocb(size_t sz) +{ + mblk_t *mp; + + mp = allocb(sz, 0); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_DATA; + return (mp); +} + +static int +framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf) +{ + mblk_t *mp; + cpf = cpf != 0 ? FKIOCTL : 0; + + mp = frameio_allocb(fv->fv_buflen); + + if (mp == NULL) { + freemsg(mp); + return (EAGAIN); + } + + if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen, + cpf) != 0) { + freemsg(mp); + return (EFAULT); + } + + mp->b_wptr += fv->fv_buflen; + *mpp = mp; + return (0); +} + +/* + * Read a set of frame vectors that make up a single message boundary and return + * that as a single message in *mpp that consists of multiple data parts. + */ +static int +frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf) +{ + int nparts = fio->fio_nvpf; + int part, error; + mblk_t *mp; + + *mpp = NULL; + cpf = cpf != 0 ? FKIOCTL : 0; + + /* + * Construct the initial frame + */ + for (part = 0; part < nparts; part++) { + error = framevec_mblk_read(fv, &mp, cpf); + if (error != 0) { + freemsg(*mpp); + return (error); + } + + if (*mpp == NULL) + *mpp = mp; + else + linkb(*mpp, mp); + fv++; + } + + return (0); +} + +/* + * Read data from a series of frameio vectors into a message block chain. A + * given frameio request has a number of discrete messages divided into + * individual vectors based on fio->fio_nvcspframe. Each discrete message will + * be constructed into a message block chain pointed to by b_next. + * + * If we get an EAGAIN while trying to construct a given message block what we + * return depends on what else we've done so far. If we have succesfully + * completed at least one message then we free everything else we've done so + * far and return that. If no messages have been completed we return EAGAIN. If + * instead we encounter a different error, say EFAULT, then all of the fv_actlen + * entries values are undefined. + */ +int +frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf) +{ + int error = ENOTSUP; + int nframes = fio->fio_nvecs / fio->fio_nvpf; + int frame; + framevec_t *fv; + mblk_t *mp, *bmp = NULL; + + /* + * Protect against bogus kernel subsystems. + */ + VERIFY(fio->fio_nvecs > 0); + VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0); + + *mpp = NULL; + cpf = cpf != 0 ? FKIOCTL : 0; + + fv = &fio->fio_vecs[0]; + for (frame = 0; frame < nframes; frame++) { + error = frameio_mblk_read(fio, fv, &mp, cpf); + if (error != 0) + goto failed; + + if (bmp != NULL) + bmp->b_next = mp; + else + *mpp = mp; + bmp = mp; + } + + *nvecs = nframes; + return (0); +failed: + /* + * On EAGAIN we've already taken care of making sure that we have no + * leftover messages, eg. they were never linked in. 
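+	 * When at least one frame was assembled, the error is converted to
+	 * success below and the caller is told how many frames were
+	 * completed.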
+ */ + if (error == EAGAIN) { + if (frame != 0) + error = 0; + if (*nvecs != NULL) + *nvecs = frame; + ASSERT(*mpp != NULL); + } else { + for (mp = *mpp; mp != NULL; mp = bmp) { + bmp = mp->b_next; + freemsg(mp); + } + if (nvecs != NULL) + *nvecs = 0; + *mpp = NULL; + } + return (error); +} + +size_t +frameio_frame_length(frameio_t *fio, framevec_t *fv) +{ + int i; + size_t len = 0; + + for (i = 0; i < fio->fio_nvpf; i++, fv++) + len += fv->fv_buflen; + + return (len); +} + +/* + * Write a portion of an mblk to the current. + */ +static int +framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff, + size_t foff, int cpf) +{ + ASSERT(len <= MBLKL(mp) - moff); + ASSERT(len <= fv->fv_buflen - fv->fv_actlen); + cpf = cpf != 0 ? FKIOCTL : 0; + + if (ddi_copyout(mp->b_rptr + moff, (caddr_t)fv->fv_buf + foff, len, + cpf) != 0) + return (EFAULT); + fv->fv_actlen += len; + + return (0); +} + +/* + * Because copying this out to the user might fail we don't want to update the + * b_rptr in case we need to copy it out again. + */ +static int +framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf) +{ + int err; + size_t msize, blksize, len, moff, foff; + + msize = msgsize(mp); + if (msize > frameio_frame_length(fio, fv)) + return (EOVERFLOW); + + moff = 0; + foff = 0; + blksize = MBLKL(mp); + fv->fv_actlen = 0; + while (msize != 0) { + len = MIN(blksize, fv->fv_buflen - fv->fv_actlen); + err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf); + if (err != 0) + return (err); + + msize -= len; + blksize -= len; + moff += len; + foff += len; + + if (blksize == 0 && msize != 0) { + mp = mp->b_cont; + ASSERT(mp != NULL); + moff = 0; + blksize = MBLKL(mp); + } + + if (fv->fv_buflen == fv->fv_actlen && msize != 0) { + fv++; + fv->fv_actlen = 0; + foff = 0; + } + } + + return (0); +} + +int +frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map, + mblk_t *mp, int *nwrite, int cpf) +{ + int mcount = 0; + int ret = 0; + + if (map != MAP_BLK_FRAME) + return (EINVAL); + + while (mp != NULL && mcount < fio->fio_nvecs) { + ret = framevec_map_blk(fio, &fio->fio_vecs[mcount], mp, cpf); + if (ret != 0) + break; + mcount += fio->fio_nvpf; + mp = mp->b_next; + } + + if (ret != 0 && mcount == 0) { + if (nwrite != NULL) + *nwrite = 0; + return (ret); + } + + if (nwrite != NULL) + *nwrite = mcount / fio->fio_nvpf; + + return (0); +} + +/* + * Copy out nframes worth of frameio header data back to userland. 
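+ * fio_nvecs is clamped to the vectors belonging to the consumed frames and
+ * each fv_actlen is copied back so userland can see how much of every
+ * buffer was used. For ILP32 consumers the framevec_t entries are repacked
+ * in place into framevec32_t form before the copyout.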
+ */ +int +frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode) +{ + int i; + int model = ddi_model_convert_from(mode & FMODELS); + framevec32_t *vec32p; + framevec32_t f; + + if (fio->fio_nvecs / fio->fio_nvpf < nframes) + return (EINVAL); + + fio->fio_nvecs = nframes * fio->fio_nvpf; + + if (model == DDI_MODEL_NONE) { + if (ddi_copyout(fio, addr, + sizeof (frameio_t) + fio->fio_nvecs * sizeof (framevec_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); + } + + ASSERT(model == DDI_MODEL_ILP32); + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + for (i = 0; i < fio->fio_nvecs; i++) { + f.fv_buf = (caddr32_t)(uintptr_t)fio->fio_vecs[i].fv_buf; + if (fio->fio_vecs[i].fv_buflen > UINT_MAX || + fio->fio_vecs[i].fv_actlen > UINT_MAX) + return (EOVERFLOW); + f.fv_buflen = fio->fio_vecs[i].fv_buflen; + f.fv_actlen = fio->fio_vecs[i].fv_actlen; + vec32p[i].fv_buf = f.fv_buf; + vec32p[i].fv_buflen = f.fv_buflen; + vec32p[i].fv_actlen = f.fv_actlen; + } + + if (ddi_copyout(fio, addr, + sizeof (frameio32_t) + fio->fio_nvecs * sizeof (framevec32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +void +frameio_mark_consumed(frameio_t *fio, int nframes) +{ + int i; + + ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes); + for (i = 0; i < nframes * fio->fio_nvpf; i++) + fio->fio_vecs[i].fv_actlen = fio->fio_vecs[i].fv_buflen; +} diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c new file mode 100644 index 0000000000..d03c7ce4ec --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.c @@ -0,0 +1,5857 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +/* + * vnd - virtual (machine) networking datapath + * + * vnd's purpose is to provide a highly performant data path for Layer 2 network + * traffic and exist side by side an active IP netstack, each servicing + * different datalinks. vnd provides many of the same capabilities as the + * current TCP/IP stack does and some specific to layer two. Specifically: + * + * o Use of the DLD fastpath + * o Packet capture hooks + * o Ability to use hardware capabilities + * o Useful interfaces for handling multiple frames + * + * The following image shows where vnd fits into today's networking stack: + * + * +---------+----------+----------+ + * | libdlpi | libvnd | libsocket| + * +---------+----------+----------+ + * | · · VFS | + * | VFS · VFS +----------+ + * | · | sockfs | + * +---------+----------+----------+ + * | | VND | IP | + * | +----------+----------+ + * | DLD/DLS | + * +-------------------------------+ + * | MAC | + * +-------------------------------+ + * | GLDv3 | + * +-------------------------------+ + * + * ----------------------------------------- + * A Tale of Two Devices - DDI Device Basics + * ----------------------------------------- + * + * vnd presents itself to userland as a character device; however, it also is a + * STREAMS device so that it can interface with dld and the rest of the + * networking stack. Users never interface with the STREAMs devices directly and + * they are purely an implementation detail of vnd. 
Opening the STREAMS device + * require kcred and as such userland cannot interact with it or push it onto + * the stream head. + * + * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every + * clone gets its own minor number; however, minor nodes are not created in the + * devices tree for these instances. In this state a user may do two different + * things. They may issue ioctls that affect global state or they may issue + * ioctls that try to attach it to a given datalink. Once a minor device has + * been attached to a datalink, all operations on it are scoped to that context, + * therefore subsequent global operations are not permitted. + * + * A given device can be linked into the /devices and /dev name space via a link + * ioctl. That ioctl causes a minor node to be created in /devices and then it + * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar + * to, but simpler than, IP's persistence mechanism. + * + * --------------------- + * Binding to a datalink + * --------------------- + * + * Datalinks are backed by the dld (datalink device) and dls (datalink services) + * drivers. These drivers provide a STREAMS device for datalinks on the system + * which are exposed through /dev/net. Userland generally manipulates datalinks + * through libdlpi. When an IP interface is being plumbed up what actually + * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink + * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may + * then can negotiate with dld and dls to obtain access to various capabilities + * and fast paths via a series of STREAMS messages. + * + * In vnd, we do the same thing, but we leave our STREAMS module as an + * implementation detail of the system. We don't want users to be able to + * arbitrarily push vnd STREAMS module onto any stream, so we explicitly require + * kcred to manipulate it. Thus, when a user issues a request to attach a + * datalink to a minor instance of the character device, that vnd minor instance + * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink. + * vnd does that open using the passed in credentials from the ioctl, not kcred. + * This ensures that users who doesn't have permissions to open the device + * cannot. Once that's been opened, we push on the vnd streams module. + * + * Once the vnd STREAMS instance has been created for this device, eg. the + * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl + * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices. + * This association begins the STREAM device's initialization. We start up an + * asynchronous state machine that takes care of all the different aspects of + * plumbing up the device with dld and dls and enabling the MAC fast path. We + * need to guarantee to consumers of the character device that by the time their + * ioctl returns, the data path has been fully initialized. + * + * The state progression is fairly linear. There are two general steady states. + * The first is VND_S_ONLINE, which means that everything is jacked up and good + * to go. The alternative is VND_S_ZOMBIE, which means that the streams device + * encountered an error or we have finished tearing it down and the character + * device can clean it up. The following is our state progression and the + * meaning of each state: + * + * | + * | + * V + * +---------------+ + * | VNS_S_INITIAL | This is our initial state. Every + * +---------------+ vnd STREAMS device starts here. 
+ * | While in this state, only dlpi + * | M_PROTO and M_IOCTL messages can be + * | sent or received. All STREAMS based + * | data messages are dropped. + * | We transition out of this state by + * | sending a DL_INFO_REQ to obtain + * | information about the underlying + * | link. + * v + * +-----------------+ + * +--<-| VNS_S_INFO_SENT | In this state, we verify and + * | +-----------------+ record information about the + * | | underlying device. If the device is + * | | not suitable, eg. not of type + * v | DL_ETHER, then we immediately + * | | become a ZOMBIE. To leave this + * | | state we request exclusive active + * | | access to the device via + * v | DL_EXCLUSIVE_REQ. + * | v + * | +----------------------+ + * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether + * | +----------------------+ or not we were able to obtain + * | | | exclusive access to the device. If + * | | | we were not able to, then we leave, + * v | | as that means that something like + * | | | IP is already plumbed up on top of + * | | | the datalink. We leave this state + * | | | by progressing through to the + * | | | appropriate DLPI primitive, either + * v | | DLPI_ATTACH_REQ or DLPI_BIND_REQ + * | | | depending on the style of the + * | | | datalink. + * | | v + * | | +-------------------+ + * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were + * | | +-------------------+ able to perform a standard DLPI + * | | | attach and if so, go ahead and + * v | | send a DLPI_BIND_REQ. + * | v v + * | +-------------------+ + * +--<-| VNS_S_BIND_SENT | In this state we see the result of + * | +-------------------+ our attempt to bind to PPA 0 of the + * v | underlying device. Because we're + * | | trying to be a layer two datapath, + * | | the specific attachment point isn't + * | | too important as we're going to + * v | have to enable promiscuous mode. We + * | | transition out of this by sending + * | | our first of three promiscuous mode + * | | requests. + * v v + * | +------------------------+ + * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we + * | +------------------------+ were able to enable promiscuous + * | | mode at the physical level. We + * | | transition out of this by enabling + * | | multicast and broadcast promiscuous + * v | mode. + * | v + * | +--------------------------+ + * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ have enabled DL_PROMISC_MULTI and + * v | move onto the second promiscuous + * | | mode request. + * | v + * | +----------------------------+ + * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we + * | +----------------------------+ enabled RX_ONLY promiscuous mode. + * | | We specifically do this as we don't + * v | want to receive our own traffic + * | | that we'll send out. We leave this + * | | state by enabling the final flag + * | | DL_PROMISC_FIXUPS. + * | v + * | +--------------------------+ + * +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ enabled FIXUP promiscuous mode. + * | | We specifically do this as we need + * v | to ensure that traffic which is + * | | received by being looped back to us + * | | correctly has checksums fixed. We + * | | leave this state by requesting the + * | | dld/dls capabilities that we can + * v | process. 
+ * | v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of + * | +--------------------+ capabilities that dld advertised + * | | and enable the ones that currently + * v | support for use. See the section + * | | later on regarding capabilities + * | | for more information. We leave this + * | | state by sending an enable request. + * v v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability + * | +--------------------+ initialization. Once finished, we + * | | transition to the next state. If + * v | the dld fast path is not available, + * | | we become a zombie. + * | v + * | +--------------+ + * | | VNS_S_ONLINE | This is a vnd STREAMS device's + * | +--------------+ steady state. It will normally + * | | reside in this state while it is in + * | | active use. It will only transition + * v | to the next state when the STREAMS + * | | device is closed by the character + * | | device. In this state, all data + * | | flows over the dld fast path. + * | v + * | +---------------------+ + * +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of + * | +---------------------+ disabling capabilities and + * | | flushing all data. At this point + * | | any additional data that we receive + * | | will be dropped. We leave this + * v | state by trying to remove multicast + * | | promiscuity. + * | | + * | v + * | +---------------------------------+ + * +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------------+ successfully removed multicast + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. We leave this state by trying + * | | to disable SAP level promiscuous + * | | mode. + * | v + * | +---------------------------+ + * +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------+ successfully removed SAP level + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. Note that we don't worry + * | | about either of + * | | DL_PROMISC_FIXUPS or + * | | DL_PROMISC_RX_ONLY. If these are + * | | the only two entries left, then we + * | | should have anything that MAC is + * | | doing for us at this point, + * | | therefore it's safe for us to + * | | proceed to unbind, which is how we + * | | leave this state via a + * | v DL_UNBIND_REQ. + * | +-------------------+ + * +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind + * | +-------------------+ request went. Regardless of its + * | | success, we always transition to + * | | a zombie state. + * | v + * | +--------------+ + * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS + * +--------------+ device is waiting to finish being + * reaped. Because we have no more + * ways to receive data it should be + * safe to destroy all remaining data + * structures. + * + * If the stream association fails for any reason the state machine reaches + * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the + * STREAMS ioctl to the character device. That will fail the user ioctl and + * propagate the vnd_errno_t back to userland. If, on the other hand, the + * association succeeds, then the vnd STREAMS device will be fully plumbed up + * and ready to transmit and receive message blocks. Consumers will be able to + * start using the other cbops(9E) entry points once the attach has fully + * finished, which will occur after the original user attach ioctl to the + * character device returns. 
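+ *
+ * As a rough sketch only (the real transition paths do more bookkeeping
+ * than this), advancing between the states above amounts to updating the
+ * vnd_str_t members described later in this file and waking anyone who is
+ * waiting for the state machine to settle, e.g.:
+ *
+ *	mutex_enter(&vsp->vns_lock);
+ *	vsp->vns_laststate = vsp->vns_state;
+ *	vsp->vns_state = VNS_S_INFO_SENT;	/* next state */
+ *	cv_broadcast(&vsp->vns_stcv);
+ *	mutex_exit(&vsp->vns_lock);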
+ * + * It's quite important that we end up sending the full series of STREAMS + * messages when tearing down. While it's tempting to say that we should just + * rely on the STREAMS device being closed to properly ensure that we have no + * more additional data, that's not sufficient due to our use of direct + * callbacks. DLS does not ensure that by the time we change the direct + * callback (vnd_mac_input) that all callers to it will have been quiesced. + * However, it does guarantee that if we disable promiscuous mode ourselves and + * we turn off the main data path via DL_UNBIND_REQ that it will work. + * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do + * it as part of tearing down the STREAMS device. This ensures that we'll + * quiesce all data before we destroy our data structures and thus we should + * eliminate the race in changing the data function. + * + * -------------------- + * General Architecture + * -------------------- + * + * There are several different devices and structures in the vnd driver. There + * is a per-netstack component, pieces related to the character device that + * consumers see, the internal STREAMS device state, and the data queues + * themselves. The following ASCII art picture describes their relationships and + * some of the major pieces of data that contain them. These are not exhaustive, + * e.g. synchronization primitives are left out. + * + * +----------------+ +-----------------+ + * | global | | global | + * | device list | | netstack list | + * | vnd_dev_list | | vnd_nsd_list | + * +----------------+ +-----------------+ + * | | + * | v + * | +-------------------+ +-------------------+ + * | | per-netstack data | ---> | per-netstack data | --> ... + * | | vnd_pnsd_t | | vnd_pnsd_t | + * | | | +-------------------+ + * | | | + * | | nestackid_t ---+----> Netstack ID + * | | vnd_pnsd_flags_t -+----> Status flags + * | | zoneid_t ---+----> Zone ID for this netstack + * | | hook_family_t ---+----> VND IPv4 Hooks + * | | hook_family_t ---+----> VND IPv6 Hooks + * | | list_t ----+ | + * | +------------+------+ + * | | + * | v + * | +------------------+ +------------------+ + * | | character device | ---> | character device | -> ... + * +---------->| vnd_dev_t | | vnd_dev_t | + * | | +------------------+ + * | | + * | minor_t ---+--> device minor number + * | ldi_handle_t ---+--> handle to /dev/net/%datalink + * | vnd_dev_flags_t -+--> device flags, non blocking, etc. 
+ * | char[] ---+--> name if linked + * | vnd_str_t * -+ | + * +--------------+---+ + * | + * v + * +-------------------------+ + * | STREAMS device | + * | vnd_str_t | + * | | + * | vnd_str_state_t ---+---> State machine state + * | gsqueue_t * ---+---> mblk_t Serialization queue + * | vnd_str_stat_t ---+---> per-device kstats + * | vnd_str_capab_t ---+----------------------------+ + * | vnd_data_queue_t ---+ | | + * | vnd_data_queue_t -+ | | v + * +-------------------+-+---+ +---------------------+ + * | | | Stream capabilities | + * | | | vnd_str_capab_t | + * | | | | + * | | supported caps <--+-- vnd_capab_flags_t | + * | | dld cap handle <--+-- void * | + * | | direct tx func <--+-- vnd_dld_tx_t | + * | | +---------------------+ + * | | + * +----------------+ +-------------+ + * | | + * v v + * +-------------------+ +-------------------+ + * | Read data queue | | Write data queue | + * | vnd_data_queue_t | | vnd_data_queue_t | + * | | | | + * | size_t ----+--> Current size | size_t ----+--> Current size + * | size_t ----+--> Max size | size_t ----+--> Max size + * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head + * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail + * +-------------------+ +-------------------+ + * + * + * Globally, we maintain two lists. One list contains all of the character + * device soft states. The other maintains a list of all our netstack soft + * states. Each netstack maintains a list of active devices that have been + * associated with a datalink in its netstack. + * + * Recall that a given minor instance of the character device exists in one of + * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node, + * or it can be associated with a given datalink. When minor instances are in + * the former state, they do not exist in a given vnd_pnsd_t's list of devices. + * As part of attaching to a datalink, the given vnd_dev_t will be inserted into + * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a + * vnd_str_t, to be created and associated to a vnd_dev_t. + * + * The character device, and its vnd_dev_t, is the interface to the rest of the + * system. The vnd_dev_t keeps track of various aspects like whether various + * operations, such as read, write and the frameio ioctls, are considered + * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for + * keeping track of things like the name of the device, if any, in /dev. The + * vnd_str_t, on the other hand manages aspects like buffer sizes and the actual + * data queues. However, ioctls that manipulate these properties all go through + * the vnd_dev_t to its associated vnd_str_t. + * + * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One + * for frames to transmit (write queue) and one for frames received (read + * queue). These data queues have a maximum size and attempting to add data + * beyond that maximum size will result in data being dropped. The sizes are + * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits + * in those buffers or has a reservation in those buffers while they are in vnd + * and waiting to be consumed by the user or by mac. + * + * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the + * available, negotiated, and currently active features. 
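+ *
+ * To make the data queue description above concrete, here is a minimal
+ * sketch of the append pattern, using the vnd_dq_* helpers defined later
+ * in this file and eliding reservation and poll notification details:
+ *
+ *	mutex_enter(&vqp->vdq_lock);
+ *	if (vnd_dq_push(vqp, mp, B_FALSE, vnd_drop_in) != 0)
+ *		cv_broadcast(&vqp->vdq_ready);	/* wake blocked consumers */
+ *	mutex_exit(&vqp->vdq_lock);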
+ * + * ---------------------- + * Data Path and gsqueues + * ---------------------- + * + * There's a lot of plumbing in vnd to get to the point where we can send data, + * but vnd's bread and butter is the data path, so it's worth diving into it in + * more detail. Data enters and exits the system from two ends. + * + * The first end is the vnd consumer. This comes in the form of read and write + * system calls as well as the frame I/O ioctls. The read and write system calls + * operate on a single frame at a time. Think of a frame as a single message + * that has come in off the wire, which may itself comprise multiple mblk_t's + * linked together in the kernel. readv(2) and writev(2) have the same + * limitations as read(2) and write(2). We enforce this as the system is + * required to fill up every uio(9S) buffer before moving onto the next one. + * This means that if you have a MTU sized buffer and two frames come in which + * are less than half of the MTU they must fill up the given iovec. Even if we + * didn't want to do this, we have no way of informing the supplier of the + * iovecs that they were only partially filled or where one frame ends and + * another begins. That's life, as such we have frame I/O which solves this + * problem. It allows for multiple frames to be consumed as well as for frames + * to be broken down into multiple vector components. + * + * The second end is the mac direct calls. As part of negotiating capabilities + * via dld, we give mac a function of ours to call when packets are received + * [vnd_mac_input()] and a callback to indicate that flow has been restored + * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can + * transmit data with. As part of the contract with mac, mac is allowed to flow + * control us by returning a cookie to the transmit function. When that happens, + * all outbound traffic is halted until our callback function is called and we + * can schedule drains. + * + * It's worth looking at these in further detail. We'll start with the rx path. + * + * + * | + * * . . . packets from gld + * | + * v + * +-------------+ + * | mac | + * +-------------+ + * | + * v + * +-------------+ + * | dld | + * +-------------+ + * | + * * . . . dld direct callback + * | + * v + * +---------------+ + * | vnd_mac_input | + * +---------------+ + * | + * v + * +---------+ +-------------+ + * | dropped |<--*---------| vnd_hooks | + * | by | . +-------------+ + * | hooks | . drop probe | + * +---------+ kstat bump * . . . Do we have free + * | buffer space? + * | + * no . | . yes + * . + . + * +---*--+------*-------+ + * | | + * * . . drop probe * . . recv probe + * | kstat bump | kstat bump + * v | + * +---------+ * . . fire pollin + * | freemsg | v + * +---------+ +-----------------------+ + * | vnd_str_t`vns_dq_read | + * +-----------------------+ + * ^ ^ + * +----------+ | | +---------+ + * | read(9E) |-->-+ +--<--| frameio | + * +----------+ +---------+ + * + * The rx path is rather linear. Packets come into us from mac. We always run + * them through the various hooks, and if they come out of that, we inspect the + * read data queue. If there is not enough space for a packet, we drop it. + * Otherwise, we append it to the data queue, and fire read notifications + * targetting anyone polling or doing blocking I/O on this device. Those + * consumers then drain the head of the data queue. + * + * The tx path is more complicated due to mac flow control. 
After any call into + * mac, we may have to potentially suspend writes and buffer data for an + * arbitrary amount of time. As such, we need to carefully track the total + * amount of outstanding data so that we don't waste kernel memory. This is + * further complicated by the fact that mac will asynchronously tell us when our + * flow has been resumed. + * + * For data to be able to enter the system, it needs to be able to take a + * reservation from the write data queue. Once the reservation has been + * obtained, we enter the gsqueue so that we can actually append it. We use + * gsqueues (serialization queues) to ensure that packets are manipulated in + * order as we deal with the draining and appending packets. We also leverage + * its worker thread to help us do draining after mac has restorted our flow. + * + * The following image describes the flow: + * + * +-----------+ +--------------+ +-------------------------+ +------+ + * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done | + * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+ + * +-----------+ +--------------+ . +-------------------------+ + * | ^ . + * | | . reserve space from gsqueue + * | | | + * queue . . . * | space v + * full | * . . . avail +------------------------+ + * v | | vnd_squeue_tx_append() | + * +--------+ +------------+ +------------------------+ + * | EAGAIN |<--*------| Non-block? |<-+ | + * +--------+ . +------------+ | v + * . yes v | wait +--------------+ + * no . .* * . . for | append chain | + * +----+ space | to outgoing | + * | mblk chain | + * from gsqueue +--------------+ + * | | + * | +-------------------------------------------------+ + * | | + * | | yes . . . + * v v . + * +-----------------------+ +--------------+ . +------+ + * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done | + * +-----------------------+ +--------------+ +------+ + * | | + * +---------------------------------|---------------------+ + * | | tx | + * | no . . * queue . . * + * | flow controlled . | empty * . fire pollout + * | . v | if mblk_t's + * +-------------+ . +---------------------+ | sent + * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+ + * | flags | +---------------------+ | + * +-------------+ More data | | | More data | + * and limit ^ v * . . and limit ^ + * not reached . . * | | reached | + * +----+ | | + * v | + * +----------+ +-------------+ +---------------------------+ + * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with | + * | control | | block flags | | vnd_squeue_tx_drain() and | + * | callback | +-------------+ | GSQUEUE_FILL flag, iff | + * +----------+ | not already scheduled | + * +---------------------------+ + * + * The final path taken for a given write(9E)/frameio ioctl depends on whether + * or not the vnd_dev_t is non-blocking. That controls the initial path of + * trying to take a reservation in write data queue. If the device is in + * non-blocking mode, we'll return EAGAIN when there is not enough space + * available, otherwise, the calling thread blocks on the data queue. + * + * Today when we call into vnd_squeue_tx_drain() we will not try to drain the + * entire queue, as that could be quite large and we don't want to necessarily + * keep the thread that's doing the drain until it's been finished. Not only + * could more data be coming in, but the draining thread could be a userland + * thread that has more work to do. We have two limits today. 
There is an upper + * bound on the total amount of data and the total number of mblk_t chains. If + * we hit either limit, then we will schedule another drain in the gsqueue and + * go from there. + * + * It's worth taking some time to describe how we interact with gsqueues. vnd + * has a gsqueue_set_t for itself. It's important that it has its own set, as + * the profile of work that vnd does is different from other sub-systems in the + * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue. + * Unlike TCP/IP which uses an gsqueue for per TCP connection, we end up + * maintaining one for a given device. Because of that, we want to use a + * pseudo-random one to try and spread out the load, and picking one at random + * is likely to be just as good as any fancy algorithm we might come up with, + * especially as any two devices could have radically different transmit + * profiles. + * + * While some of the write path may seem complicated, it does allow us to + * maintain an important property. Once we have acknowledged a write(9E) or + * frameio ioctl, we will not drop the packet, excepting something like ipf via + * the firewall hooks. + * + * There is one other source of flow control that can exist in the system which + * is in the form of a barrier. The barrier is an internal mechanism used for + * ensuring that an gsqueue is drained for a given device. We use this as part + * of tearing down. Specifically we disable the write path so nothing new can be + * inserted into the gsqueue and then insert a barrier block. Once the barrier + * block comes out of the gsqueue, then we know nothing else in the gsqueue that + * could refer to the vnd_str_t, being destroyed, exists. + * + * --------------------- + * vnd, zones, netstacks + * --------------------- + * + * vnd devices are scoped to datalinks and datalinks are scoped to a netstack. + * Because of that, vnd is also a netstack module. It registers with the + * netstack sub-system and receives callbacks every time a netstack is created, + * being shutdown, and destroyed. The netstack callbacks drive the creation and + * destruction of the vnd_pnsd_t structures. + * + * Recall from the earlier architecture diagrams that every vnd device is scoped + * to a netstack and known about by a given vnd_pnsd_t. When that netstack is + * torn down, we also tear down any vnd devices that are hanging around. When + * the netstack is torn down, we know that any zones that are scoped to that + * netstack are being shut down and have no processes remaining. This is going + * to be the case whether they are shared or exclusive stack zones. We have to + * perform a careful dance. + * + * There are two different callbacks that happen on tear down, the first is a + * shutdown callback, the second is a destroy callback. When the shutdown + * callback is fired we need to prepare for the netstack to go away and ensure + * that nothing can continue to persist itself. + * + * More specifically, when we get notice of a stack being shutdown we first + * remove the netstack from the global netstack list to ensure that no one new + * can come in and find the netstack and get a reference to it. After that, we + * notify the neti hooks that they're going away. Once that's all done, we get + * to the heart of the matter. + * + * When shutting down there could be any number of outstanding contexts that + * have a reference on the vnd_pnsd_t and on the individual links. However, we + * know that no one new will be able to find the vnd_pnsd_t. 
To account for + * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with + * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device + * to the netstack's list. If this is set, then they must not append to it. + * Once this is set, we know that the netstack's list of devices can never grow, + * only shrink. + * + * Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that + * the container for the device is being destroyed and that we should not allow + * additional references to the device to be created, whether via open, or + * linking. The presence of this bit also allows things like the list ioctl and + * sdev to know not to consider its existence. At the conclusion of this being + * set, we know that no one else should be able to obtain a new reference to the + * device. + * + * Once that has been set for all devices, we go through and remove any existing + * links that have been established in sdev. Because doing that may cause the + * final reference for the device to be dropped, which still has a reference to + * the netstack, we have to restart our walk due to dropped locks. We know that + * this walk will eventually complete because the device cannot be relinked and + * no new devices will be attached in this netstack due to VND_NS_CONDEMNED. + * Once that's finished, the shutdown callback returns. + * + * When we reach the destroy callback, we simply wait for references on the + * netstack to disappear. Because the zone has been shut down, all processes in + * it that have open references have been terminated and reaped. Any threads + * that are newly trying to reference it will fail. However, there is one thing + * that can halt this that we have no control over, which is the global zone + * holding open a reference to the device. In this case the zone halt will hang + * in vnd_stack_destroy. Once the last references is dropped we finish destroy + * the netinfo hooks and free the vnd_pnsd_t. + * + * ---- + * sdev + * ---- + * + * vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd + * for both the global and non-global zones. In any given zone we always supply + * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone + * will also have an entry per-link in that zone under /dev/vnd/%datalink, eg. + * if a link was named net0, there would be a /dev/vnd/net0. The global zone can + * also see every link for every zone, ala /dev/net, under + * /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device + * named net0, the global zone would have /dev/vnd/turin/net0. + * + * The sdev plugin has three interfaces that it supplies back to sdev. One is to + * validate that a given node is still valid. The next is a callback from sdev + * to say that it is no longer using the node. The third and final one is from + * sdev where it asks us to fill a directory. All of the heavy lifting is done + * in directory filling and in valiation. We opt not to maintain a reference on + * the device while there is an sdev node present. This makes the removal of + * nodes much simpler and most of the possible failure modes shouldn't cause any + * real problems. For example, the open path has to handle both dev_t's which no + * longer exist and which are no longer linked. + * + * ----- + * hooks + * ----- + * + * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd + * provides these for L3 IP and IPv6 traffic. 
Each netstack provides these hooks + * in a minimal fashion. While we will allow traffic to be filtered through the + * hooks, we do not provide means for packet injection or additional inspection + * at this time. There are a total of four different events created: + * + * o IPv4 physical in + * o IPv4 physical out + * o IPv6 physical in + * o IPv6 physical out + * + * --------------- + * Synchronization + * --------------- + * + * To make our synchronization simpler, we've put more effort into making the + * metadata/setup paths do more work. That work allows the data paths to make + * assumptions around synchronization that simplify the general case. Each major + * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is + * annotated with the protection that its members receives. The following + * annotations are used: + * + * A Atomics; these values are only modified using atomics values. + * Currently this only applies to kstat values. + * E Existence; no lock is needed to access this member, it does not + * change while the structure is valid. + * GL Global Lock; these members are protected by the global + * vnd_dev_lock. + * L Locked; access to the member is controlled by a lock that is in + * the structure. + * NSL netstack lock; this member is protected by the containing + * netstack. This only applies to the vnd_dev_t`vdd_nslink. + * X This member is special, and is discussed in this section. + * + * In addition to locking, we also have reference counts on the vnd_dev_t and + * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure. + * With rare exception, once a reference count is decremented, the consumer + * should not assume that the data is valid any more. The only exception to this + * is the case where we're removing an extant reference count from a link into + * /devices or /dev. Reference counts are obtained on these structures as a part + * of looking them up. + * + * # Global Lock Ordering + * ###################### + * + * The following is the order that you must take locks in vnd: + * + * 1) vnd`vnd_dev_lock + * 2) vnd_pnsd_t`vpnd_lock + * 3) vnd_dev_t`vnd_lock + * 4) vnd_str_t`vns_lock + * 5) vnd_data_queue_t`vdq_lock + * + * One must adhere to the following rules: + * + * o You must acquire a lower numbered lock before a high numbered lock. + * o It is NOT legal to hold two locks of the same level concurrently, eg. you + * can not hold two different vnd_dev_t's vnd_lock at the same time. + * o You may release locks in any order. + * o If you release a lock, you must honor the locking rules before acquiring + * it again. + * o You should not hold any locks when calling any of the rele functions. + * + * # Special Considerations + * ######################## + * + * While most of the locking is what's expected, it's worth going into the + * special nature that a few members hold. Today, only two structures have + * special considerations: the vnd_dev_t and the vnd_str_t. All members with + * special considerations have an additional annotation that describes how you + * should interact with it. + * + * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is + * attached or in the process of attaching. If the code path that goes through + * requires an attached vnd_dev_t, eg. the data path and tear down path, then it + * is always legal to dereference that member without a lock held. 
When they are + * added to the system, they should be done under the vdd_lock and done as part + * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the + * lifetime of the vnd_dev_t. + * + * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it + * always exists as it is a part of the structure. The only time that it's valid + * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag + * set or during tear down. Outside of those paths which are naturally + * serialized, there is no explicit locking around the member. + * + * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not + * initially set as part of creating the structure, but are set as part of + * responding to the association ioctl. Anything in the data path or metadata + * path that requires association may assume that they exist, as we do not kick + * off the state machine until they're set. + * + * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The + * members are designed to be used as part of various operations with the + * gsqueues. A lock isn't needed to use them, but to work with them, the + * appropriate flag in the vnd_str_t`vns_flags must have been set by the current + * thread. Otherwise, it is always fair game to refer to their addresses. Their + * contents are ignored by vnd, but some members are manipulated by the gsqueue + * subsystem. + */ + +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/modctl.h> +#include <sys/stat.h> +#include <sys/file.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/ddi.h> +#include <sys/ethernet.h> +#include <sys/stropts.h> +#include <sys/sunddi.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/ksynch.h> +#include <sys/taskq_impl.h> +#include <sys/sdt.h> +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/dlpi.h> +#include <sys/cred.h> +#include <sys/id_space.h> +#include <sys/list.h> +#include <sys/ctype.h> +#include <sys/policy.h> +#include <sys/sunldi.h> +#include <sys/cred.h> +#include <sys/strsubr.h> +#include <sys/poll.h> +#include <sys/neti.h> +#include <sys/hook.h> +#include <sys/hook_event.h> +#include <sys/vlan.h> +#include <sys/dld.h> +#include <sys/mac_client.h> +#include <sys/netstack.h> +#include <sys/fs/sdev_plugin.h> +#include <sys/kstat.h> +#include <sys/atomic.h> +#include <sys/disp.h> +#include <sys/random.h> +#include <sys/gsqueue.h> +#include <sys/ht.h> + +#include <inet/ip.h> +#include <inet/ip6.h> + +#include <sys/vnd.h> + +/* + * Globals + */ +static dev_info_t *vnd_dip; +static taskq_t *vnd_taskq; +static kmem_cache_t *vnd_str_cache; +static kmem_cache_t *vnd_dev_cache; +static kmem_cache_t *vnd_pnsd_cache; +static id_space_t *vnd_minors; +static int vnd_list_init = 0; +static sdev_plugin_hdl_t vnd_sdev_hdl; +static gsqueue_set_t *vnd_sqset; + +static kmutex_t vnd_dev_lock; +static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */ +static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */ + +/* + * STREAMs ioctls + * + * The STREAMs ioctls are internal to vnd. No one should be seeing them, as such + * they aren't a part of the header file. + */ +#define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80) + +/* + * Private ioctl to associate a given streams instance with a minor instance of + * the character device. 
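+ * This is the VND_STRIOC_ASSOCIATE request described in the big theory
+ * statement: the character device issues it against the freshly pushed
+ * STREAMS module with the minor number and netstack id filled in, and the
+ * more detailed result comes back as a vnd_errno_t in vsa_errno.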
+ */ +#define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1) + +typedef struct vnd_strioc_associate { + minor_t vsa_minor; /* minor device node */ + netstackid_t vsa_nsid; /* netstack id */ + vnd_errno_t vsa_errno; /* errno */ +} vnd_strioc_associate_t; + +typedef enum vnd_strioc_state { + VSS_UNKNOWN = 0, + VSS_COPYIN = 1, + VSS_COPYOUT = 2, +} vnd_strioc_state_t; + +typedef struct vnd_strioc { + vnd_strioc_state_t vs_state; + caddr_t vs_addr; +} vnd_strioc_t; + +/* + * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extent tags. Though + * really, overlap is at the end of the day, inevitable. + */ +#define VND_SQUEUE_TAG_TX_DRAIN 0x42 +#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43 +#define VND_SQUEUE_TAG_VND_WRITE 0x44 +#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45 +#define VND_SQUEUE_TAG_STRBARRIER 0x46 + +/* + * vnd reserved names. These are names which are reserved by vnd and thus + * shouldn't be used by some external program. + */ +static char *vnd_reserved_names[] = { + "ctl", + "zone", + NULL +}; + +/* + * vnd's DTrace probe macros + * + * DTRACE_VND* are all for a stable provider. We also have an unstable internal + * set of probes for reference count manipulation. + */ +#define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4); + +#define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5); + +#define DTRACE_VND_REFINC(vdp) \ + DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref); +#define DTRACE_VND_REFDEC(vdp) \ + DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref); + + +/* + * Tunables + */ +size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */ +size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */ + +/* + * These numbers are designed as per-device tunables that are applied when a new + * vnd device is attached. They're a rough stab at what may be a reasonable + * amount of work to do in one burst in an squeue. + */ +size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */ +size_t vnd_flush_nburst = 10; /* 10 frames */ + +/* + * Constants related to our sdev plugins + */ +#define VND_SDEV_NAME "vnd" +#define VND_SDEV_ROOT "/dev/vnd" +#define VND_SDEV_ZROOT "/dev/vnd/zone" + +/* + * vnd relies on privileges, not mode bits to limit access. As such, device + * files are read-write to everyone. 
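+ * Actual access control happens at attach time, when the datalink is
+ * opened with the caller's credentials rather than kcred (see the
+ * "Binding to a datalink" section above).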
+ */ +#define VND_SDEV_MODE (S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | \ + S_IROTH | S_IWOTH) + +/* + * Statistic macros + */ +#define VND_STAT_INC(vsp, field, val) \ + atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val) +#define VND_LATENCY_1MS 1000000 +#define VND_LATENCY_10MS 10000000 +#define VND_LATENCY_100MS 100000000 +#define VND_LATENCY_1S 1000000000 +#define VND_LATENCY_10S 10000000000 + +/* + * Constants for vnd hooks + */ +static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; +#define IPV4_MCAST_LEN 3 +static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; +#define IPV6_MCAST_LEN 2 +static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 }; + +/* + * vnd internal data structures and types + */ + +struct vnd_str; +struct vnd_dev; +struct vnd_pnsd; + +/* + * As part of opening the device stream we need to properly communicate with our + * underlying stream. This is a bit of an asynchronous dance and we need to + * properly work with dld to get everything set up. We have to initiate the + * conversation with dld and as such we keep track of our state here. + */ +typedef enum vnd_str_state { + VNS_S_INITIAL = 0, + VNS_S_INFO_SENT, + VNS_S_EXCLUSIVE_SENT, + VNS_S_ATTACH_SENT, + VNS_S_BIND_SENT, + VNS_S_SAP_PROMISC_SENT, + VNS_S_MULTI_PROMISC_SENT, + VNS_S_RX_ONLY_PROMISC_SENT, + VNS_S_FIXUP_PROMISC_SENT, + VNS_S_CAPAB_Q_SENT, + VNS_S_CAPAB_E_SENT, + VNS_S_ONLINE, + VNS_S_SHUTTING_DOWN, + VNS_S_MULTICAST_PROMISCOFF_SENT, + VNS_S_SAP_PROMISCOFF_SENT, + VNS_S_UNBIND_SENT, + VNS_S_ZOMBIE +} vnd_str_state_t; + +typedef enum vnd_str_flags { + VNS_F_NEED_ZONE = 0x1, + VNS_F_TASKQ_DISPATCHED = 0x2, + VNS_F_CONDEMNED = 0x4, + VNS_F_FLOW_CONTROLLED = 0x8, + VNS_F_DRAIN_SCHEDULED = 0x10, + VNS_F_BARRIER = 0x20, + VNS_F_BARRIER_DONE = 0x40 +} vnd_str_flags_t; + +typedef enum vnd_capab_flags { + VNS_C_HCKSUM = 0x1, + VNS_C_DLD = 0x2, + VNS_C_DIRECT = 0x4, + VNS_C_HCKSUM_BADVERS = 0x8 +} vnd_capab_flags_t; + +/* + * Definitions to interact with direct callbacks + */ +typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *, + mac_header_info_t *); +typedef uintptr_t vnd_mac_cookie_t; +/* DLD Direct capability function */ +typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t); +/* DLD Direct tx function */ +typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); +/* DLD Direct function to set flow control callback */ +typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t), + void *); +/* DLD Direct function to see if flow controlled still */ +typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t); + +/* + * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of. + */ +typedef struct vnd_str_capab { + vnd_capab_flags_t vsc_flags; + t_uscalar_t vsc_hcksum_opts; + vnd_dld_cap_t vsc_capab_f; + void *vsc_capab_hdl; + vnd_dld_tx_t vsc_tx_f; + void *vsc_tx_hdl; + vnd_dld_set_fcb_t vsc_set_fcb_f; + void *vsc_set_fcb_hdl; + vnd_dld_is_fc_t vsc_is_fc_f; + void *vsc_is_fc_hdl; + vnd_mac_cookie_t vsc_fc_cookie; + void *vsc_tx_fc_hdl; +} vnd_str_capab_t; + +/* + * The vnd_data_queue is a simple construct for storing a series of messages in + * a queue. + * + * See synchronization section of the big theory statement for member + * annotations. 
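+ *
+ * Note that vdq_cur accounts for both data sitting on the queue and space
+ * that has only been reserved via vnd_dq_reserve(), so it can be non-zero
+ * while the queue itself is empty.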
+ */ +typedef struct vnd_data_queue { + struct vnd_str *vdq_vns; /* E */ + kmutex_t vdq_lock; + kcondvar_t vdq_ready; /* Uses vdq_lock */ + ssize_t vdq_max; /* L */ + ssize_t vdq_cur; /* L */ + mblk_t *vdq_head; /* L */ + mblk_t *vdq_tail; /* L */ +} vnd_data_queue_t; + +typedef struct vnd_str_stat { + kstat_named_t vks_rbytes; + kstat_named_t vks_rpackets; + kstat_named_t vks_obytes; + kstat_named_t vks_opackets; + kstat_named_t vks_nhookindrops; + kstat_named_t vks_nhookoutdrops; + kstat_named_t vks_ndlpidrops; + kstat_named_t vks_ndataindrops; + kstat_named_t vks_ndataoutdrops; + kstat_named_t vks_tdrops; + kstat_named_t vks_linkname; + kstat_named_t vks_zonename; + kstat_named_t vks_nmacflow; + kstat_named_t vks_tmacflow; + kstat_named_t vks_mac_flow_1ms; + kstat_named_t vks_mac_flow_10ms; + kstat_named_t vks_mac_flow_100ms; + kstat_named_t vks_mac_flow_1s; + kstat_named_t vks_mac_flow_10s; +} vnd_str_stat_t; + +/* + * vnd stream structure + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_str { + kmutex_t vns_lock; + kcondvar_t vns_cancelcv; /* Uses vns_lock */ + kcondvar_t vns_barriercv; /* Uses vns_lock */ + kcondvar_t vns_stcv; /* Uses vns_lock */ + vnd_str_state_t vns_state; /* L */ + vnd_str_state_t vns_laststate; /* L */ + vnd_errno_t vns_errno; /* L */ + vnd_str_flags_t vns_flags; /* L */ + vnd_str_capab_t vns_caps; /* L */ + taskq_ent_t vns_tqe; /* L */ + vnd_data_queue_t vns_dq_read; /* E */ + vnd_data_queue_t vns_dq_write; /* E */ + mblk_t *vns_dlpi_inc; /* L */ + queue_t *vns_rq; /* E */ + queue_t *vns_wq; /* E */ + queue_t *vns_lrq; /* E */ + t_uscalar_t vns_dlpi_style; /* L */ + t_uscalar_t vns_minwrite; /* L */ + t_uscalar_t vns_maxwrite; /* L */ + hrtime_t vns_fclatch; /* L */ + hrtime_t vns_fcupdate; /* L */ + kstat_t *vns_kstat; /* E */ + gsqueue_t *vns_squeue; /* E */ + mblk_t vns_drainblk; /* E + X */ + mblk_t vns_barrierblk; /* E + X */ + vnd_str_stat_t vns_ksdata; /* A */ + size_t vns_nflush; /* L */ + size_t vns_bsize; /* L */ + struct vnd_dev *vns_dev; /* E + X */ + struct vnd_pnsd *vns_nsd; /* E + X */ +} vnd_str_t; + +typedef enum vnd_dev_flags { + VND_D_ATTACH_INFLIGHT = 0x001, + VND_D_ATTACHED = 0x002, + VND_D_LINK_INFLIGHT = 0x004, + VND_D_LINKED = 0x008, + VND_D_CONDEMNED = 0x010, + VND_D_ZONE_DYING = 0x020, + VND_D_OPENED = 0x040 +} vnd_dev_flags_t; + +/* + * This represents the data associated with a minor device instance. + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_dev { + kmutex_t vdd_lock; + list_node_t vdd_link; /* GL */ + list_node_t vdd_nslink; /* NSL */ + int vdd_ref; /* L */ + vnd_dev_flags_t vdd_flags; /* L */ + minor_t vdd_minor; /* E */ + dev_t vdd_devid; /* E */ + ldi_ident_t vdd_ldiid; /* E */ + ldi_handle_t vdd_ldih; /* X */ + cred_t *vdd_cr; /* X */ + vnd_str_t *vdd_str; /* L */ + struct pollhead vdd_ph; /* E */ + struct vnd_pnsd *vdd_nsd; /* E + X */ + char vdd_datalink[VND_NAMELEN]; /* L */ + char vdd_lname[VND_NAMELEN]; /* L */ +} vnd_dev_t; + +typedef enum vnd_pnsd_flags { + VND_NS_CONDEMNED = 0x1 +} vnd_pnsd_flags_t; + +/* + * Per netstack data structure. + * + * See synchronization section of the big theory statement for member + * annotations. 
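+ *
+ * These are created and destroyed by the netstack callbacks described in
+ * the "vnd, zones, netstacks" section above and live on the global
+ * vnd_nsd_list, which is protected by vnd_dev_lock.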
+ */ +typedef struct vnd_pnsd { + list_node_t vpnd_link; /* protected by global dev lock */ + zoneid_t vpnd_zid; /* E */ + netstackid_t vpnd_nsid; /* E */ + boolean_t vpnd_hooked; /* E */ + net_handle_t vpnd_neti_v4; /* E */ + hook_family_t vpnd_family_v4; /* E */ + hook_event_t vpnd_event_in_v4; /* E */ + hook_event_t vpnd_event_out_v4; /* E */ + hook_event_token_t vpnd_token_in_v4; /* E */ + hook_event_token_t vpnd_token_out_v4; /* E */ + net_handle_t vpnd_neti_v6; /* E */ + hook_family_t vpnd_family_v6; /* E */ + hook_event_t vpnd_event_in_v6; /* E */ + hook_event_t vpnd_event_out_v6; /* E */ + hook_event_token_t vpnd_token_in_v6; /* E */ + hook_event_token_t vpnd_token_out_v6; /* E */ + kmutex_t vpnd_lock; /* Protects remaining members */ + kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */ + int vpnd_ref; /* L */ + vnd_pnsd_flags_t vpnd_flags; /* L */ + list_t vpnd_dev_list; /* L */ +} vnd_pnsd_t; + +static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *); + +/* + * Drop function signature. + */ +typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *); + +static void +vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +/* ARGSUSED */ +static void +vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + panic("illegal vnd drop"); +} + +/* ARGSUSED */ +static void +vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + mblk_t *mp; + + while (mp_chain != NULL) { + mp = mp_chain; + mp_chain = mp->b_next; + vnd_drop_hook_in(vsp, mp, "stream not associated"); + } +} + +static vnd_pnsd_t * +vnd_nsd_lookup(netstackid_t nsid) +{ + vnd_pnsd_t *nsp; + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + if (nsp->vpnd_nsid == nsid) { + mutex_enter(&nsp->vpnd_lock); + VERIFY(nsp->vpnd_ref >= 0); + nsp->vpnd_ref++; + mutex_exit(&nsp->vpnd_lock); + break; + } + } + mutex_exit(&vnd_dev_lock); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zid(zoneid_t zid) +{ + netstack_t *ns; + vnd_pnsd_t *nsp; + ns = netstack_find_by_zoneid(zid); + 
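+	/*
+	 * If the zone has no netstack (e.g. it is being torn down), fail the
+	 * lookup. Otherwise translate the netstack id to our per-netstack
+	 * data and release the netstack hold; the vnd_pnsd_t reference taken
+	 * by vnd_nsd_lookup() is what the caller keeps.
+	 */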
if (ns == NULL) + return (NULL); + nsp = vnd_nsd_lookup(ns->netstack_stackid); + netstack_rele(ns); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zonename(char *zname) +{ + zone_t *zonep; + vnd_pnsd_t *nsp; + + zonep = zone_find_by_name(zname); + if (zonep == NULL) + return (NULL); + + nsp = vnd_nsd_lookup_by_zid(zonep->zone_id); + zone_rele(zonep); + return (nsp); +} + +static void +vnd_nsd_ref(vnd_pnsd_t *nsp) +{ + mutex_enter(&nsp->vpnd_lock); + /* + * This can only be used on something that has been obtained through + * some other means. As such, the caller should already have a reference + * before adding another one. This function should not be used as a + * means of creating the initial reference. + */ + VERIFY(nsp->vpnd_ref > 0); + nsp->vpnd_ref++; + mutex_exit(&nsp->vpnd_lock); + cv_broadcast(&nsp->vpnd_ref_change); +} + +static void +vnd_nsd_rele(vnd_pnsd_t *nsp) +{ + mutex_enter(&nsp->vpnd_lock); + VERIFY(nsp->vpnd_ref > 0); + nsp->vpnd_ref--; + mutex_exit(&nsp->vpnd_lock); + cv_broadcast(&nsp->vpnd_ref_change); +} + +static vnd_dev_t * +vnd_dev_lookup(minor_t m) +{ + vnd_dev_t *vdp; + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + if (vdp->vdd_minor == m) { + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + mutex_exit(&vdp->vdd_lock); + break; + } + } + mutex_exit(&vnd_dev_lock); + return (vdp); +} + +static void +vnd_dev_free(vnd_dev_t *vdp) +{ + /* + * When the STREAM exists we need to go through and make sure + * communication gets torn down. As part of closing the stream, we + * guarantee that nothing else should be able to enter the stream layer + * at this point. That means no one should be able to call + * read(),write() or one of the frameio ioctls. + */ + if (vdp->vdd_flags & VND_D_ATTACHED) { + (void) ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + + /* + * We have to remove ourselves from our parents list now. It is + * really quite important that we have already set the condemend + * flag here so that our containing netstack basically knows + * that we're on the way down and knows not to wait for us. It's + * also important that we do that before we put a rele on the + * the device as that is the point at which it will check again. + */ + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(vdp->vdd_nsd); + vdp->vdd_nsd = NULL; + } + ASSERT(vdp->vdd_flags & VND_D_CONDEMNED); + id_free(vnd_minors, vdp->vdd_minor); + mutex_destroy(&vdp->vdd_lock); + kmem_cache_free(vnd_dev_cache, vdp); +} + +static void +vnd_dev_ref(vnd_dev_t *vdp) +{ + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + mutex_exit(&vdp->vdd_lock); +} + +/* + * As part of releasing the hold on this we may tear down a given vnd_dev_t As + * such we need to make sure that we grab the list lock first before grabbing + * the vnd_dev_t's lock to ensure proper lock ordering. + */ +static void +vnd_dev_rele(vnd_dev_t *vdp) +{ + mutex_enter(&vnd_dev_lock); + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_ref > 0); + vdp->vdd_ref--; + DTRACE_VND_REFDEC(vdp); + if (vdp->vdd_ref > 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return; + } + + /* + * Now that we've removed this from the list, we can go ahead and + * drop the list lock. 
No one else can find this device and reference + * it. As its reference count is zero, it by definition does not have + * any remaining entries in /devices that could lead someone back to + * this. + */ + vdp->vdd_flags |= VND_D_CONDEMNED; + list_remove(&vnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + vnd_dev_free(vdp); +} + +/* + * Insert a message block chain if there's space, otherwise drop it. Return one + * so someone who was waiting for data would now end up having found it, e.g. + * the caller should consider a broadcast. + */ +static int +vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved, + vnd_dropper_f dropf) +{ + size_t msize; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + if (reserved == B_FALSE) { + msize = msgsize(mp); + if (vqp->vdq_cur + msize > vqp->vdq_max) { + dropf(vqp->vdq_vns, mp, "buffer full"); + return (0); + } + vqp->vdq_cur += msize; + } + + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + vqp->vdq_head = mp; + vqp->vdq_tail = mp; + } else { + vqp->vdq_tail->b_next = mp; + vqp->vdq_tail = mp; + } + + return (1); +} + +/* + * Remove a message block chain. If the amount of space in the buffer + * has changed we return 1. We have no way of knowing whether or not there is + * enough space overall for a given writer who is blocked, so we always end up + * having to return true and thus tell consumers that they should consider + * signalling. + */ +static int +vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp) +{ + size_t msize; + mblk_t *mp; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(mpp != NULL); + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + *mpp = NULL; + return (0); + } + + mp = vqp->vdq_head; + msize = msgsize(mp); + + vqp->vdq_cur -= msize; + if (mp->b_next == NULL) { + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + /* + * We can't be certain that this is always going to be zero. + * Someone may have basically taken a reservation of space on + * the data queue, e.g. claimed space but not yet pushed it on. + */ + ASSERT(vqp->vdq_cur >= 0); + } else { + vqp->vdq_head = mp->b_next; + ASSERT(vqp->vdq_cur > 0); + } + mp->b_next = NULL; + *mpp = mp; + return (1); +} + +/* + * Reserve space in the queue. This will bump up the size of the queue and + * entitle the user to push something on later without bumping the space. + */ +static int +vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size >= 0); + + if (size == 0) + return (0); + + if (size + vqp->vdq_cur > vqp->vdq_max) + return (0); + + vqp->vdq_cur += size; + return (1); +} + +static void +vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size > 0); + ASSERT(size <= vqp->vdq_cur); + + vqp->vdq_cur -= size; +} + +static void +vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf) +{ + mblk_t *mp, *next; + + mutex_enter(&vqp->vdq_lock); + for (mp = vqp->vdq_head; mp != NULL; mp = next) { + next = mp->b_next; + mp->b_next = NULL; + dropf(vqp->vdq_vns, mp, "vnd_dq_flush"); + } + vqp->vdq_cur = 0; + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + mutex_exit(&vqp->vdq_lock); +} + +static boolean_t +vnd_dq_is_empty(vnd_data_queue_t *vqp) +{ + boolean_t ret; + + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head == NULL) + ret = B_TRUE; + else + ret = B_FALSE; + mutex_exit(&vqp->vdq_lock); + + return (ret); +} + +/* + * Get a network uint16_t from the message and translate it into something the + * host understands.
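+ *
+ * As an illustrative sketch (hypothetical caller, error handling
+ * abbreviated and not part of the driver), pulling the 16-bit
+ * ethertype out of a frame whose header may be split across mblks
+ * looks like:
+ *
+ *	uint16_t etype;
+ *
+ *	if (vnd_mbc_getu16(mp, 12, &etype) != 0)
+ *		return (1);	/* frame shorter than 14 bytes */
+ *
+ * The value handed back in etype is in host byte order, even when the
+ * two bytes straddle an mblk boundary.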
+ */ +static int +vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out) +{ + size_t mpsize; + uint8_t *bp; + + mpsize = msgsize(mp); + /* Check for overflow */ + if (off + sizeof (uint16_t) > mpsize) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + + /* + * Data is in network order. Note the second byte of data might be in + * the next mp. + */ + bp = mp->b_rptr + off; + *out = *bp << 8; + if (off + 1 == mpsize) { + mp = mp->b_cont; + bp = mp->b_rptr; + } else { + bp++; + } + + *out |= *bp; + return (0); +} + +/* + * Given an mblk chain find the mblk and address of a particular offset. + */ +static int +vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp) +{ + size_t mpsize; + + if (off >= msgsize(mp)) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + *mpp = mp; + *offp = (uintptr_t)mp->b_rptr + off; + + return (0); +} + +/* + * Fetch the destination mac address. Set *dstp to that mac address. If the data + * is not contiguous in the first mblk_t, fill in datap and set *dstp to it. + */ +static int +vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap) +{ + int i; + + if (MBLKL(mp) >= ETHERADDRL) { + *dstpp = mp->b_rptr; + return (0); + } + + *dstpp = datap; + for (i = 0; i < ETHERADDRL; i += 2, datap += 2) { + if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0) + return (1); + } + + return (0); +} + +static int +vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4, + hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6, + hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop) +{ + uint16_t etype; + hook_pkt_event_t info; + size_t offset, mblen; + uint8_t *dstp; + uint8_t dstaddr[6]; + hook_event_t he; + hook_event_token_t het; + net_handle_t neti; + + /* + * Before we can ask if we're interested we have to do enough work to + * determine the ethertype. + */ + + /* Byte 12 is either the VLAN tag or the ethertype */ + if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + if (etype == ETHERTYPE_VLAN) { + /* Actual ethertype is another four bytes in */ + if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) { + ddrop(vsp, *mpp, + "packet has incomplete ethernet vlan header"); + *mpp = NULL; + return (1); + } + offset = sizeof (struct ether_vlan_header); + } else { + offset = sizeof (struct ether_header); + } + + /* + * At the moment we only hook on the kinds of things that the IP module + * would normally. + */ + if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6) + return (0); + + if (etype == ETHERTYPE_IP) { + neti = netiv4; + he = hev4; + het = hetv4; + } else { + neti = netiv6; + he = hev6; + het = hetv6; + } + + if (!he.he_interested) + return (0); + + + if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + /* + * Now that we know we're interested, we have to do some additional + * sanity checking for IPF's sake, ala ip_check_length(). Specifically + * we need to check to make sure that the remaining packet size, + * excluding MAC, is at least the size of an IP header. 
+ */ + mblen = msgsize(*mpp); + if ((etype == ETHERTYPE_IP && + mblen - offset < IP_SIMPLE_HDR_LENGTH) || + (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) { + ddrop(vsp, *mpp, "packet has invalid IP header"); + *mpp = NULL; + return (1); + } + + info.hpe_protocol = neti; + info.hpe_ifp = (phy_if_t)vsp; + info.hpe_ofp = (phy_if_t)vsp; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0) + info.hpe_flags |= HPE_BROADCAST; + else if (etype == ETHERTYPE_IP && + bcmp(vnd_ipv4_mcast, vnd_bcast_addr, IPV4_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + else if (etype == ETHERTYPE_IPV6 && + bcmp(vnd_ipv6_mcast, vnd_bcast_addr, IPV6_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + + if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb, + (uintptr_t *)&info.hpe_hdr) != 0) { + ddrop(vsp, *mpp, "packet too small -- " + "unable to find payload"); + *mpp = NULL; + return (1); + } + + if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) { + hdrop(vsp, *mpp, "dropped by hooks"); + return (1); + } + + return (0); +} + +/* + * This should not be used for DL_INFO_REQ. + */ +static mblk_t * +vnd_dlpi_alloc(size_t len, t_uscalar_t prim) +{ + mblk_t *mp; + mp = allocb(len, BPRI_MED); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_PROTO; + mp->b_wptr = mp->b_rptr + len; + bzero(mp->b_rptr, len); + ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; + + return (mp); +} + +static void +vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp) +{ + mblk_t **mpp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + ASSERT(mp->b_next == NULL); + mpp = &vsp->vns_dlpi_inc; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + *mpp = mp; +} + +static mblk_t * +vnd_dlpi_inc_pop(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vsp->vns_dlpi_inc; + if (mp != NULL) { + VERIFY(mp->b_next == NULL || mp->b_next != mp); + vsp->vns_dlpi_inc = mp->b_next; + mp->b_next = NULL; + } + return (mp); +} + +static int +vnd_st_sinfo(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_info_req_t *dlir; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), + BPRI_HI); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + vsp->vns_state = VNS_S_INFO_SENT; + cv_broadcast(&vsp->vns_stcv); + + mp->b_datap->db_type = M_PCPROTO; + dlir = (dl_info_req_t *)mp->b_rptr; + mp->b_wptr = (uchar_t *)&dlir[1]; + dlir->dl_primitive = DL_INFO_REQ; + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_info(vnd_str_t *vsp) +{ + dl_info_ack_t *dlia; + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + dlia = (dl_info_ack_t *)mp->b_rptr; + vsp->vns_dlpi_style = dlia->dl_provider_style; + vsp->vns_minwrite = dlia->dl_min_sdu; + vsp->vns_maxwrite = dlia->dl_max_sdu; + + /* + * At this time we only support DL_ETHER devices. + */ + if (dlia->dl_mac_type != DL_ETHER) { + freemsg(mp); + vsp->vns_errno = VND_E_NOTETHER; + return (1); + } + + /* + * Because vnd operates on entire packets, we need to manually account + * for the ethernet header information. We add the size of the + * ether_vlan_header to account for this, regardless of whether it is + * using vlans or not.
+ */ + vsp->vns_maxwrite += sizeof (struct ether_vlan_header); + + freemsg(mp); + return (0); +} + +static int +vnd_st_sexclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_EXCLUSIVE_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + return (0); +} + +static int +vnd_st_exclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_exclusive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_EXCLUSIVE_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_exclusive: got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_DLEXCL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +/* + * Send down a DLPI_ATTACH_REQ. + */ +static int +vnd_st_sattach(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0; + vsp->vns_state = VNS_S_ATTACH_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_attach(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_ATTACH_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_attach: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_ATTACHFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_sbind(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_bind_req_t *dbrp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), + DL_BIND_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + dbrp = (dl_bind_req_t *)(mp->b_rptr); + dbrp->dl_sap = 0; + dbrp->dl_service_mode = DL_CLDLS; + + vsp->vns_state = VNS_S_BIND_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_bind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + + if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_BINDFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + mblk_t *mp; + dl_promiscon_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = 
vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + dprp = (dl_promiscon_req_t *)mp->b_rptr; + dprp->dl_level = type; + + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_promisc(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promisc"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_PROMISCON_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promisc: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_PROMISCFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_scapabq(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_CAPAB_Q_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +/* ARGSUSED */ +static void +vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + int signal = 0; + mblk_t *mp; + vnd_pnsd_t *nsp = vsp->vns_nsd; + + ASSERT(vsp != NULL); + ASSERT(mp_chain != NULL); + + for (mp = mp_chain; mp != NULL; mp = mp_chain) { + uint16_t vid; + mp_chain = mp->b_next; + mp->b_next = NULL; + + /* + * If we were operating in a traditional dlpi context then we + * would have enabled DLIOCRAW and rather than the fast path, we + * would come through dld_str_rx_raw. That function does two + * things that we have to consider doing ourselves. The first is + * that it adjusts the b_rptr back to account for dld bumping us + * past the mac header. It also tries to account for cases where + * mac provides an illusion of the mac header. Fortunately, dld + * only allows the fastpath when the media type is the same as + * the native type. Therefore all we have to do here is adjust + * the b_rptr. + */ + ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize); + mp->b_rptr -= mhip->mhi_hdrsize; + vid = VLAN_ID(mhip->mhi_tci); + if (mhip->mhi_istagged && vid != VLAN_ID_NONE) { + /* + * This is an overlapping copy. Do not use bcopy(9F). 
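+ *
+ * To sketch what the memmove below accomplishes: the 12 bytes of
+ * destination and source MAC address are slid forward over the
+ * 4-byte VLAN tag, and b_rptr is then advanced past the 4 stale
+ * leading bytes, so the frame presents an untagged ethernet header:
+ *
+ *	before:	[ dst(6) | src(6) | tpid/tci(4) | etype | ... ]
+ *	after:	         [ dst(6) | src(6)      | etype | ... ]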
+ */ + (void) memmove(mp->b_rptr + 4, mp->b_rptr, 12); + mp->b_rptr += 4; + } + + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4, + nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6, + nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0) + continue; + + VND_STAT_INC(vsp, vks_rpackets, 1); + VND_STAT_INC(vsp, vks_rbytes, msgsize(mp)); + DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + mutex_enter(&vsp->vns_dq_read.vdq_lock); + signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE, + vnd_drop_in); + mutex_exit(&vsp->vns_dq_read.vdq_lock); + } + + if (signal != 0) { + cv_broadcast(&vsp->vns_dq_read.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM); + } + +} + +static void +vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff) +{ + VND_STAT_INC(vsp, vks_nmacflow, 1); + VND_STAT_INC(vsp, vks_tmacflow, diff); + if (diff >= VND_LATENCY_1MS) + VND_STAT_INC(vsp, vks_mac_flow_1ms, 1); + if (diff >= VND_LATENCY_10MS) + VND_STAT_INC(vsp, vks_mac_flow_10ms, 1); + if (diff >= VND_LATENCY_100MS) + VND_STAT_INC(vsp, vks_mac_flow_100ms, 1); + if (diff >= VND_LATENCY_1S) + VND_STAT_INC(vsp, vks_mac_flow_1s, 1); + if (diff >= VND_LATENCY_10S) + VND_STAT_INC(vsp, vks_mac_flow_10s, 1); +} + +/* + * This is a callback from MAC that indicates that we are allowed to send + * packets again. + */ +static void +vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie) +{ + vnd_str_t *vsp = arg; + hrtime_t now; + + mutex_enter(&vsp->vns_lock); + now = gethrtime(); + + /* + * Check for the case that we beat vnd_squeue_tx_one to the punch. + * There's also an additional case here that we got notified because + * we're sharing a device that ran out of tx descriptors, even though it + * wasn't because of us. + */ + if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) { + vsp->vns_fcupdate = now; + mutex_exit(&vsp->vns_lock); + return; + } + + ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED); + ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie); + vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = NULL; + vsp->vns_fclatch = 0; + DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t, + vsp->vns_dq_write.vdq_cur, uintptr_t, cookie); + /* + * If someone has asked to flush the squeue and thus inserted a barrier, + * then we shouldn't schedule a drain.
+ */ + if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) { + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk, + vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, + VND_SQUEUE_TAG_MAC_FLOW_CONTROL); + } + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0); +} + +static void +vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0); +} + +static int +vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc) +{ + int ret; + dld_capab_direct_t d; + mac_perim_handle_t mph; + vnd_str_capab_t *c = &vsp->vns_caps; + + bzero(&d, sizeof (d)); + d.di_rx_cf = (uintptr_t)rxfunc; + d.di_rx_ch = vsp; + d.di_flags = DI_DIRECT_RAW; + + vnd_mac_enter(vsp, &mph); + + /* + * If we're coming in here for a second pass, we need to make sure that + * we remove an existing flow control notification callback, otherwise + * we'll create a duplicate that will remain with garbage data. + */ + if (c->vsc_tx_fc_hdl != NULL) { + ASSERT(c->vsc_set_fcb_hdl != NULL); + (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL, + c->vsc_tx_fc_hdl); + c->vsc_tx_fc_hdl = NULL; + } + + if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl, + DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) { + c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df; + c->vsc_tx_hdl = d.di_tx_dh; + c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df; + c->vsc_set_fcb_hdl = d.di_tx_cb_dh; + c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df; + c->vsc_is_fc_hdl = d.di_tx_fctl_dh; + c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, + vnd_mac_flow_control, vsp); + c->vsc_flags |= VNS_C_DIRECT; + ret = 0; + } else { + vsp->vns_errno = VND_E_DIRECTFAIL; + ret = 1; + } + vnd_mac_exit(vsp, mph); + return (ret); +} + +static int +vnd_st_capabq(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_capability_ack_t *cap; + dl_capability_sub_t *subp; + dl_capab_hcksum_t *hck; + dl_capab_dld_t *dld; + unsigned char *rp; + int ret = 0; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + + rp = mp->b_rptr; + cap = (dl_capability_ack_t *)rp; + if (cap->dl_sub_length == 0) + goto done; + + /* Don't try to process something too big */ + if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_CAPACKINVAL; + ret = 1; + goto done; + } + + rp += cap->dl_sub_offset; + + while (cap->dl_sub_length > 0) { + subp = (dl_capability_sub_t *)rp; + /* Sanity check something crazy from down below */ + if (subp->dl_length + sizeof (dl_capability_sub_t) > + cap->dl_sub_length) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_SUBCAPINVAL; + ret = 1; + goto done; + } + + switch (subp->dl_cap) { + case DL_CAPAB_HCKSUM: + hck = (dl_capab_hcksum_t *)(rp + + sizeof (dl_capability_sub_t)); + if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) { + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS; + break; + } + if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM; + vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags; + break; + case DL_CAPAB_DLD: + dld = 
(dl_capab_dld_t *)(rp + + sizeof (dl_capability_sub_t)); + if (dld->dld_version != DLD_CURRENT_VERSION) { + vsp->vns_errno = VND_E_DLDBADVERS; + ret = 1; + goto done; + } + if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_DLD; + vsp->vns_caps.vsc_capab_f = + (vnd_dld_cap_t)dld->dld_capab; + vsp->vns_caps.vsc_capab_hdl = + (void *)dld->dld_capab_handle; + /* + * At this point in time, we have to set up a direct + * function that drops all input. This validates that + * we'll be able to set up direct input and that we can + * easily switch it earlier to the real data function + * when we've plumbed everything up. + */ + if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) { + /* vns_errno set by vnd_dld_cap_enable */ + ret = 1; + goto done; + } + break; + default: + /* Ignore unsupported cap */ + break; + } + + rp += sizeof (dl_capability_sub_t) + subp->dl_length; + cap->dl_sub_length -= sizeof (dl_capability_sub_t) + + subp->dl_length; + } + +done: + /* Make sure we enabled direct callbacks */ + if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) { + vsp->vns_errno = VND_E_DIRECTNOTSUP; + ret = 1; + } + + freemsg(mp); + return (ret); +} + +static void +vnd_st_sonline(vnd_str_t *vsp) +{ + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + vsp->vns_state = VNS_S_ONLINE; + cv_broadcast(&vsp->vns_stcv); +} + +static void +vnd_st_shutdown(vnd_str_t *vsp) +{ + mac_perim_handle_t mph; + vnd_str_capab_t *vsc = &vsp->vns_caps; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * At this point in time we know that there is no one transmitting as + * our final reference has been torn down and that vnd_s_close inserted + * a barrier to validate that everything is flushed. + */ + if (vsc->vsc_flags & VNS_C_DIRECT) { + vnd_mac_enter(vsp, &mph); + vsc->vsc_flags &= ~VNS_C_DIRECT; + (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL, + vsc->vsc_tx_fc_hdl); + vsc->vsc_tx_fc_hdl = NULL; + (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT, + NULL, DLD_DISABLE); + vnd_mac_exit(vsp, mph); + } +} + +static boolean_t +vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + boolean_t ret = B_TRUE; + mblk_t *mp; + dl_promiscoff_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCOFF_REQ); + if (mp == NULL) { + cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " + "promiscoff request"); + ret = B_FALSE; + goto next; + } + + dprp = (dl_promiscoff_req_t *)mp->b_rptr; + dprp->dl_level = type; + + putnext(vsp->vns_wq, mp); +next: + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + return (ret); +} + +static void +vnd_st_promiscoff(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * Unlike other cases where we guard against the incoming packet being + * NULL, during tear down we try to keep driving and therefore we may + * have gotten here due to an earlier failure, so there's nothing to do. 
+ */ + mp = vnd_dlpi_inc_pop(vsp); + if (mp == NULL) + return; + + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promiscoff"); + return; + } + + if (cprim != DL_PROMISCOFF_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promiscoff: Got ack/nack for wrong primitive"); + return; + } + + if (prim == DL_ERROR_ACK) { + cmn_err(CE_WARN, "!failed to disable promiscuous mode during " + "vnd teardown"); + } +} + +static boolean_t +vnd_st_sunbind(vnd_str_t *vsp) +{ + mblk_t *mp; + boolean_t ret = B_TRUE; + + mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); + if (mp == NULL) { + cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " + "unbind request"); + ret = B_FALSE; + goto next; + } + + putnext(vsp->vns_wq, mp); +next: + vsp->vns_state = VNS_S_UNBIND_SENT; + cv_broadcast(&vsp->vns_stcv); + return (ret); +} + +static void +vnd_st_unbind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + /* + * Unlike other cases where we guard against the incoming packet being + * NULL, during tear down we try to keep driving and therefore we may + * have gotten here due to an earlier failure, so there's nothing to do. + */ + mp = vnd_dlpi_inc_pop(vsp); + if (mp == NULL) + goto next; + + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_unbind"); + goto next; + } + + if (cprim != DL_UNBIND_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_unbind: Got ack/nack for wrong primitive"); + goto next; + } + + if (prim == DL_ERROR_ACK) { + cmn_err(CE_WARN, "!failed to unbind stream during vnd " + "teardown"); + } + +next: + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); +} + +/* + * Perform state transitions. This is a one-way shot down the flow chart + * described in the big theory statement. + */ +static void +vnd_str_state_transition(void *arg) +{ + boolean_t died = B_FALSE; + vnd_str_t *vsp = arg; + mblk_t *mp; + + mutex_enter(&vsp->vns_lock); + if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL && + vsp->vns_state != VNS_S_SHUTTING_DOWN)) { + mutex_exit(&vsp->vns_lock); + return; + } + + /* + * When trying to shut down, or unwinding from a failed enabling, rather + * than immediately entering the ZOMBIE state, we may instead opt to try + * and enter the next state in the progression. This is especially + * important when trying to tear everything down.
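+ *
+ * As a sketch, the teardown progression that the switch below walks
+ * for a closing stream is:
+ *
+ *	VNS_S_SHUTTING_DOWN ->
+ *	VNS_S_MULTICAST_PROMISCOFF_SENT ->
+ *	VNS_S_SAP_PROMISCOFF_SENT ->
+ *	VNS_S_UNBIND_SENT ->
+ *	VNS_S_ZOMBIE
+ *
+ * where a failed allocation in one of the send routines simply
+ * advances the state and loops to process the next step immediately.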
+ */ +loop: + DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp, + vnd_str_state_t, vsp->vns_state); + switch (vsp->vns_state) { + case VNS_S_INITIAL: + VERIFY(vsp->vns_dlpi_inc == NULL); + if (vnd_st_sinfo(vsp) != 0) + died = B_TRUE; + break; + case VNS_S_INFO_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_info(vsp) == 0) { + if (vnd_st_sexclusive(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_EXCLUSIVE_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_exclusive(vsp) == 0) { + if (vsp->vns_dlpi_style == DL_STYLE2) { + if (vnd_st_sattach(vsp) != 0) + died = B_TRUE; + } else { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } + } else { + died = B_TRUE; + } + break; + case VNS_S_ATTACH_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_attach(vsp) == 0) { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_BIND_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_bind(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_SAP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI, + VNS_S_MULTI_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_MULTI_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY, + VNS_S_RX_ONLY_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_RX_ONLY_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS, + VNS_S_FIXUP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_FIXUP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_scapabq(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_CAPAB_Q_SENT: + if (vnd_st_capabq(vsp) != 0) + died = B_TRUE; + else + vnd_st_sonline(vsp); + break; + case VNS_S_SHUTTING_DOWN: + vnd_st_shutdown(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI, + VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_MULTICAST_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_SAP_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_sunbind(vsp) == B_FALSE) + goto loop; + break; + case VNS_S_UNBIND_SENT: + vnd_st_unbind(vsp); + break; + case VNS_S_ZOMBIE: + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vsp received data as a zombie"); + break; + default: + panic("vnd_str_t entered an unknown state"); + } + + if (died == B_TRUE) { + ASSERT(vsp->vns_errno != VND_E_SUCCESS); + vsp->vns_laststate = vsp->vns_state; + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); + } + + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_dlpi_taskq_dispatch(void *arg) +{ + vnd_str_t *vsp = arg; + int run = 1; + + while (run != 0) { + vnd_str_state_transition(vsp); + mutex_enter(&vsp->vns_lock); + if (vsp->vns_flags & VNS_F_CONDEMNED || + vsp->vns_dlpi_inc == NULL) { + run = 0; + vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED; + } + if (vsp->vns_flags & VNS_F_CONDEMNED) + cv_signal(&vsp->vns_cancelcv); + mutex_exit(&vsp->vns_lock); + } +} + +/* ARGSUSED */ +static 
int +vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getptmue(net_handle_t neti) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + size_t nelem, net_ifaddr_t type[], void *storage) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + zoneid_t *zid) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + uint64_t *flags) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_phylookup(net_handle_t neti, const char *name) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static lif_if_t +vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_route(net_handle_t neti, struct sockaddr *address, + struct sockaddr *next) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static int +vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +static net_protocol_t vnd_neti_info_v4 = { + NETINFO_VERSION, + NHF_VND_INET, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + +static net_protocol_t vnd_neti_info_v6 = { + NETINFO_VERSION, + NHF_VND_INET6, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + + +static int +vnd_netinfo_init(vnd_pnsd_t *nsp) +{ + nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v4); + ASSERT(nsp->vpnd_neti_v4 != NULL); + + nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v6); + ASSERT(nsp->vpnd_neti_v6 != NULL); + + nsp->vpnd_family_v4.hf_version = HOOK_VERSION; + nsp->vpnd_family_v4.hf_name = "vnd_inet"; + + if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) { + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_family_v6.hf_version = HOOK_VERSION; + nsp->vpnd_family_v6.hf_name = "vnd_inet6"; + + if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) { + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v4.he_version 
= HOOK_VERSION; + nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v4.he_flags = 0; + nsp->vpnd_event_in_v4.he_interested = B_FALSE; + + nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + if (nsp->vpnd_token_in_v4 == NULL) { + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v6.he_flags = 0; + nsp->vpnd_event_in_v6.he_interested = B_FALSE; + + nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + if (nsp->vpnd_token_in_v6 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v4.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v4.he_flags = 0; + nsp->vpnd_event_out_v4.he_interested = B_FALSE; + + nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_out_v4); + if (nsp->vpnd_token_out_v4 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v6.he_flags = 0; + nsp->vpnd_event_out_v6.he_interested = B_FALSE; + + nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_out_v6); + if (nsp->vpnd_token_out_v6 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + 
cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + return (0); +} + +static void +vnd_netinfo_shutdown(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); +} + +static void +vnd_netinfo_fini(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v4); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v6); + VERIFY(ret == 0); +} + +/* ARGSUSED */ +static void +vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + + VERIFY(bmp == &vsp->vns_barrierblk); + mutex_enter(&vsp->vns_lock); + VERIFY(vsp->vns_flags & VNS_F_BARRIER); + VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE)); + vsp->vns_flags |= VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * For better or worse, we have to broadcast here as we could have a + * thread that's blocked for completion as well as one that's blocked + * waiting to do a barrier itself. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * This is a data barrier for the stream while it is in fastpath mode. It blocks + * and ensures that there is nothing else in the squeue. + */ +static void +vnd_strbarrier(vnd_str_t *vsp) +{ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_flags & VNS_F_BARRIER) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags |= VNS_F_BARRIER; + mutex_exit(&vsp->vns_lock); + + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk, + vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER); + + mutex_enter(&vsp->vns_lock); + while (!(vsp->vns_flags & VNS_F_BARRIER_DONE)) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags &= ~VNS_F_BARRIER; + vsp->vns_flags &= ~VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * We have to broadcast in case anyone is waiting for the barrier + * themselves. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * Based on the type of message that we're dealing with we're going to want to + * do one of several things. Basically if it looks like it's something we know + * about, we should probably handle it in one of our transition threads. + * Otherwise, we should just simply putnext. 
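+ *
+ * Concretely, the dispatch below is roughly:
+ *
+ *	M_PROTO, M_PCPROTO	queue for the DLPI state machine via the
+ *				taskq, except DL_UNITDATA_*, which is
+ *				dropped
+ *	M_DATA			dropped; data reaches us through the MAC
+ *				direct callback rather than put(9E)
+ *	everything else		passed along via putnext(9F)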
+ */ +static int +vnd_s_rput(queue_t *q, mblk_t *mp) +{ + t_uscalar_t prim; + int dispatch = 0; + vnd_str_t *vsp = q->q_ptr; + + switch (DB_TYPE(mp)) { + case M_PROTO: + case M_PCPROTO: + if (MBLKL(mp) < sizeof (t_uscalar_t)) { + vnd_drop_ctl(vsp, mp, "PROTO message too short"); + break; + } + + prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; + if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) { + vnd_drop_ctl(vsp, mp, + "received an unsupported dlpi DATA req"); + break; + } + + /* + * Enqueue the entry and fire off a taskq dispatch. + */ + mutex_enter(&vsp->vns_lock); + vnd_dlpi_inc_push(vsp, mp); + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + dispatch = 1; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + } + mutex_exit(&vsp->vns_lock); + if (dispatch != 0) + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, + vsp, 0, &vsp->vns_tqe); + break; + case M_DATA: + vnd_drop_in(vsp, mp, "M_DATA via put(9E)"); + break; + default: + putnext(vsp->vns_rq, mp); + } + return (0); +} + +/* ARGSUSED */ +static void +vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp) +{ + int error; + vnd_strioc_t *visp; + + if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE || + iocp->ioc_count != TRANSPARENT) { + error = EINVAL; + goto nak; + } + + /* + * All streams ioctls that we support must use kcred as a means to + * distinguish that this is a layered open by the kernel as opposed to + * one by a user who has done an I_PUSH of the module. + */ + if (iocp->ioc_cr != kcred) { + error = EPERM; + goto nak; + } + + if (mp->b_cont == NULL) { + error = EAGAIN; + goto nak; + } + + visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP); + ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t)); + visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr; + visp->vs_state = VSS_COPYIN; + + mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL); + qreply(q, mp); + + return; + +nak: + if (mp->b_cont != NULL) { + freemsg(mp->b_cont); + mp->b_cont = NULL; + } + + iocp->ioc_error = error; + mp->b_datap->db_type = M_IOCNAK; + iocp->ioc_count = 0; + qreply(q, mp); +} + +static void +vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + vnd_str_state_t state; + struct copyreq *crp; + vnd_strioc_associate_t *vss; + vnd_dev_t *vdp = NULL; + vnd_pnsd_t *nsp = NULL; + char iname[2*VND_NAMELEN]; + zone_t *zone; + vnd_strioc_t *visp; + + visp = (vnd_strioc_t *)csp->cp_private; + + /* If it's not ours, it's not our problem */ + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); + } + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYIN failed"); + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* Data is sitting for us in b_cont */ + if (mp->b_cont == NULL || + MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) { + kmem_free(visp, sizeof (vnd_strioc_t)); + miocnak(q, mp, 0, EINVAL); + return; + } + + vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr; + vdp = vnd_dev_lookup(vss->vsa_minor); + if (vdp == NULL) { + vss->vsa_errno = VND_E_NODEV; + goto nak; + } + + nsp = vnd_nsd_lookup(vss->vsa_nsid); + if (nsp == NULL) { + vss->vsa_errno = VND_E_NONETSTACK; + goto nak; + } + + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) { + mutex_exit(&vsp->vns_lock); + vss->vsa_errno = 
VND_E_ASSOCIATED; + goto nak; + } + + vsp->vns_nsd = nsp; + vsp->vns_flags &= ~VNS_F_NEED_ZONE; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + mutex_exit(&vsp->vns_lock); + + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, + &vsp->vns_tqe); + + + /* At this point we need to wait until we have transitioned to ONLINE */ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + state = vsp->vns_state; + mutex_exit(&vsp->vns_lock); + + if (state == VNS_S_ZOMBIE) { + vss->vsa_errno = vsp->vns_errno; + goto nak; + } + + mutex_enter(&vdp->vdd_lock); + mutex_enter(&vsp->vns_lock); + VERIFY(vdp->vdd_str == NULL); + /* + * Now initialize the remaining kstat properties and let's go ahead and + * create it. + */ + (void) snprintf(iname, sizeof (iname), "z%d_%d", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor); + vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net", + KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); + if (vsp->vns_kstat == NULL) { + vss->vsa_errno = VND_E_KSTATCREATE; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + vdp->vdd_str = vsp; + vsp->vns_dev = vdp; + + /* + * Now, it's time to do the last thing that can fail, changing out the + * input function. After this we know that we can receive data, so we + * should make sure that we're ready. + */ + if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) { + vss->vsa_errno = VND_E_DIRECTFAIL; + vdp->vdd_str = NULL; + vsp->vns_dev = NULL; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + + zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid); + ASSERT(zone != NULL); + vsp->vns_kstat->ks_data = &vsp->vns_ksdata; + /* Account for zone name */ + vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1; + /* Account for eventual link name */ + vsp->vns_kstat->ks_data_size += VND_NAMELEN; + kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name); + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + zone_rele(zone); + kstat_install(vsp->vns_kstat); + + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + + /* + * Note that the vnd_str_t does not keep a permanent hold on the + * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what + * the netstack goes through to take care of everything. + */ + vss->vsa_errno = VND_E_SUCCESS; +nak: + if (vdp != NULL) + vnd_dev_rele(vdp); + if (nsp != NULL) + vnd_nsd_rele(nsp); + /* + * Change the copyin request to a copyout. Note that we can't use + * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's + * okay, as the copyin vs. copyout is basically the same.
+ */ + DB_TYPE(mp) = M_COPYOUT; + visp->vs_state = VSS_COPYOUT; + crp = (struct copyreq *)mp->b_rptr; + crp->cq_private = (void *)visp; + crp->cq_addr = visp->vs_addr; + crp->cq_size = sizeof (vnd_strioc_associate_t); + qreply(q, mp); +} + +static void +vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + ASSERT(csp->cp_private != NULL); + kmem_free(csp->cp_private, sizeof (vnd_strioc_t)); + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); + } + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYOUT failed"); + return; + } + + /* Ack and let's be done with it all */ + miocack(q, mp, 0, 0); +} + +static int +vnd_s_wput(queue_t *q, mblk_t *mp) +{ + vnd_str_t *vsp = q->q_ptr; + struct copyresp *crp; + vnd_strioc_state_t vstate; + vnd_strioc_t *visp; + + switch (DB_TYPE(mp)) { + case M_IOCTL: + vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr); + return (0); + case M_IOCDATA: + crp = (struct copyresp *)mp->b_rptr; + ASSERT(crp->cp_private != NULL); + visp = (vnd_strioc_t *)crp->cp_private; + vstate = visp->vs_state; + ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT); + if (vstate == VSS_COPYIN) + vnd_striocdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + else + vnd_stroutdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + return (0); + default: + break; + } + if (q->q_next != NULL) + putnext(q, mp); + else + vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput"); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp) +{ + vnd_str_t *vsp; + uint_t rand; + + if (q->q_ptr != NULL) + return (EINVAL); + + if (!(sflag & MODOPEN)) + return (ENXIO); + + if (credp != kcred) + return (EPERM); + + vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP); + bzero(vsp, sizeof (*vsp)); + mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL); + vsp->vns_state = VNS_S_INITIAL; + + mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_enter(&vnd_dev_lock); + vsp->vns_dq_read.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_read.vdq_vns = vsp; + vsp->vns_dq_write.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_write.vdq_vns = vsp; + mutex_exit(&vnd_dev_lock); + vsp->vns_rq = q; + vsp->vns_wq = WR(q); + q->q_ptr = WR(q)->q_ptr = vsp; + vsp->vns_flags = VNS_F_NEED_ZONE; + vsp->vns_nflush = vnd_flush_nburst; + vsp->vns_bsize = vnd_flush_burst_size; + + (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); + vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand); + + /* + * We create our kstat and initialize all of its fields now, but we + * don't install it until we actually do the zone association so we can + * get everything.
+ */ + kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms, + "flowcontrol_100ms", KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s", + KSTAT_DATA_UINT64); + qprocson(q); + /* + * Now that we've called qprocson, grab the lower module for making sure + * that we don't have any pass through modules. + */ + vsp->vns_lrq = RD(vsp->vns_wq->q_next); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_s_close(queue_t *q, int flag, cred_t *credp) +{ + vnd_str_t *vsp; + mblk_t *mp; + + VERIFY(WR(q)->q_next != NULL); + + vsp = q->q_ptr; + ASSERT(vsp != NULL); + + /* + * We need to transition ourselves down. This means that we have a few + * important different things to do in the process of tearing down our + * input and output buffers, making sure we've drained the current + * squeue, and disabling the fast path. Before we disable the fast path, + * we should make sure the squeue is drained. Because we're in streams + * close, we know that no packets can come into us from userland, but we + * can receive more. As such, the following is the exact order of things + * that we do: + * + * 1) flush the vns_dq_read + * 2) Insert the drain mblk + * 3) When it's been received, tear down the fast path by kicking + * off the state machine. 
+ * 4) One final flush of both the vns_dq_read,vns_dq_write + */ + + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_strbarrier(vsp); + mutex_enter(&vsp->vns_lock); + vsp->vns_state = VNS_S_SHUTTING_DOWN; + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, + 0, &vsp->vns_tqe); + } + while (vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + mutex_exit(&vsp->vns_lock); + + qprocsoff(q); + mutex_enter(&vsp->vns_lock); + vsp->vns_flags |= VNS_F_CONDEMNED; + while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) + cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock); + + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vnd_s_close"); + mutex_exit(&vsp->vns_lock); + + q->q_ptr = NULL; + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out); + mutex_destroy(&vsp->vns_dq_read.vdq_lock); + mutex_destroy(&vsp->vns_dq_write.vdq_lock); + + if (vsp->vns_kstat != NULL) + kstat_delete(vsp->vns_kstat); + mutex_destroy(&vsp->vns_lock); + cv_destroy(&vsp->vns_stcv); + cv_destroy(&vsp->vns_barriercv); + cv_destroy(&vsp->vns_cancelcv); + kmem_cache_free(vnd_str_cache, vsp); + + return (0); +} + +static vnd_mac_cookie_t +vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp) +{ + hrtime_t txtime; + vnd_mac_cookie_t vc; + + VND_STAT_INC(vsp, vks_opackets, 1); + VND_STAT_INC(vsp, vks_obytes, msgsize(mp)); + DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + /* Actually tx now */ + txtime = gethrtime(); + vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, + mp, 0, MAC_DROP_ON_NO_DESC); + + /* + * We need to check two different conditions before we immediately set + * the flow control lock. The first thing that we need to do is verify + * that this is an instance of hard flow control, so to say. The flow + * control callbacks won't always fire in cases where we still get a + * cookie returned. The explicit check for flow control will guarantee + * us that we'll get a subsequent notification callback. + * + * The second case comes about because we do not hold the + * vnd_str_t`vns_lock across calls to tx, we need to determine if a flow + * control notification already came across for us in a different thread + * calling vnd_mac_flow_control(). To deal with this, we record a + * timestamp every time that we change the flow control state. We grab + * txtime here before we transmit because that guarantees that the + * hrtime_t of the call to vnd_mac_flow_control() will be after txtime. + * + * If the flow control notification beat us to the punch, the value of + * vns_fcupdate will be larger than the value of txtime, and we should + * just record the statistics. However, if we didn't beat it to the + * punch (txtime > vns_fcupdate), then we know that it's safe to wait + * for a notification. 
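+ *
+ * A sketch of the second case, with time increasing downward:
+ *
+ *	tx thread (here)		MAC callback thread
+ *	----------------		-------------------
+ *	txtime = gethrtime();
+ *	vsc_tx_f() returns a cookie
+ *					vnd_mac_flow_control()
+ *					vns_fcupdate = gethrtime();
+ *	mutex_enter(&vns_lock);
+ *	vns_fcupdate > txtime, so the
+ *	blockage has already lifted and
+ *	we only record the statistics.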
+ */ + if (vc != NULL) { + hrtime_t diff; + + if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl, + vc) == 0) + return (NULL); + mutex_enter(&vsp->vns_lock); + diff = vsp->vns_fcupdate - txtime; + if (diff > 0) { + mutex_exit(&vsp->vns_lock); + vnd_mac_flow_control_stat(vsp, diff); + return (NULL); + } + vsp->vns_flags |= VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = vc; + vsp->vns_fclatch = txtime; + vsp->vns_fcupdate = txtime; + DTRACE_VND3(flow__blocked, vnd_str_t *, vsp, + uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc); + mutex_exit(&vsp->vns_lock); + } + + return (vc); +} + +/* ARGSUSED */ +static void +vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) +{ + mblk_t *mp; + int nmps; + size_t mptot, nflush, bsize; + boolean_t blocked, empty; + vnd_data_queue_t *vqp; + vnd_str_t *vsp = arg; + + mutex_enter(&vsp->vns_lock); + /* + * We either enter here via an squeue or via vnd_squeue_tx_append(). In + * the former case we need to mark that there is no longer an active + * user of the drain block. + */ + if (drain_mp != NULL) { + VERIFY(drain_mp == &vsp->vns_drainblk); + VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED); + vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED; + } + + /* + * If we're still flow controlled or under a flush barrier, nothing to + * do. + */ + if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) { + mutex_exit(&vsp->vns_lock); + return; + } + + nflush = vsp->vns_nflush; + bsize = vsp->vns_bsize; + mutex_exit(&vsp->vns_lock); + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. + */ + ht_begin_unsafe(); + + nmps = 0; + mptot = 0; + blocked = B_FALSE; + vqp = &vsp->vns_dq_write; + while (nmps < nflush && mptot <= bsize) { + mutex_enter(&vqp->vdq_lock); + if (vnd_dq_pop(vqp, &mp) == 0) { + mutex_exit(&vqp->vdq_lock); + break; + } + mutex_exit(&vqp->vdq_lock); + + nmps++; + mptot += msgsize(mp); + if (vnd_squeue_tx_one(vsp, mp) != NULL) { + blocked = B_TRUE; + break; + } + } + + ht_end_unsafe(); + + empty = vnd_dq_is_empty(&vsp->vns_dq_write); + + /* + * If the queue is not empty, we're not blocked, and there isn't a drain + * scheduled, put it into the squeue with the drain block and + * GSQUEUE_FILL. + */ + if (blocked == B_FALSE && empty == B_FALSE) { + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) { + mblk_t *mp = &vsp->vns_drainblk; + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, + mp, vnd_squeue_tx_drain, vsp, + GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN); + } + mutex_exit(&vsp->vns_lock); + } + + /* + * If we drained some amount of data, we need to signal the data queue. + */ + if (nmps > 0) { + cv_broadcast(&vsp->vns_dq_write.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT); + } +} + +/* ARGSUSED */ +static void +vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + vnd_data_queue_t *vqp = &vsp->vns_dq_write; + vnd_pnsd_t *nsp = vsp->vns_nsd; + size_t len = msgsize(mp); + + /* + * Before we append this packet, we should run it through the firewall + * rules. + */ + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6, + nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out, + vnd_drop_out) != 0) { + /* + * Because we earlier reserved space for this packet and it's + * not making the cut, we need to go through and unreserve that + * space. 
Also note that the message block will likely be freed + * by the time we return from vnd_hook so we cannot rely on it. + */ + mutex_enter(&vqp->vdq_lock); + vnd_dq_unreserve(vqp, len); + mutex_exit(&vqp->vdq_lock); + return; + } + + /* + * We earlier reserved space for this packet. So for now simply append + * it and call drain. We know that no other drain can be going on right + * now thanks to the squeue. + */ + mutex_enter(&vqp->vdq_lock); + (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic); + mutex_exit(&vqp->vdq_lock); + vnd_squeue_tx_drain(vsp, NULL, NULL, NULL); +} + +/* + * We need to see if this is a valid name of sorts for us. That means a few + * things. First off, we can't assume that what we've been given has actually + * been null terminated. More importantly, that it's a valid name as far as + * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We + * further constrain ourselves to simply alphanumeric characters and a few + * additional ones, ':', '-', and '_'. + */ +static int +vnd_validate_name(const char *buf, size_t buflen) +{ + int i, len; + + /* First make sure a null terminator exists */ + for (i = 0; i < buflen; i++) + if (buf[i] == '\0') + break; + len = i; + if (i == 0 || i == buflen) + return (0); + + for (i = 0; i < len; i++) + if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' && + buf[i] != '_') + return (0); + + return (1); +} + +static int +vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag) +{ + vnd_ioc_attach_t via; + vnd_strioc_associate_t vss; + vnd_pnsd_t *nsp; + zone_t *zonep; + zoneid_t zid; + char buf[2*VND_NAMELEN]; + int ret, rp; + + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0) + return (EFAULT); + via.via_errno = VND_E_SUCCESS; + + if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + + /* + * Only the global zone can request to create a device in a different + * zone. + */ + zid = crgetzoneid(credp); + if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 && + zid != via.via_zoneid) { + via.via_errno = VND_E_PERM; + ret = EIO; + goto errcopyout; + } + + if (via.via_zoneid == -1) + via.via_zoneid = zid; + + /* + * Establish the name we'll use now. We want to be extra paranoid about + * the device we're opening so check that now. + */ + if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) { + zonep = zone_find_by_id(via.via_zoneid); + if (zonep == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name, + via.via_name) >= sizeof (buf)) { + zone_rele(zonep); + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s", + zonep->zone_name, via.via_name); + zone_rele(zonep); + zonep = NULL; + } else { + if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >= + sizeof (buf)) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name); + } + + /* + * If our zone is dying then the netstack will have been removed from + * this list. 
+ */ + nsp = vnd_nsd_lookup_by_zid(via.via_zoneid); + if (nsp == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + + /* + * Note we set the attached handle even though we haven't actually + * finished the process of attaching the ldi handle. + */ + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) { + mutex_exit(&vdp->vdd_lock); + vnd_nsd_rele(nsp); + via.via_errno = VND_E_ATTACHED; + ret = EIO; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT; + ASSERT(vdp->vdd_cr == NULL); + crhold(credp); + vdp->vdd_cr = credp; + ASSERT(vdp->vdd_nsd == NULL); + vdp->vdd_nsd = nsp; + mutex_exit(&vdp->vdd_lock); + + /* + * Place an additional hold on the vnd_pnsd_t as we go through and do + * all of the rest of our work. This will be the hold that we keep for + * as long as this thing is attached. + */ + vnd_nsd_ref(nsp); + + ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr, + &vdp->vdd_ldih, vdp->vdd_ldiid); + if (ret != 0) { + if (ret == ENODEV) + via.via_errno = VND_E_NODATALINK; + goto err; + } + + /* + * Unfortunately the I_PUSH interface doesn't allow us a way to detect + * whether or not we're coming in from a layered device. We really want + * to make sure that a normal user can't push on our streams module. + * Currently the only idea I have for this is to make sure that the + * credp is kcred which is really terrible. + */ + ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL, + kcred, &rp); + if (ret != 0) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + via.via_errno = VND_E_STRINIT; + ret = EIO; + goto err; + } + + vss.vsa_minor = vdp->vdd_minor; + vss.vsa_nsid = nsp->vpnd_nsid; + + ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss, + FKIOCTL, kcred, &rp); + if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + if (ret == 0) { + via.via_errno = vss.vsa_errno; + ret = EIO; + } + goto err; + } + + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + + /* + * There's a chance that our netstack was condemned while we've had a + * hold on it. As such we need to check and if so, error out. + */ + if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) { + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + ret = EIO; + via.via_errno = VND_E_NOZONE; + goto err; + } + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_str != NULL); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + vdp->vdd_flags |= VND_D_ATTACHED; + (void) strlcpy(vdp->vdd_datalink, via.via_name, + sizeof (vdp->vdd_datalink)); + list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(nsp); + + return (0); + +err: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + vdp->vdd_nsd = NULL; + mutex_exit(&vdp->vdd_lock); + + /* + * We have two holds to drop here. One for our original reference and + * one for the hold this operation would have represented. 
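+ * Concretely, those are the hold taken by the vnd_nsd_lookup_by_zid()
+ * call above and the additional hold we took with vnd_nsd_ref() before
+ * starting the ldi work.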
+ */ + vnd_nsd_rele(nsp); + vnd_nsd_rele(nsp); +errcopyout: + if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0) + ret = EFAULT; + + return (ret); +} + +static int +vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret = 0; + vnd_ioc_link_t vil; + char mname[2*VND_NAMELEN]; + char **c; + vnd_dev_t *v; + zoneid_t zid; + + /* Not anyone can link something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0) + return (EFAULT); + + if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + c = vnd_reserved_names; + while (*c != NULL) { + if (strcmp(vil.vil_name, *c) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + c++; + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOTATTACHED; + goto errcopyout; + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto errcopyout; + } + + if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKED; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_LINK_INFLIGHT; + zid = vdp->vdd_nsd->vpnd_zid; + mutex_exit(&vdp->vdd_lock); + + if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >= + sizeof (mname)) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + mutex_enter(&vnd_dev_lock); + for (v = list_head(&vnd_dev_list); v != NULL; + v = list_next(&vnd_dev_list, v)) { + if (!(v->vdd_flags & VND_D_LINKED)) + continue; + + if (v->vdd_nsd->vpnd_zid == zid && + strcmp(v->vdd_lname, vil.vil_name) == 0) { + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKEXISTS; + goto error; + } + } + + /* + * We set the name and mark ourselves attached while holding the list + * lock to ensure that no other user can mistakingly find our name. + */ + (void) snprintf(mname, sizeof (mname), "z%d:%s", zid, + vil.vil_name); + mutex_enter(&vdp->vdd_lock); + + /* + * Because we dropped our lock, we need to double check whether or not + * the zone was marked as dying while we were here. If it hasn't, then + * it's safe for us to link it in. + */ + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto error; + } + + (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname)); + if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + ret = EIO; + vil.vil_errno = VND_E_MINORNODE; + } else { + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_flags |= VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + ret = 0; + } + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + if (ret == 0) { + /* + * Add a reference to represent that this device is linked into + * the file system name space to ensure that it doesn't + * disappear. + */ + vnd_dev_ref(vdp); + return (0); + } + +error: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_lname[0] = '\0'; + mutex_exit(&vdp->vdd_lock); + +errcopyout: + if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0) + ret = EFAULT; + return (ret); +} + +/* + * Common unlink function. 
This is used both from the ioctl path and from the + * netstack shutdown path. The caller is required to hold the mutex on the + * vnd_dev_t, but they basically will have it relinquished for them. The only + * thing the caller is allowed to do afterward is to potentially rele the + * vnd_dev_t if they have their own hold. Note that only the ioctl path has its + * own hold. + */ +static void +vnd_dev_unlink(vnd_dev_t *vdp) +{ + char mname[2*VND_NAMELEN]; + + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + (void) snprintf(mname, sizeof (mname), "z%d:%s", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname); + ddi_remove_minor_node(vnd_dip, mname); + vdp->vdd_lname[0] = '\0'; + vdp->vdd_flags &= ~VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + mutex_exit(&vdp->vdd_lock); + + /* + * This rele corresponds to the reference that we took in + * vnd_ioctl_link. + */ + vnd_dev_rele(vdp); +} + +static int +vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret; + zoneid_t zid; + vnd_ioc_unlink_t viu; + + /* Not anyone can unlink something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + zid = crgetzoneid(credp); + + if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0) + return (EFAULT); + + viu.viu_errno = VND_E_SUCCESS; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_NOTLINKED; + goto err; + } + VERIFY(vdp->vdd_flags & VND_D_ATTACHED); + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_PERM; + goto err; + } + + /* vnd_dev_unlink releases the vdp mutex for us */ + vnd_dev_unlink(vdp); + ret = 0; +err: + if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vdp->vdd_str->vns_dq_read.vdq_max = (size_t)vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, 
(void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +/* ARGSUSED */ +static int +vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + vnd_ioc_buf_t vib; + + mutex_enter(&vnd_dev_lock); + vib.vib_size = vnd_vdq_hard_max; + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + mutex_exit(&vdp->vdd_str->vns_lock); + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vdp->vdd_str->vns_dq_write.vdq_max = (size_t)vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min) +{ + vnd_ioc_buf_t vib; + + vib.vib_errno = 0; + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED) { + mutex_enter(&vdp->vdd_str->vns_lock); + if (min == B_TRUE) + vib.vib_size = vdp->vdd_str->vns_minwrite; + else + vib.vib_size = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + } else { + vib.vib_errno = VND_E_NOTATTACHED; + } + mutex_exit(&vdp->vdd_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode) +{ + int ret, nonblock, nwrite; + frameio_t *fio; + vnd_data_queue_t *vqp; + mblk_t *mp; + + fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr, + mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + 
frameio_free(fio);
+ return (EWOULDBLOCK);
+ }
+ while (vqp->vdq_cur == 0) {
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EINTR);
+ }
+ }
+ }
+
+ ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head,
+ &nwrite, mode & FKIOCTL);
+ if (ret != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (ret);
+ }
+
+ ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode);
+ if (ret != 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (ret);
+ }
+
+ while (nwrite > 0) {
+ (void) vnd_dq_pop(vqp, &mp);
+ freemsg(mp);
+ nwrite--;
+ }
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+
+ return (0);
+}
+
+static int
+vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode)
+{
+ frameio_t *fio;
+ int ret, nonblock, nframes, i, nread;
+ size_t maxwrite, minwrite, total, flen;
+ mblk_t *mp_chain, *mp, *nmp;
+ vnd_data_queue_t *vqp;
+
+ fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
+ if (fio == NULL)
+ return (EAGAIN);
+
+ ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode);
+ if (ret != 0) {
+ frameio_free(fio);
+ return (ret);
+ }
+
+ mutex_enter(&vdp->vdd_lock);
+ if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
+ mutex_exit(&vdp->vdd_lock);
+ frameio_free(fio);
+ return (ENXIO);
+ }
+ mutex_exit(&vdp->vdd_lock);
+
+ nonblock = mode & (FNONBLOCK | FNDELAY);
+
+ /*
+ * Make sure no single frame is larger than we can accept.
+ */
+ mutex_enter(&vdp->vdd_str->vns_lock);
+ minwrite = vdp->vdd_str->vns_minwrite;
+ maxwrite = vdp->vdd_str->vns_maxwrite;
+ mutex_exit(&vdp->vdd_str->vns_lock);
+
+ nframes = fio->fio_nvecs / fio->fio_nvpf;
+ total = 0;
+ for (i = 0; i < nframes; i++) {
+ flen = frameio_frame_length(fio,
+ &fio->fio_vecs[i*fio->fio_nvpf]);
+ if (flen < minwrite || flen > maxwrite) {
+ frameio_free(fio);
+ return (ERANGE);
+ }
+ total += flen;
+ }
+
+ vqp = &vdp->vdd_str->vns_dq_write;
+ mutex_enter(&vqp->vdq_lock);
+ while (vnd_dq_reserve(vqp, total) == 0) {
+ if (nonblock != 0) {
+ frameio_free(fio);
+ mutex_exit(&vqp->vdq_lock);
+ return (EAGAIN);
+ }
+ if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
+ mutex_exit(&vqp->vdq_lock);
+ frameio_free(fio);
+ return (EINTR);
+ }
+ }
+ mutex_exit(&vqp->vdq_lock);
+
+ /*
+ * We've reserved our space, let's copyin and go from here.
+ */
+ ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL);
+ if (ret != 0) {
+ frameio_free(fio);
+ vnd_dq_unreserve(vqp, total);
+ cv_broadcast(&vqp->vdq_ready);
+ pollwakeup(&vdp->vdd_ph, POLLOUT);
+ return (ret);
+ }
+
+ for (mp = mp_chain; mp != NULL; mp = nmp) {
+ nmp = mp->b_next;
+ mp->b_next = NULL;
+ gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
+ vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
+ VND_SQUEUE_TAG_VND_WRITE);
+ }
+
+ /*
+ * Update the frameio structure to indicate that we wrote those frames.
+ */
+ frameio_mark_consumed(fio, nread);
+ ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode);
+ frameio_free(fio);
+
+ return (ret);
+}
+
+static int
+vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode)
+{
+ const char *link;
+ uint32_t vers = 1;
+ ASSERT(MUTEX_HELD(&vdp->vdd_lock));
+
+ /*
+ * Copy all of the members out to userland.
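+ * Each member is pushed out with its own ddi_copyout() call: arg is a
+ * pointer into the caller's vnd_ioc_info_t, and passing mode & FKIOCTL
+ * lets the same code serve both user ioctl callers and kernel (FKIOCTL)
+ * callers.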
+ */ + if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (vdp->vdd_flags & VND_D_LINKED) + link = vdp->vdd_lname; + else + link = "<anonymous>"; + if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink, + sizeof (arg->vii_datalink), mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone, + sizeof (zoneid_t), mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +static int +vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode) +{ + vnd_ioc_list_t vl; + vnd_ioc_list32_t vl32; + zoneid_t zid; + vnd_dev_t *vdp; + vnd_ioc_info_t *vip; + int found, cancopy, ret; + + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + vl.vl_nents = vl32.vl_nents; + vl.vl_actents = vl32.vl_actents; + vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents; + } else { + if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t), + mode & FKIOCTL) != 0) + return (EFAULT); + } + + cancopy = vl.vl_nents; + vip = vl.vl_ents; + found = 0; + zid = crgetzoneid(credp); + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) && + (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) { + found++; + if (cancopy > 0) { + ret = vnd_ioctl_list_copy_info(vdp, vip, mode); + if (ret != 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return (ret); + } + cancopy--; + vip++; + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents, + sizeof (uint_t), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + + +/* ARGSUSED */ +static int +vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int ret; + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + ASSERT(m != 0); + + /* + * Make sure no one has come in on an ioctl from the strioc case. + */ + if ((cmd & VND_STRIOC) == VND_STRIOC) + return (ENOTTY); + + /* + * Like close, seems like if this minor isn't found, it's a programmer + * error somehow. 
+ */ + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + switch (cmd) { + case VND_IOC_ATTACH: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_attach(vdp, arg, credp, mode); + break; + case VND_IOC_LINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_link(vdp, arg, credp, mode); + break; + case VND_IOC_UNLINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_unlink(vdp, arg, credp, mode); + break; + case VND_IOC_GETRXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_getrxbuf(vdp, arg, mode); + break; + case VND_IOC_SETRXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_setrxbuf(vdp, arg, mode); + break; + case VND_IOC_GETTXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettxbuf(vdp, arg, mode); + break; + case VND_IOC_SETTXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_settxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMAXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + if (crgetzoneid(credp) != GLOBAL_ZONEID) { + ret = EPERM; + break; + } + ret = vnd_ioctl_getmaxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMINTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE); + break; + case VND_IOC_GETMAXTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE); + break; + case VND_IOC_FRAMEIO_READ: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_frameio_read(vdp, arg, mode); + break; + case VND_IOC_FRAMEIO_WRITE: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_frameio_write(vdp, arg, mode); + break; + case VND_IOC_LIST: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_list(arg, credp, mode); + break; + default: + ret = ENOTTY; + break; + } + + vnd_dev_rele(vdp); + return (ret); +} + +static int +vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + vnd_dev_t *vdp; + minor_t m; + zoneid_t zid; + + if (flag & (FEXCL | FNDELAY)) + return (ENOTSUP); + + if (otyp & OTYP_BLK) + return (ENOTSUP); + + zid = crgetzoneid(credp); + m = getminor(*devp); + + /* + * If we have an open of a non-zero instance then we need to look that + * up in our list of entries. + */ + if (m != 0) { + + /* + * We don't check for rawaccess globally as a user could be + * doing a list ioctl on the control node which doesn't require + * this privilege. + */ + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENOENT); + + /* + * We need to check to make sure that the user is allowed to + * open this node. At this point it should be an attached handle + * as that's all we're allowed to access. 
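+ * In practice that means the device must be linked, its zone must not
+ * be dying, and a caller outside the global zone must be opening a
+ * device that belongs to its own zone; anything else is reported as
+ * ENOENT below.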
+ */ + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (EBUSY); + } + + if (!(vdp->vdd_flags & VND_D_OPENED)) { + vdp->vdd_flags |= VND_D_OPENED; + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + } + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + + return (0); + } + + if (flag & FEXCL) + return (ENOTSUP); + + /* + * We need to clone ourselves and set up new a state. + */ + vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP); + bzero(vdp, sizeof (vnd_dev_t)); + + if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) { + kmem_cache_free(vnd_dev_cache, vdp); + return (EINVAL); + } + + vdp->vdd_minor = id_alloc(vnd_minors); + mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL); + list_link_init(&vdp->vdd_link); + vdp->vdd_ref = 1; + *devp = makedevice(getmajor(*devp), vdp->vdd_minor); + vdp->vdd_devid = *devp; + DTRACE_VND_REFINC(vdp); + vdp->vdd_flags |= VND_D_OPENED; + + mutex_enter(&vnd_dev_lock); + list_insert_head(&vnd_dev_list, vdp); + mutex_exit(&vnd_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + if (m == 0) + return (ENXIO); + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_flags & VND_D_OPENED); + vdp->vdd_flags &= ~VND_D_OPENED; + mutex_exit(&vdp->vdd_lock); + + /* Remove the hold from the previous open. */ + vnd_dev_rele(vdp); + + /* And now from lookup */ + vnd_dev_rele(vdp); + return (0); +} + +/* ARGSUSED */ +static int +vnd_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error = 0; + size_t mpsize; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + mblk_t *mp = NULL; + offset_t u_loffset; + + /* + * If we have more than one uio we refuse to do anything. That's for + * frameio. 
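+ * Callers that want vectored, multi-frame I/O are expected to use the
+ * VND_IOC_FRAMEIO_READ and VND_IOC_FRAMEIO_WRITE ioctls instead.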
+ */ + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + error = EWOULDBLOCK; + goto err; + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + error = EINTR; + goto err; + } + } + } + + /* Ensure our buffer is big enough */ + mp = vqp->vdq_head; + ASSERT(mp != NULL); + mpsize = msgsize(mp); + if (mpsize > uiop->uio_resid) { + error = EOVERFLOW; + goto err; + } + + u_loffset = uiop->uio_loffset; + while (mp != NULL) { + if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) { + error = EFAULT; + uiop->uio_loffset = u_loffset; + mp = NULL; + goto err; + } + mpsize -= MBLKL(mp); + mp = mp->b_cont; + } + ASSERT(mpsize == 0); + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); +err: + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + + return (error); +} + +/* ARGSUSED */ +static int +vnd_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error; + vnd_dev_t *vdp; + mblk_t *mp; + ssize_t iosize, origsize; + vnd_data_queue_t *vqp; + + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + mutex_enter(&vdp->vdd_str->vns_lock); + if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite || + uiop->uio_resid < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + vnd_dev_rele(vdp); + return (ERANGE); + } + mutex_exit(&vdp->vdd_str->vns_lock); + VERIFY(vdp->vdd_str != NULL); + + /* + * Reserve space in the data queue if we can. If we can't, block or + * return EAGAIN. If we can, go and squeue_enter. + */ + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * Now that we've reserved the space, try to allocate kernel space for + * and copy in the block. To take care of all this we use the + * strmakedata subroutine for now. + */ + origsize = iosize = uiop->uio_resid; + error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0, + &mp); + + /* + * strmakedata() will return an error or it may only consume a portion + * of the data. 
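+ * In either case the frame is unusable, so we hand back the queue space
+ * we reserved, wake anything waiting on that space, and fail the write
+ * with ENOSR.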
+ */ + if (error != 0 || uiop->uio_resid != 0) { + vnd_dq_unreserve(vqp, origsize); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + vnd_dev_rele(vdp); + return (ENOSR); + } + + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + + vnd_dev_rele(vdp); + return (0); +} + +static int +vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + short ready = 0; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + if ((events & POLLIN) || (events & POLLRDNORM)) { + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head != NULL) + ready |= events & (POLLIN | POLLRDNORM); + mutex_exit(&vqp->vdq_lock); + } + + if (events & POLLOUT) { + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_cur != vqp->vdq_max) + ready |= POLLOUT; + mutex_exit(&vqp->vdq_lock); + } + + if ((ready == 0 && !anyyet) || (events & POLLET)) { + *phpp = &vdp->vdd_ph; + } + *reventsp = ready; + vnd_dev_rele(vdp); + return (0); +} + +/* ARGSUSED */ +static void * +vnd_stack_init(netstackid_t stackid, netstack_t *ns) +{ + vnd_pnsd_t *nsp; + + nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP); + bzero(nsp, sizeof (*nsp)); + nsp->vpnd_nsid = stackid; + nsp->vpnd_zid = netstackid_to_zoneid(stackid); + nsp->vpnd_flags = 0; + mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_nslink)); + if (vnd_netinfo_init(nsp) == 0) + nsp->vpnd_hooked = B_TRUE; + + mutex_enter(&vnd_dev_lock); + list_insert_tail(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + return (nsp); +} + +/* ARGSUSED */ +static void +vnd_stack_shutdown(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + vnd_dev_t *vdp; + + ASSERT(nsp != NULL); + /* + * After shut down no one should be able to find their way to this + * netstack again. + */ + mutex_enter(&vnd_dev_lock); + list_remove(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + /* + * Make sure hooks know that they're going away. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_shutdown(nsp); + + /* + * Now we need to go through and notify each zone that they are in + * teardown phase. See the big theory statement section on vnd, zones, + * netstacks, and sdev for more information about this. + */ + mutex_enter(&nsp->vpnd_lock); + nsp->vpnd_flags |= VND_NS_CONDEMNED; + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_CONDEMNED)) + vdp->vdd_flags |= VND_D_ZONE_DYING; + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + /* + * Next we remove all the links as we know nothing new can be added to + * the list and that none of the extent devices can obtain additional + * links. + */ +restart: + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_CONDEMNED) || + !(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + continue; + } + + /* + * We drop our lock here and restart afterwards. 
Note that as + * part of unlinking we end up doing a rele of the vnd_dev_t. If + * this is the final hold on the vnd_dev_t then it might try and + * remove itself. Our locking rules requires not to be holding + * any locks when we call any of the rele functions. + * + * Note that the unlink function requires holders to call into + * it with the vnd_dev_t->vdd_lock held and will take care of it + * for us. Because we don't have a hold on it, we're done at + * this point. + */ + mutex_exit(&nsp->vpnd_lock); + /* Forcibly unlink */ + vnd_dev_unlink(vdp); + goto restart; + } + mutex_exit(&nsp->vpnd_lock); +} + +/* ARGSUSED */ +static void +vnd_stack_destroy(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + + ASSERT(nsp != NULL); + + /* + * Now that we've unlinked everything we just have to hang out for + * it to finish exiting. Now that it's no longer the kernel itself + * that's doing this we just need to wait for our reference count to + * equal zero and then we're free. If the global zone is holding open a + * reference to a vnd device for another zone, that's bad, but there's + * nothing much we can do. See the section on 'vnd, zones, netstacks' in + * the big theory statement for more information. + */ + mutex_enter(&nsp->vpnd_lock); + while (nsp->vpnd_ref != 0) + cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock); + mutex_exit(&nsp->vpnd_lock); + + /* + * During shutdown we removed ourselves from the list and now we have no + * more references so we can safely say that there is nothing left and + * destroy everything that we had sitting around. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_fini(nsp); + + mutex_destroy(&nsp->vpnd_lock); + list_destroy(&nsp->vpnd_dev_list); + kmem_cache_free(vnd_pnsd_cache, nsp); +} + +/* + * Convert a node with a name of the form /dev/vnd/zone/%zonename and + * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack. 
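+ * For example, given a hypothetical VCHR node at
+ * /dev/vnd/zone/myzone/net0, we strip the trailing "/net0" component
+ * and then take the last remaining component, "myzone", as the name of
+ * the zone whose netstack we look up.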
+ */ +static vnd_pnsd_t * +vnd_sdev_ctx_to_ns(sdev_ctx_t ctx) +{ + enum vtype vt; + const char *path = sdev_ctx_path(ctx); + char *zstart, *dup; + size_t duplen; + vnd_pnsd_t *nsp; + + vt = sdev_ctx_vtype(ctx); + ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0); + + if (vt == VDIR) { + zstart = strrchr(path, '/'); + ASSERT(zstart != NULL); + zstart++; + return (vnd_nsd_lookup_by_zonename(zstart)); + } + + ASSERT(vt == VCHR); + + dup = strdup(path); + duplen = strlen(dup) + 1; + zstart = strrchr(dup, '/'); + *zstart = '\0'; + zstart--; + zstart = strrchr(dup, '/'); + zstart++; + nsp = vnd_nsd_lookup_by_zonename(zstart); + kmem_free(dup, duplen); + + return (nsp); +} + +static sdev_plugin_validate_t +vnd_sdev_validate_dir(sdev_ctx_t ctx) +{ + vnd_pnsd_t *nsp; + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0) + return (SDEV_VTOR_VALID); + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) { + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + return (SDEV_VTOR_VALID); + } + + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (SDEV_VTOR_INVALID); + vnd_nsd_rele(nsp); + + return (SDEV_VTOR_VALID); +} + +static sdev_plugin_validate_t +vnd_sdev_validate(sdev_ctx_t ctx) +{ + enum vtype vt; + vnd_dev_t *vdp; + minor_t minor; + + vt = sdev_ctx_vtype(ctx); + if (vt == VDIR) + return (vnd_sdev_validate_dir(ctx)); + ASSERT(vt == VCHR); + + if (strcmp("ctl", sdev_ctx_name(ctx)) == 0) + return (SDEV_VTOR_VALID); + + if (sdev_ctx_minor(ctx, &minor) != 0) + return (SDEV_VTOR_STALE); + + vdp = vnd_dev_lookup(minor); + if (vdp == NULL) + return (SDEV_VTOR_STALE); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED) || + (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_VALID); +} + +/* + * This function is a no-op. sdev never has holds on our devices as they can go + * away at any time and specfs has to deal with that fact. + */ +/* ARGSUSED */ +static void +vnd_sdev_inactive(sdev_ctx_t ctx) +{ +} + +static int +vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx) +{ + int ret; + vnd_dev_t *vdp; + + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_LINKED) && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + ret = sdev_plugin_mknod(ctx, vdp->vdd_lname, + VND_SDEV_MODE, vdp->vdd_devid); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&nsp->vpnd_lock); + vnd_nsd_rele(nsp); + return (ret); + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + return (0); +} + +static int +vnd_sdev_filldir_root(sdev_ctx_t ctx) +{ + zoneid_t zid; + vnd_pnsd_t *nsp; + int ret; + + zid = getzoneid(); + nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid)); + ASSERT(nsp != NULL); + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + if (ret != 0) + return (ret); + + /* + * Checking the zone id is not sufficient as the global zone could be + * reaching down into a non-global zone's mounted /dev. 
+ */ + if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) { + ret = sdev_plugin_mkdir(ctx, "zone"); + if (ret != 0 && ret != EEXIST) + return (ret); + } + + /* + * Always add a reference to the control node. There's no need to + * reference it since it always exists and is always what we clone from. + */ + ret = sdev_plugin_mknod(ctx, "ctl", VND_SDEV_MODE, + makedevice(ddi_driver_major(vnd_dip), 0)); + if (ret != 0 && ret != EEXIST) + return (ret); + + return (0); +} + +static int +vnd_sdev_filldir_zroot(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + zone_t *zonep; + + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + mutex_enter(&nsp->vpnd_lock); + if (list_is_empty(&nsp->vpnd_dev_list)) { + mutex_exit(&nsp->vpnd_lock); + continue; + } + mutex_exit(&nsp->vpnd_lock); + zonep = zone_find_by_id(nsp->vpnd_zid); + /* + * This zone must be being torn down, so skip it. + */ + if (zonep == NULL) + continue; + ret = sdev_plugin_mkdir(ctx, zonep->zone_name); + zone_rele(zonep); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vnd_dev_lock); + return (ret); + } + } + mutex_exit(&vnd_dev_lock); + return (0); +} + +static int +vnd_sdev_filldir(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + + ASSERT(sdev_ctx_vtype(ctx) == VDIR); + if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_root(ctx)); + + if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_zroot(ctx)); + + ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx), + strlen(VND_SDEV_ZROOT)) == 0); + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (0); + + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + + return (ret); +} + +static sdev_plugin_ops_t vnd_sdev_ops = { + SDEV_PLUGIN_VERSION, + SDEV_PLUGIN_SUBDIR, + vnd_sdev_validate, + vnd_sdev_filldir, + vnd_sdev_inactive +}; + +static int +vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int errp = 0; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Only allow one instance. 
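+ * vnd_dip is the single global devinfo handle that the rest of the
+ * driver (minor node creation, the sdev plugin, vnd_info()) relies on,
+ * so a second attach would have nothing sensible to bind to.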
+ */ + if (vnd_dip != NULL) + return (DDI_FAILURE); + + vnd_dip = dip; + if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) != + DDI_SUCCESS) { + vnd_dip = NULL; + return (DDI_FAILURE); + } + + if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, + DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { + ddi_remove_minor_node(vnd_dip, NULL); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops, + &errp); + if (vnd_sdev_hdl == NULL) { + ddi_remove_minor_node(vnd_dip, NULL); + ddi_prop_remove_all(vnd_dip); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY); + + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&vnd_dev_lock); + if (!list_is_empty(&vnd_dev_list)) { + mutex_exit(&vnd_dev_lock); + return (DDI_FAILURE); + } + mutex_exit(&vnd_dev_lock); + + return (DDI_FAILURE); +} + +/* ARGSUSED */ +static int +vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vnd_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + + + +static void +vnd_ddi_fini(void) +{ + netstack_unregister(NS_VND); + if (vnd_taskq != NULL) + taskq_destroy(vnd_taskq); + if (vnd_str_cache != NULL) + kmem_cache_destroy(vnd_str_cache); + if (vnd_dev_cache != NULL) + kmem_cache_destroy(vnd_dev_cache); + if (vnd_pnsd_cache != NULL) + kmem_cache_destroy(vnd_pnsd_cache); + if (vnd_minors != NULL) + id_space_destroy(vnd_minors); + if (vnd_list_init != 0) { + list_destroy(&vnd_nsd_list); + list_destroy(&vnd_dev_list); + mutex_destroy(&vnd_dev_lock); + vnd_list_init = 0; + } + frameio_fini(); +} + +static int +vnd_ddi_init(void) +{ + if (frameio_init() != 0) + return (DDI_FAILURE); + + vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_str_cache == NULL) { + frameio_fini(); + return (DDI_FAILURE); + } + vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_dev_cache == NULL) { + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache", + sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_pnsd_cache == NULL) { + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0); + if (vnd_taskq == NULL) { + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX); + if (vnd_minors == NULL) { + taskq_destroy(vnd_taskq); + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&vnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_link)); + list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t), + offsetof(vnd_pnsd_t, vpnd_link)); + vnd_list_init = 1; + + 
netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown, + vnd_stack_destroy); + + return (DDI_SUCCESS); +} + +static struct module_info vnd_minfo = { + 0, /* module id */ + "vnd", /* module name */ + 1, /* smallest packet size */ + INFPSZ, /* largest packet size (infinite) */ + 1, /* high watermark */ + 0 /* low watermark */ +}; + +static struct qinit vnd_r_qinit = { + vnd_s_rput, + NULL, + vnd_s_open, + vnd_s_close, + NULL, + &vnd_minfo, + NULL +}; + +static struct qinit vnd_w_qinit = { + vnd_s_wput, + NULL, + NULL, + NULL, + NULL, + &vnd_minfo, + NULL +}; + +static struct streamtab vnd_strtab = { + &vnd_r_qinit, + &vnd_w_qinit, + NULL, + NULL +}; + + +static struct cb_ops vnd_cb_ops = { + vnd_open, /* open */ + vnd_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + vnd_read, /* read */ + vnd_write, /* write */ + vnd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + vnd_chpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops vnd_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + vnd_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + vnd_attach, /* attach */ + vnd_detach, /* detach */ + nodev, /* reset */ + &vnd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +static struct modldrv vnd_modldrv = { + &mod_driverops, + "Virtual Networking Datapath Driver", + &vnd_dev_ops +}; + +static struct fmodsw vnd_fmodfsw = { + "vnd", + &vnd_strtab, + D_NEW | D_MP +}; + +static struct modlstrmod vnd_modlstrmod = { + &mod_strmodops, + "Virtual Networking Datapath Driver", + &vnd_fmodfsw +}; + +static struct modlinkage vnd_modlinkage = { + MODREV_1, + &vnd_modldrv, + &vnd_modlstrmod, + NULL +}; + +int +_init(void) +{ + int error; + + /* + * We need to do all of our global initialization in init as opposed to + * attach and detach. The problem here is that because vnd can be used + * from a stream context while being detached, we can not rely on having + * run attach to create everything, alas. so it goes in _init, just like + * our friend ip. + */ + if ((error = vnd_ddi_init()) != DDI_SUCCESS) + return (error); + error = mod_install((&vnd_modlinkage)); + if (error != 0) + vnd_ddi_fini(); + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vnd_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&vnd_modlinkage); + if (error == 0) + vnd_ddi_fini(); + return (error); +} diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf new file mode 100644 index 0000000000..65872e1ddf --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+# + +name="vnd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c index d671153967..e532a551e7 100644 --- a/usr/src/uts/common/io/vnic/vnic_dev.c +++ b/usr/src/uts/common/io/vnic/vnic_dev.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -354,7 +354,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, rw_enter(&vnic_lock, RW_WRITER); - /* does a VNIC with the same id already exist? */ + /* Does a VNIC with the same id already exist? */ err = mod_hash_find(vnic_hash, VNIC_HASH_KEY(vnic_id), (mod_hash_val_t *)&vnic); if (err == 0) { @@ -370,6 +370,7 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, bzero(vnic, sizeof (*vnic)); + vnic->vn_ls = LINK_STATE_UNKNOWN; vnic->vn_id = vnic_id; vnic->vn_link_id = linkid; vnic->vn_vrid = vrid; @@ -455,6 +456,20 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, } else { vnic->vn_hcksum_txflags = 0; } + + /* + * Check for LSO capabilities. LSO implementations + * depend on hardware checksumming, so the same + * requirement is enforced here. + */ + if (vnic->vn_hcksum_txflags != 0) { + if (!mac_capab_get(vnic->vn_lower_mh, MAC_CAPAB_LSO, + &vnic->vn_cap_lso)) { + vnic->vn_cap_lso.lso_flags = 0; + } + } else { + vnic->vn_cap_lso.lso_flags = 0; + } } /* register with the MAC module */ @@ -580,11 +595,12 @@ vnic_dev_create(datalink_id_t vnic_id, datalink_id_t linkid, vnic->vn_enabled = B_TRUE; if (is_anchor) { - mac_link_update(vnic->vn_mh, LINK_STATE_UP); + vnic->vn_ls = LINK_STATE_UP; } else { - mac_link_update(vnic->vn_mh, - mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE)); + vnic->vn_ls = mac_client_stat_get(vnic->vn_mch, + MAC_STAT_LINK_STATE); } + mac_link_update(vnic->vn_mh, vnic->vn_ls); rw_exit(&vnic_lock); @@ -824,6 +840,15 @@ vnic_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) HCKSUM_INET_PARTIAL); break; } + case MAC_CAPAB_LSO: { + mac_capab_lso_t *cap_lso = cap_data; + + if (vnic->vn_cap_lso.lso_flags == 0) { + return (B_FALSE); + } + *cap_lso = vnic->vn_cap_lso; + break; + } case MAC_CAPAB_VNIC: { mac_capab_vnic_t *vnic_capab = cap_data; @@ -1092,6 +1117,34 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, err = vnic_set_secondary_macs(vn, &msa); break; } + case MAC_PROP_PRIVATE: { + long val, i; + const char *v; + + if (vn->vn_link_id != DATALINK_INVALID_LINKID || + strcmp(pr_name, "_linkstate") != 0) { + err = ENOTSUP; + break; + } + + for (v = pr_val, i = 0; i < pr_valsize; i++, v++) { + if (*v == '\0') + break; + } + if (i == pr_valsize) { + err = EINVAL; + break; + } + + (void) ddi_strtol(pr_val, (char **)NULL, 0, &val); + if (val != LINK_STATE_UP && val != LINK_STATE_DOWN) { + err = EINVAL; + break; + } + vn->vn_ls = val; + mac_link_update(vn->vn_mh, vn->vn_ls); + break; + } default: err = ENOTSUP; break; @@ -1117,6 +1170,18 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, case MAC_PROP_SECONDARY_ADDRS: ret = vnic_get_secondary_macs(vn, pr_valsize, pr_val); break; + case MAC_PROP_PRIVATE: + if (vn->vn_link_id != DATALINK_INVALID_LINKID) { + ret = EINVAL; + break; + } + + if (strcmp(pr_name, "_linkstate") != 0) { + ret = EINVAL; + break; + } + (void) snprintf(pr_val, pr_valsize, "%d", vn->vn_ls); + break; default: ret = ENOTSUP; break; @@ -1126,7 +1191,8 @@ vnic_m_getprop(void 
*arg, const char *pr_name, mac_prop_id_t pr_num, } /* ARGSUSED */ -static void vnic_m_propinfo(void *m_driver, const char *pr_name, +static void +vnic_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, mac_prop_info_handle_t prh) { vnic_t *vn = m_driver; @@ -1169,6 +1235,18 @@ static void vnic_m_propinfo(void *m_driver, const char *pr_name, mac_perim_exit(mph); } break; + case MAC_PROP_PRIVATE: + if (vn->vn_link_id != DATALINK_INVALID_LINKID) + break; + + if (strcmp(pr_name, "_linkstate") == 0) { + char buf[16]; + + mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); + (void) snprintf(buf, sizeof (buf), "%d", vn->vn_ls); + mac_prop_info_set_default_str(prh, buf); + } + break; } } @@ -1241,8 +1319,9 @@ vnic_notify_cb(void *arg, mac_notify_type_t type) break; case MAC_NOTE_LINK: - mac_link_update(vnic->vn_mh, - mac_client_stat_get(vnic->vn_mch, MAC_STAT_LINK_STATE)); + vnic->vn_ls = mac_client_stat_get(vnic->vn_mch, + MAC_STAT_LINK_STATE); + mac_link_update(vnic->vn_mh, vnic->vn_ls); break; default: diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c new file mode 100644 index 0000000000..2da310ab8d --- /dev/null +++ b/usr/src/uts/common/io/zfd.c @@ -0,0 +1,1154 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + +/* + * Zone File Descriptor Driver. + * + * This driver is derived from the zcons driver which is in turn derived from + * the pts/ptm drivers. The purpose is to expose file descriptors within the + * zone which are connected to zoneadmd and used for logging or an interactive + * connection to a process within the zone. + * + * Its implementation is straightforward. Each instance of the driver + * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd + * uses these devices unidirectionally to provide stdin, stdout and stderr to + * the process within the zone. + * + * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd, + * using the devctl framework; thus the driver does not need to maintain any + * sort of "admin" node. + * + * The driver shuttles I/O from master side to slave side and back. In a break + * from the pts/ptm semantics, if one side is not open, I/O directed towards + * it will simply be discarded. This is so that if zoneadmd is not holding the + * master side fd open (i.e. it has died somehow), processes in the zone do not + * experience any errors and I/O to the fd does not cause the process to hang. + * + * The driver can also act as a multiplexer so that data written to the + * slave side within the zone is also redirected back to another zfd device + * inside the zone for consumption (i.e. it can be read). The intention is + * that a logging process within the zone can consume data that is being + * written by an application onto the primary stream. This is essentially + * a tee off of the primary stream into a log stream. This tee can also be + * configured to be flow controlled via an ioctl. 
Flow control happens on the + * primary stream and is used to ensure that the log stream receives all of + * the messages off the primary stream when consumption of the data off of + * the log stream gets behind. Configuring for flow control implies that the + * application writing to the primary stream will be blocked when the log + * consumer gets behind. Note that closing the log stream (e.g. when the zone + * halts) will cause the loss of all messages queued in the stream. + * + * The zone's zfd device configuration is driven by zoneadmd and a zone mode. + * The mode, which is controlled by the zone attribute "zlog-mode" is somewhat + * of a misnomer since its purpose has evolved. The attribute can have a + * variety of values, but the lowest two positions are used to control how many + * zfd devices are created inside the zone and if the primary stream is a tty. + * + * Here is a summary of how the 4 modes control what zfd devices are created + * and how they're used: + * + * t-: 1 stdio zdev (0) configured as a tty + * --: 3 stdio zdevs (0, 1, 2), not configured as a tty + * tn: 1 stdio zdev (0) configured as a tty, 1 additional zdev (1) + * -n: 3 stdio zdevs (0, 1, 2), not tty, 2 additional zdevs (3, 4) + * + * With the 't' flag set, stdin/out/err is multiplexed onto a single full-duplex + * stream which is configured as a tty. That is, ptem, ldterm and ttycompat are + * autopushed onto the stream when the slave side is opened. There is only a + * single zfd dev (0) needed for the primary stream. + * + * When the 'n' flag is set, it is assumed that output logging will be done + * within the zone itself. In this configuration 1 or 2 additional zfd devices, + * depending on tty mode ('t' flag) are created within the zone. An application + * can then configure the zfd streams driver into a multiplexer. Output from + * the stdout/stderr zfd(s) will be teed into the correspond logging zfd(s) + * within the zone. + * + * The following is a diagram of how this works for a '-n' configuration: + * + * + * zoneadmd (for zlogin -I stdout) + * GZ: ^ + * | + * -------------------------- + * ^ + * NGZ: | + * app >1 -> zfd1 -> zfd3 -> logger (for logger to consume app's stdout) + * + * There would be a similar path for the app's stderr into zfd4 for the logger + * to consume stderr. + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/cred.h> +#include <sys/ddi.h> +#include <sys/debug.h> +#include <sys/devops.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kstr.h> +#include <sys/modctl.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/stream.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/zfd.h> +#include <sys/vnode.h> +#include <sys/fs/snode.h> +#include <sys/zone.h> +#include <sys/sdt.h> + +static kmutex_t zfd_mux_lock; + +static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int zfd_attach(dev_info_t *, ddi_attach_cmd_t); +static int zfd_detach(dev_info_t *, ddi_detach_cmd_t); + +static int zfd_open(queue_t *, dev_t *, int, int, cred_t *); +static int zfd_close(queue_t *, int, cred_t *); +static void zfd_wput(queue_t *, mblk_t *); +static void zfd_rsrv(queue_t *); +static void zfd_wsrv(queue_t *); + +/* + * The instance number is encoded in the dev_t in the minor number; the lowest + * bit of the minor number is used to track the master vs. slave side of the + * fd. 
The rest of the bits in the minor number are the instance. + */ +#define ZFD_MASTER_MINOR 0 +#define ZFD_SLAVE_MINOR 1 + +#define ZFD_INSTANCE(x) (getminor((x)) >> 1) +#define ZFD_NODE(x) (getminor((x)) & 0x01) + +/* + * This macro converts a zfd_state_t pointer to the associated slave minor + * node's dev_t. + */ +#define ZFD_STATE_TO_SLAVEDEV(x) \ + (makedevice(ddi_driver_major((x)->zfd_devinfo), \ + (minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR))) + +int zfd_debug = 0; +#define DBG(a) if (zfd_debug) cmn_err(CE_NOTE, a) +#define DBG1(a, b) if (zfd_debug) cmn_err(CE_NOTE, a, b) + +/* + * ZFD Pseudo Terminal Module: stream data structure definitions, + * based on zcons. + */ +static struct module_info zfd_info = { + 0x20FD, /* ZOFD - 8445 */ + "zfd", + 0, /* min packet size */ + INFPSZ, /* max packet size - infinity */ + 2048, /* high water */ + 128 /* low water */ +}; + +static struct qinit zfd_rinit = { + NULL, + (int (*)()) zfd_rsrv, + zfd_open, + zfd_close, + NULL, + &zfd_info, + NULL +}; + +static struct qinit zfd_winit = { + (int (*)()) zfd_wput, + (int (*)()) zfd_wsrv, + NULL, + NULL, + NULL, + &zfd_info, + NULL +}; + +static struct streamtab zfd_tab_info = { + &zfd_rinit, + &zfd_winit, + NULL, + NULL +}; + +#define ZFD_CONF_FLAG (D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL) + +/* + * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops) + */ +DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \ + nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \ + ddi_quiesce_not_needed); + +/* + * Module linkage information for the kernel. + */ + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module (this is a pseudo driver) */ + "Zone FD driver", /* description of module */ + &zfd_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +typedef enum { + ZFD_NO_MUX, + ZFD_PRIMARY_STREAM, + ZFD_LOG_STREAM +} zfd_mux_type_t; + +typedef struct zfd_state { + dev_info_t *zfd_devinfo; /* instance info */ + queue_t *zfd_master_rdq; /* GZ read queue */ + queue_t *zfd_slave_rdq; /* in-zone read queue */ + int zfd_state; /* ZFD_STATE_MOPEN, ZFD_STATE_SOPEN */ + int zfd_tty; /* ZFD_MAKETTY - strm mods will push */ + boolean_t zfd_is_flowcon; /* primary stream flow stopped */ + boolean_t zfd_allow_flowcon; /* use flow control */ + zfd_mux_type_t zfd_muxt; /* state type: none, primary, log */ + struct zfd_state *zfd_inst_pri; /* log state's primary ptr */ + struct zfd_state *zfd_inst_log; /* primary state's log ptr */ +} zfd_state_t; + +#define ZFD_STATE_MOPEN 0x01 +#define ZFD_STATE_SOPEN 0x02 + +static void *zfd_soft_state; + +/* + * List of STREAMS modules that are autopushed onto a slave instance when its + * opened, but only if the ZFD_MAKETTY ioctl has first been received by the + * master. 
+ */ +static char *zfd_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +int +_init(void) +{ + int err; + + if ((err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t), + 0)) != 0) { + return (err); + } + + if ((err = mod_install(&modlinkage)) != 0) + ddi_soft_state_fini(zfd_soft_state); + + mutex_init(&zfd_mux_lock, NULL, MUTEX_DEFAULT, NULL); + return (err); +} + + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modlinkage)) != 0) { + return (err); + } + + ddi_soft_state_fini(&zfd_soft_state); + mutex_destroy(&zfd_mux_lock); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static int +zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + char masternm[ZFD_NAME_LEN], slavenm[ZFD_NAME_LEN]; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(zfd_soft_state, instance) != DDI_SUCCESS) + return (DDI_FAILURE); + + (void) snprintf(masternm, sizeof (masternm), "%s%d", ZFD_MASTER_NAME, + instance); + (void) snprintf(slavenm, sizeof (slavenm), "%s%d", ZFD_SLAVE_NAME, + instance); + + /* + * Create the master and slave minor nodes. + */ + if ((ddi_create_minor_node(dip, slavenm, S_IFCHR, + instance << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) || + (ddi_create_minor_node(dip, masternm, S_IFCHR, + instance << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)) { + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + return (DDI_FAILURE); + } + + VERIFY((zfds = ddi_get_soft_state(zfd_soft_state, instance)) != NULL); + zfds->zfd_devinfo = dip; + zfds->zfd_tty = 0; + zfds->zfd_muxt = ZFD_NO_MUX; + zfds->zfd_inst_log = NULL; + return (DDI_SUCCESS); +} + +static int +zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (DDI_FAILURE); + + if ((zfds->zfd_state & ZFD_STATE_MOPEN) || + (zfds->zfd_state & ZFD_STATE_SOPEN)) { + DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip); + return (DDI_FAILURE); + } + + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + + return (DDI_SUCCESS); +} + +/* + * zfd_getinfo() + * getinfo(9e) entrypoint. + */ +/*ARGSUSED*/ +static int +zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + zfd_state_t *zfds; + int instance = ZFD_INSTANCE((dev_t)arg); + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if ((zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) + return (DDI_FAILURE); + *result = zfds->zfd_devinfo; + return (DDI_SUCCESS); + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)instance; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Return the equivalent queue from the other side of the relationship. + * e.g.: given the slave's write queue, return the master's write queue. 
+ */ +static queue_t * +zfd_switch(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq) + return (zfds->zfd_slave_rdq); + else if (OTHERQ(qp) == zfds->zfd_master_rdq && zfds->zfd_slave_rdq + != NULL) + return (OTHERQ(zfds->zfd_slave_rdq)); + else if (qp == zfds->zfd_slave_rdq) + return (zfds->zfd_master_rdq); + else if (OTHERQ(qp) == zfds->zfd_slave_rdq && zfds->zfd_master_rdq + != NULL) + return (OTHERQ(zfds->zfd_master_rdq)); + else + return (NULL); +} + +/* + * For debugging and outputting messages. Returns the name of the side of + * the relationship associated with this queue. + */ +static const char * +zfd_side(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq || + OTHERQ(qp) == zfds->zfd_master_rdq) { + return ("master"); + } + ASSERT(qp == zfds->zfd_slave_rdq || OTHERQ(qp) == zfds->zfd_slave_rdq); + return ("slave"); +} + +/*ARGSUSED*/ +static int +zfd_master_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + + /* + * Enforce exclusivity on the master side; the only consumer should + * be the zoneadmd for the zone. + */ + if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0) + return (EBUSY); + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_master_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_MOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + qprocson(rqp); + + /* + * Following qprocson(), the master side is fully plumbed into the + * STREAM and may send/receive messages. Setting zfds->zfd_master_rdq + * will allow the slave to send messages to us (the master). + * This cannot occur before qprocson() because the master is not + * ready to process them until that point. + */ + zfds->zfd_master_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + if (oflag & FNOCTTY) + sop->so_flags = SO_HIWAT | SO_LOWAT; + else + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/*ARGSUSED*/ +static int +zfd_slave_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + /* + * The slave side can be opened as many times as needed. + */ + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds)); + return (0); + } + + /* A log stream is read-only */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && + (oflag & (FREAD | FWRITE)) != FREAD) + return (EINVAL); + + if (zfds->zfd_tty == 1) { + major_t major; + minor_t minor; + minor_t lastminor; + uint_t anchorindex; + + /* + * Set up sad(7D) so that the necessary STREAMS modules will + * be in place. 
A wrinkle is that 'ptem' must be anchored + * in place (see streamio(7i)) because we always want the + * fd to have terminal semantics. + */ + minor = + ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR; + major = ddi_driver_major(zfds->zfd_devinfo); + lastminor = 0; + anchorindex = 1; + if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor, + &anchorindex, zfd_mods) != 0) { + DBG("zfd_slave_open(): kstr_autopush() failed\n"); + return (EIO); + } + } + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_slave_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_SOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + + qprocson(rqp); + + /* + * Must follow qprocson(), since we aren't ready to process until then. + */ + zfds->zfd_slave_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/* + * open(9e) entrypoint; checks sflag, and rejects anything unordinary. + */ +static int +zfd_open(queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + int instance = ZFD_INSTANCE(*devp); + int ret; + zfd_state_t *zfds; + + if (sflag != 0) + return (EINVAL); + + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (ENXIO); + + switch (ZFD_NODE(*devp)) { + case ZFD_MASTER_MINOR: + ret = zfd_master_open(zfds, rqp, devp, oflag, sflag, credp); + break; + case ZFD_SLAVE_MINOR: + ret = zfd_slave_open(zfds, rqp, devp, oflag, sflag, credp); + /* + * If we just opened the log stream and flow control has + * been enabled, we want to make sure the primary stream can + * start flowing. + */ + if (ret == 0 && zfds->zfd_muxt == ZFD_LOG_STREAM && + zfds->zfd_inst_pri->zfd_allow_flowcon) { + zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE; + if (zfds->zfd_inst_pri->zfd_master_rdq != NULL) + qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq)); + } + break; + default: + ret = ENXIO; + break; + } + + return (ret); +} + +/* + * close(9e) entrypoint. 
+ */ +/*ARGSUSED1*/ +static int +zfd_close(queue_t *rqp, int flag, cred_t *credp) +{ + queue_t *wqp; + mblk_t *bp; + zfd_state_t *zfds; + major_t major; + minor_t minor; + + zfds = (zfd_state_t *)rqp->q_ptr; + + if (rqp == zfds->zfd_master_rdq) { + DBG("Closing master side"); + + zfds->zfd_master_rdq = NULL; + zfds->zfd_state &= ~ZFD_STATE_MOPEN; + + /* + * qenable slave side write queue so that it can flush + * its messages as master's read queue is going away + */ + if (zfds->zfd_slave_rdq != NULL) { + qenable(WR(zfds->zfd_slave_rdq)); + } + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + } else if (rqp == zfds->zfd_slave_rdq) { + + DBG("Closing slave side"); + zfds->zfd_state &= ~ZFD_STATE_SOPEN; + zfds->zfd_slave_rdq = NULL; + + wqp = WR(rqp); + while ((bp = getq(wqp)) != NULL) { + if (zfds->zfd_master_rdq != NULL) + putnext(zfds->zfd_master_rdq, bp); + else if (bp->b_datap->db_type == M_IOCTL) + miocnak(wqp, bp, 0, 0); + else + freemsg(bp); + } + + /* + * Qenable master side write queue so that it can flush its + * messages as slaves's read queue is going away. + */ + if (zfds->zfd_master_rdq != NULL) + qenable(WR(zfds->zfd_master_rdq)); + + /* + * Qenable primary stream if necessary. + */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && + zfds->zfd_inst_pri->zfd_allow_flowcon) { + zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE; + if (zfds->zfd_inst_pri->zfd_master_rdq != NULL) + qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq)); + } + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + if (zfds->zfd_tty == 1) { + /* + * Clear the sad configuration so that reopening + * doesn't fail to set up sad configuration. + */ + major = ddi_driver_major(zfds->zfd_devinfo); + minor = ddi_get_instance(zfds->zfd_devinfo) << 1 | + ZFD_SLAVE_MINOR; + (void) kstr_autopush(CLR_AUTOPUSH, &major, &minor, + NULL, NULL, NULL); + } + } + + return (0); +} + +static void +handle_mflush(queue_t *qp, mblk_t *mp) +{ + mblk_t *nmp; + DBG1("M_FLUSH on %s side", zfd_side(qp)); + + if (*mp->b_rptr & FLUSHW) { + DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp)); + flushq(qp, FLUSHDATA); + *mp->b_rptr &= ~FLUSHW; + if ((*mp->b_rptr & FLUSHR) == 0) { + /* + * FLUSHW only. Change to FLUSHR and putnext other side, + * then we are done. + */ + *mp->b_rptr |= FLUSHR; + if (zfd_switch(RD(qp)) != NULL) { + putnext(zfd_switch(RD(qp)), mp); + return; + } + } else if ((zfd_switch(RD(qp)) != NULL) && + (nmp = copyb(mp)) != NULL) { + /* + * It is a FLUSHRW; we copy the mblk and send + * it to the other side, since we still need to use + * the mblk in FLUSHR processing, below. + */ + putnext(zfd_switch(RD(qp)), nmp); + } + } + + if (*mp->b_rptr & FLUSHR) { + DBG("qreply(qp) turning FLUSHR around\n"); + qreply(qp, mp); + return; + } + freemsg(mp); +} + +/* + * Evaluate the various conditionals to determine if we're teeing into a log + * stream and if the primary stream should be flow controlled. This function + * can set the zfd_is_flowcon flag as a side effect. + * + * When teeing with flow control, we always queue the teed msg here and if + * the queue is getting full, we set zfd_is_flowcon. The primary stream will + * always queue when zfd_is_flowcon and will also not be served when + * zfd_is_flowcon is set. This causes backpressure on the primary stream + * until the teed queue can drain. 
+ */ +static void +zfd_tee_handler(zfd_state_t *zfds, unsigned char type, mblk_t *mp) +{ + queue_t *log_qp; + zfd_state_t *log_zfds; + mblk_t *lmp; + + if (zfds->zfd_muxt != ZFD_PRIMARY_STREAM) + return; + + if (type != M_DATA) + return; + + log_zfds = zfds->zfd_inst_log; + if (log_zfds == NULL) + return; + + ASSERT(log_zfds->zfd_muxt == ZFD_LOG_STREAM); + + if ((log_zfds->zfd_state & ZFD_STATE_SOPEN) == 0) { + if (zfds->zfd_allow_flowcon) + zfds->zfd_is_flowcon = B_TRUE; + return; + } + + /* The zfd_slave_rdq is null until the log dev is opened in the zone */ + log_qp = RD(log_zfds->zfd_slave_rdq); + DTRACE_PROBE2(zfd__tee__check, void *, log_qp, void *, zfds); + + if (!zfds->zfd_allow_flowcon) { + /* + * We're not supposed to tee with flow control and the tee is + * full so we skip teeing into the log stream. + */ + if ((log_qp->q_flag & QFULL) != 0) + return; + } + + /* + * Tee the message into the log stream. + */ + lmp = dupmsg(mp); + if (lmp == NULL) { + if (zfds->zfd_allow_flowcon) + zfds->zfd_is_flowcon = B_TRUE; + return; + } + + if (log_qp->q_first == NULL && bcanputnext(log_qp, lmp->b_band)) { + putnext(log_qp, lmp); + } else { + if (putq(log_qp, lmp) == 0) { + /* The logger queue is full, free the msg. */ + freemsg(lmp); + } + /* + * If we're supposed to tee with flow control and the tee is + * over the high water mark then we want the primary stream to + * stop flowing. We'll stop queueing the primary stream after + * the log stream has drained. + */ + if (zfds->zfd_allow_flowcon && + log_qp->q_count > log_qp->q_hiwat) { + zfds->zfd_is_flowcon = B_TRUE; + } + } +} + +/* + * wput(9E) is symmetric for master and slave sides, so this handles both + * without splitting the codepath. (The only exception to this is the + * processing of zfd ioctls, which is restricted to the master side.) + * + * zfd_wput() looks at the other side; if there is no process holding that + * side open, it frees the message. This prevents processes from hanging + * if no one is holding open the fd. Otherwise, it putnext's high + * priority messages, putnext's normal messages if possible, and otherwise + * enqueues the messages; in the case that something is enqueued, wsrv(9E) + * will take care of eventually shuttling I/O to the other side. + * + * When configured as a multiplexer, then anything written to the stream + * from inside the zone is also teed off to the corresponding log stream + * for consumption within the zone (i.e. the log stream can be read, but never + * written to, by an application inside the zone). + */ +static void +zfd_wput(queue_t *qp, mblk_t *mp) +{ + unsigned char type = mp->b_datap->db_type; + zfd_state_t *zfds; + struct iocblk *iocbp; + boolean_t must_queue = B_FALSE; + + ASSERT(qp->q_ptr); + + DBG1("entering zfd_wput, %s side", zfd_side(qp)); + + /* + * Process zfd ioctl messages if qp is the master side's write queue. + */ + zfds = (zfd_state_t *)qp->q_ptr; + + if (type == M_IOCTL) { + iocbp = (struct iocblk *)(void *)mp->b_rptr; + + switch (iocbp->ioc_cmd) { + case ZFD_MAKETTY: + zfds->zfd_tty = 1; + miocack(qp, mp, 0, 0); + return; + case ZFD_EOF: + if (zfds->zfd_slave_rdq != NULL) + (void) putnextctl(zfds->zfd_slave_rdq, + M_HANGUP); + miocack(qp, mp, 0, 0); + return; + case ZFD_HAS_SLAVE: + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + miocack(qp, mp, 0, 0); + } else { + miocack(qp, mp, 0, ENOTTY); + } + return; + case ZFD_MUX: { + /* + * Setup the multiplexer configuration for the two + * streams. 
+ * + * We expect to be called on the stream that will + * become the log stream and be passed one data block + * with the minor number of the slave side of the + * primary stream. + */ + int to; + int instance; + zfd_state_t *prim_zfds; + + if (iocbp->ioc_count != TRANSPARENT || + mp->b_cont == NULL) { + miocack(qp, mp, 0, EINVAL); + return; + } + + /* Get the primary slave minor device number */ + to = *(int *)mp->b_cont->b_rptr; + instance = ZFD_INSTANCE(to); + + if ((prim_zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) { + miocack(qp, mp, 0, EINVAL); + return; + } + + /* Disallow changing primary/log once set. */ + mutex_enter(&zfd_mux_lock); + if (zfds->zfd_muxt != ZFD_NO_MUX || + prim_zfds->zfd_muxt != ZFD_NO_MUX) { + mutex_exit(&zfd_mux_lock); + miocack(qp, mp, 0, EINVAL); + return; + } + + zfds->zfd_muxt = ZFD_LOG_STREAM; + zfds->zfd_inst_pri = prim_zfds; + prim_zfds->zfd_muxt = ZFD_PRIMARY_STREAM; + prim_zfds->zfd_inst_log = zfds; + mutex_exit(&zfd_mux_lock); + DTRACE_PROBE2(zfd__mux__link, void *, prim_zfds, + void *, zfds); + + miocack(qp, mp, 0, 0); + return; + } + case ZFD_MUX_FLOWCON: { + /* + * We expect this ioctl to be issued against the + * log stream. We don't use the primary stream since + * there can be other streams modules pushed onto that + * stream which would interfere with the ioctl. + */ + int val; + zfd_state_t *prim_zfds; + + if (iocbp->ioc_count != TRANSPARENT || + mp->b_cont == NULL) { + miocack(qp, mp, 0, EINVAL); + return; + } + + if (zfds->zfd_muxt != ZFD_LOG_STREAM) { + miocack(qp, mp, 0, EINVAL); + return; + } + prim_zfds = zfds->zfd_inst_pri; + + /* Get the flow control setting */ + val = *(int *)mp->b_cont->b_rptr; + if (val != 0 && val != 1) { + miocack(qp, mp, 0, EINVAL); + return; + } + + prim_zfds->zfd_allow_flowcon = (boolean_t)val; + if (!prim_zfds->zfd_allow_flowcon) + prim_zfds->zfd_is_flowcon = B_FALSE; + + DTRACE_PROBE1(zfd__mux__flowcon, void *, prim_zfds); + miocack(qp, mp, 0, 0); + return; + } + default: + break; + } + } + + /* if on the write side, may need to tee */ + if (zfds->zfd_slave_rdq != NULL && qp == WR(zfds->zfd_slave_rdq)) { + /* tee output to any attached log stream */ + zfd_tee_handler(zfds, type, mp); + + /* high-priority msgs are not subject to flow control */ + if (zfds->zfd_is_flowcon && type == M_DATA) + must_queue = B_TRUE; + } + + if (zfd_switch(RD(qp)) == NULL) { + DBG1("wput to %s side (no one listening)", zfd_side(qp)); + switch (type) { + case M_FLUSH: + handle_mflush(qp, mp); + break; + case M_IOCTL: + miocnak(qp, mp, 0, 0); + break; + default: + freemsg(mp); + break; + } + return; + } + + if (type >= QPCTL) { + DBG1("(hipri) wput, %s side", zfd_side(qp)); + switch (type) { + case M_READ: /* supposedly from ldterm? */ + DBG("zfd_wput: tossing M_READ\n"); + freemsg(mp); + break; + case M_FLUSH: + handle_mflush(qp, mp); + break; + default: + /* + * Put this to the other side. + */ + ASSERT(zfd_switch(RD(qp)) != NULL); + putnext(zfd_switch(RD(qp)), mp); + break; + } + DBG1("done (hipri) wput, %s side", zfd_side(qp)); + return; + } + + /* + * If the primary stream has been stopped for flow control then + * enqueue the msg, otherwise only putnext if there isn't already + * something in the queue. If we don't do this then things would wind + * up out of order. + */ + if (!must_queue && qp->q_first == NULL && + bcanputnext(RD(zfd_switch(qp)), mp->b_band)) { + putnext(RD(zfd_switch(qp)), mp); + } else { + /* + * zfd_wsrv expects msgs queued on the primary queue. 
Those + * will be handled by zfd_wsrv after zfd_rsrv performs the + * qenable on the proper queue. + */ + (void) putq(qp, mp); + } + + DBG1("done wput, %s side", zfd_side(qp)); +} + +/* + * Read server + * + * For primary stream: + * Under normal execution rsrv(9E) is symmetric for master and slave, so + * zfd_rsrv() can handle both without splitting up the codepath. We do this by + * enabling the write side of the partner. This triggers the partner to send + * messages queued on its write side to this queue's read side. + * + * For log stream: + * Internally we've queued up the msgs that we've teed off to the log stream + * so when we're invoked we need to pass these along. + */ +static void +zfd_rsrv(queue_t *qp) +{ + zfd_state_t *zfds; + zfds = (zfd_state_t *)qp->q_ptr; + + /* + * log stream server + */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && zfds->zfd_slave_rdq != NULL) { + queue_t *log_qp; + mblk_t *mp; + + log_qp = RD(zfds->zfd_slave_rdq); + + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + zfd_state_t *pzfds = zfds->zfd_inst_pri; + + while ((mp = getq(qp)) != NULL) { + if (bcanputnext(log_qp, mp->b_band)) { + putnext(log_qp, mp); + } else { + (void) putbq(log_qp, mp); + break; + } + } + + if (log_qp->q_count < log_qp->q_lowat) { + DTRACE_PROBE(zfd__flow__on); + pzfds->zfd_is_flowcon = B_FALSE; + if (pzfds->zfd_master_rdq != NULL) + qenable(RD(pzfds->zfd_master_rdq)); + } + } else { + /* No longer open, drain the queue */ + while ((mp = getq(qp)) != NULL) { + freemsg(mp); + } + flushq(qp, FLUSHALL); + } + return; + } + + /* + * Care must be taken here, as either of the master or slave side + * qptr could be NULL. + */ + ASSERT(qp == zfds->zfd_master_rdq || qp == zfds->zfd_slave_rdq); + if (zfd_switch(qp) == NULL) { + DBG("zfd_rsrv: other side isn't listening\n"); + return; + } + qenable(WR(zfd_switch(qp))); +} + +/* + * Write server + * + * This routine is symmetric for master and slave, so it handles both without + * splitting up the codepath. + * + * If there are messages on this queue that can be sent to the other, send + * them via putnext(). Else, if queued messages cannot be sent, leave them + * on this queue. + */ +static void +zfd_wsrv(queue_t *qp) +{ + queue_t *swq; + mblk_t *mp; + zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr; + + ASSERT(zfds != NULL); + + /* + * Partner has no read queue, so take the data, and throw it away. + */ + if (zfd_switch(RD(qp)) == NULL) { + DBG("zfd_wsrv: other side isn't listening"); + while ((mp = getq(qp)) != NULL) { + if (mp->b_datap->db_type == M_IOCTL) + miocnak(qp, mp, 0, 0); + else + freemsg(mp); + } + flushq(qp, FLUSHALL); + return; + } + + swq = RD(zfd_switch(qp)); + + /* + * while there are messages on this write queue... + */ + while (!zfds->zfd_is_flowcon && (mp = getq(qp)) != NULL) { + /* + * Due to the way zfd_wput is implemented, we should never + * see a high priority control message here. + */ + ASSERT(mp->b_datap->db_type < QPCTL); + + if (bcanputnext(swq, mp->b_band)) { + putnext(swq, mp); + } else { + (void) putbq(qp, mp); + break; + } + } +} |
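
The LSO change in vnic_dev_create() above boils down to one rule: the VNIC only advertises MAC_CAPAB_LSO when the lower MAC offers hardware checksum offload, because LSO depends on it. The sketch below is a condensed restatement of that gating under invented names (sketch_lso_t, sketch_probe_lso); it is not the MAC framework's types or API.

typedef struct sketch_lso {
	unsigned int	lso_flags;	/* zero means "do not advertise LSO" */
	unsigned int	lso_max;	/* largest segment the lower MAC accepts */
} sketch_lso_t;

static void
sketch_probe_lso(unsigned int hcksum_txflags, int lower_has_lso,
    const sketch_lso_t *lower_lso, sketch_lso_t *out)
{
	if (hcksum_txflags != 0 && lower_has_lso) {
		*out = *lower_lso;	/* inherit the lower MAC's limits */
	} else {
		out->lso_flags = 0;	/* no hw checksum (or no LSO): hide the capability */
		out->lso_max = 0;
	}
}

With lso_flags left at zero, the capability callback above simply returns B_FALSE for MAC_CAPAB_LSO, so clients never see the capability on such a VNIC.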
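
The new private "_linkstate" property is only honored on anchor VNICs (vn_link_id equal to DATALINK_INVALID_LINKID) and only accepts the numeric link states for up and down. The user-level sketch below mirrors the validation done in vnic_m_setprop(): the value must be NUL-terminated within the supplied size and must decode to one of the two allowed states. The function name and the SKETCH_LINK_* constants are assumptions chosen to match those checks, not code from the driver.

#include <stdlib.h>
#include <errno.h>

enum { SKETCH_LINK_DOWN = 0, SKETCH_LINK_UP = 1 };	/* assumed to mirror link_state_t */

static int
sketch_validate_linkstate(const char *pr_val, size_t pr_valsize, long *out)
{
	size_t i;
	long val;

	/* Reject a value that is not NUL-terminated within the buffer. */
	for (i = 0; i < pr_valsize; i++) {
		if (pr_val[i] == '\0')
			break;
	}
	if (i == pr_valsize)
		return (EINVAL);

	val = strtol(pr_val, NULL, 0);
	if (val != SKETCH_LINK_UP && val != SKETCH_LINK_DOWN)
		return (EINVAL);

	*out = val;
	return (0);
}

A value that passes this check is stored in vn_ls and pushed to clients via mac_link_update(), which is how an anchor VNIC's link can now be forced up or down administratively.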
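
The zlog-mode table in the zfd block comment above ("t-", "--", "tn", "-n") maps onto a small amount of arithmetic: the 't' position decides whether stdio is muxed onto one tty stream or split across three devices, and the 'n' position adds the in-zone log devices. The sketch below restates that table; zfd_mode_t and sketch_parse_zlog_mode() are invented names, not zoneadmd code.

typedef struct zfd_mode {
	int	zm_tty;		/* primary stream is a tty */
	int	zm_stdio_devs;	/* zfd devices used for stdin/out/err */
	int	zm_extra_devs;	/* additional in-zone log devices */
} zfd_mode_t;

static void
sketch_parse_zlog_mode(const char *mode, zfd_mode_t *zm)
{
	zm->zm_tty = (mode[0] == 't');
	zm->zm_stdio_devs = zm->zm_tty ? 1 : 3;	/* tty muxes all three onto zfd0 */
	zm->zm_extra_devs = 0;
	if (mode[1] == 'n')			/* in-zone logging requested */
		zm->zm_extra_devs = zm->zm_tty ? 1 : 2;
}

For example, "-n" yields three stdio devices plus two log devices, matching the '-n' diagram in the comment above.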
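
The ZFD_INSTANCE() and ZFD_NODE() macros above pack the instance and the master/slave selector into the minor number: the low bit picks the side, the remaining bits carry the instance. A worked example in plain arithmetic (no DDI calls), under invented names:

static unsigned int
sketch_make_minor(unsigned int instance, unsigned int is_slave)
{
	return ((instance << 1) | (is_slave & 0x01));
}

static unsigned int
sketch_minor_instance(unsigned int minor)
{
	return (minor >> 1);		/* what ZFD_INSTANCE() computes */
}

static unsigned int
sketch_minor_node(unsigned int minor)
{
	return (minor & 0x01);		/* what ZFD_NODE() computes: 0 = master, 1 = slave */
}

So instance 3's slave node is minor 7, and minor 7 decodes back to instance 3, slave side, which is exactly how zfd_attach() builds its two minor nodes.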
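
zfd_tee_handler() above expresses the tee policy in terms of STREAMS queues and the QFULL flag. The sketch below models the same policy on plain counters so the outcomes are easier to see: without flow control a full log queue just means the message is not teed, while with flow control the message is always teed and the primary stream is throttled once the log queue passes its high water mark. Every name here is invented, and the counter comparison stands in for the real queue-full test.

typedef struct tee_decision {
	int	td_tee;		/* copy the message onto the log queue */
	int	td_throttle;	/* stop serving the primary stream */
} tee_decision_t;

static tee_decision_t
sketch_tee_policy(int allow_flowcon, int log_open,
    unsigned int log_qcount, unsigned int log_hiwat)
{
	tee_decision_t td = { 0, 0 };

	if (!log_open) {
		/* Nothing to tee into; with flow control on, hold the primary. */
		td.td_throttle = allow_flowcon;
		return (td);
	}

	if (!allow_flowcon) {
		/* Best effort: skip the tee when the log queue is full. */
		td.td_tee = (log_qcount < log_hiwat);
		return (td);
	}

	/* Flow controlled: always tee, throttle past the high water mark. */
	td.td_tee = 1;
	td.td_throttle = (log_qcount > log_hiwat);
	return (td);
}

Throttling here is what makes the "no lost log messages" guarantee possible: the application writing the primary stream blocks until zfd_rsrv() drains the log queue below its low water mark and re-enables the primary.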
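
The ZFD_MUX ioctl pairs a log stream with a primary stream exactly once, under zfd_mux_lock, and refuses to re-link either side. The sketch below captures that pairing rule; it uses pthreads and invented type names purely so it is self-contained, and is not the driver's code.

#include <pthread.h>
#include <errno.h>

typedef enum { MUX_NONE, MUX_PRIMARY, MUX_LOG } mux_type_t;

typedef struct sketch_zfd {
	mux_type_t		sz_muxt;
	struct sketch_zfd	*sz_peer;	/* primary <-> log cross pointers */
} sketch_zfd_t;

static pthread_mutex_t sketch_mux_lock = PTHREAD_MUTEX_INITIALIZER;

static int
sketch_mux_link(sketch_zfd_t *logside, sketch_zfd_t *primary)
{
	(void) pthread_mutex_lock(&sketch_mux_lock);
	if (logside->sz_muxt != MUX_NONE || primary->sz_muxt != MUX_NONE) {
		(void) pthread_mutex_unlock(&sketch_mux_lock);
		return (EINVAL);	/* changing an existing pairing is not allowed */
	}
	logside->sz_muxt = MUX_LOG;
	logside->sz_peer = primary;
	primary->sz_muxt = MUX_PRIMARY;
	primary->sz_peer = logside;
	(void) pthread_mutex_unlock(&sketch_mux_lock);
	return (0);
}

Making the link immutable keeps the tee path lock-free: once zfd_inst_log and zfd_inst_pri are set they never change, so zfd_tee_handler() can follow them without taking zfd_mux_lock.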
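
Finally, zfd_wput() only bypasses its queue when nothing is already queued and the receiving side can accept the message; anything else is enqueued so that messages can never pass one another, and a flow-controlled primary stream is always enqueued. A minimal restatement of that rule, with invented names:

typedef enum { FWD_PUTNEXT, FWD_ENQUEUE } fwd_action_t;

static fwd_action_t
sketch_forward_policy(int flow_controlled, int queue_empty, int peer_can_put)
{
	if (!flow_controlled && queue_empty && peer_can_put)
		return (FWD_PUTNEXT);	/* fast path: nothing queued, receiver has room */
	return (FWD_ENQUEUE);		/* zfd_wsrv()/zfd_rsrv() drain it later, in order */
}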