/* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright 2019 Joyent, Inc. */ /* * vnd - virtual (machine) networking datapath * * vnd's purpose is to provide a highly performant data path for Layer 2 network * traffic and exist side by side with an active IP netstack, each servicing * different datalinks. vnd provides many of the same capabilities as the * current TCP/IP stack does and some specific to layer two. Specifically: * * o Use of the DLD fastpath * o Packet capture hooks * o Ability to use hardware capabilities * o Useful interfaces for handling multiple frames * * The following image shows where vnd fits into today's networking stack: * * +---------+----------+----------+ * | libdlpi | libvnd | libsocket| * +---------+----------+----------+ * | · · VFS | * | VFS · VFS +----------+ * | · | sockfs | * +---------+----------+----------+ * | | VND | IP | * | +----------+----------+ * | DLD/DLS | * +-------------------------------+ * | MAC | * +-------------------------------+ * | GLDv3 | * +-------------------------------+ * * ----------------------------------------- * A Tale of Two Devices - DDI Device Basics * ----------------------------------------- * * vnd presents itself to userland as a character device; however, it also is a * STREAMS device so that it can interface with dld and the rest of the * networking stack. Users never interface with the STREAMS devices directly; * they are purely an implementation detail of vnd. Opening the STREAMS device * requires kcred and as such userland cannot interact with it or push it onto * the stream head. * * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every * clone gets its own minor number; however, minor nodes are not created in the * devices tree for these instances. In this state a user may do two different * things. They may issue ioctls that affect global state or they may issue * ioctls that try to attach it to a given datalink. Once a minor device has * been attached to a datalink, all operations on it are scoped to that context; * therefore, subsequent global operations are not permitted. * * A given device can be linked into the /devices and /dev name space via a link * ioctl. That ioctl causes a minor node to be created in /devices and then it * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar * to, but simpler than, IP's persistence mechanism. * * --------------------- * Binding to a datalink * --------------------- * * Datalinks are backed by the dld (datalink device) and dls (datalink services) * drivers. These drivers provide a STREAMS device for datalinks on the system * which are exposed through /dev/net. Userland generally manipulates datalinks * through libdlpi. When an IP interface is being plumbed up, what actually * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may * then negotiate with dld and dls to obtain access to various capabilities * and fast paths via a series of STREAMS messages. * * In vnd, we do the same thing, but we leave our STREAMS module as an * implementation detail of the system.
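 *
 * As an illustration, the conventional sequence just described, as IP
 * performs it from userland, might look roughly like the following sketch
 * (error handling elided; "net0" is a hypothetical link name):
 *
 *	dlpi_handle_t dh;
 *
 *	if (dlpi_open("net0", &dh, 0) != DLPI_SUCCESS)
 *		return (-1);
 *	if (ioctl(dlpi_fd(dh), I_PUSH, "ip") == -1)
 *		return (-1);
 *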
We don't want users to be able to * arbitrarily push the vnd STREAMS module onto any stream, so we explicitly require * kcred to manipulate it. Thus, when a user issues a request to attach a * datalink to a minor instance of the character device, that vnd minor instance * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink. * vnd does that open using the passed in credentials from the ioctl, not kcred. * This ensures that users who don't have permission to open the device * cannot. Once that's been opened, we push on the vnd STREAMS module. * * Once the vnd STREAMS instance has been created for this device, ie. the * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices. * This association begins the STREAMS device's initialization. We start up an * asynchronous state machine that takes care of all the different aspects of * plumbing up the device with dld and dls and enabling the MAC fast path. We * need to guarantee to consumers of the character device that by the time their * ioctl returns, the data path has been fully initialized. * * The state progression is fairly linear. There are two general steady states. * The first is VNS_S_ONLINE, which means that everything is jacked up and good * to go. The alternative is VNS_S_ZOMBIE, which means that the STREAMS device * encountered an error or we have finished tearing it down and the character * device can clean it up. The following is our state progression and the * meaning of each state: * * | * | * V * +---------------+ * | VNS_S_INITIAL | This is our initial state. Every * +---------------+ vnd STREAMS device starts here. * | While in this state, only dlpi * | M_PROTO and M_IOCTL messages can be * | sent or received. All STREAMS based * | data messages are dropped. * | We transition out of this state by * | sending a DL_INFO_REQ to obtain * | information about the underlying * | link. * v * +-----------------+ * +--<-| VNS_S_INFO_SENT | In this state, we verify and * | +-----------------+ record information about the * | | underlying device. If the device is * | | not suitable, eg. not of type * v | DL_ETHER, then we immediately * | | become a ZOMBIE. To leave this * | | state we request exclusive active * | | access to the device via * v | DL_EXCLUSIVE_REQ. * | v * | +----------------------+ * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether * | +----------------------+ or not we were able to obtain * | | | exclusive access to the device. If * | | | we were not able to, then we leave, * v | | as that means that something like * | | | IP is already plumbed up on top of * | | | the datalink. We leave this state * | | | by progressing through to the * | | | appropriate DLPI primitive, either * v | | DL_ATTACH_REQ or DL_BIND_REQ * | | | depending on the style of the * | | | datalink. * | | v * | | +-------------------+ * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were * | | +-------------------+ able to perform a standard DLPI * | | | attach and if so, go ahead and * v | | send a DL_BIND_REQ. * | v v * | +-------------------+ * +--<-| VNS_S_BIND_SENT | In this state we see the result of * | +-------------------+ our attempt to bind to PPA 0 of the * v | underlying device. Because we're * | | trying to be a layer two datapath, * | | the specific attachment point isn't * | | too important as we're going to * v | have to enable promiscuous mode.
We * | | transition out of this by sending * | | the first of our four promiscuous mode * | | requests. * v v * | +------------------------+ * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we * | +------------------------+ were able to enable promiscuous * | | mode at the physical level. We * | | transition out of this by enabling * | | multicast and broadcast promiscuous * v | mode. * | v * | +--------------------------+ * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we * | +--------------------------+ have enabled DL_PROMISC_MULTI and * v | move on to the next promiscuous * | | mode request. * | v * | +----------------------------+ * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we * | +----------------------------+ enabled RX_ONLY promiscuous mode. * | | We specifically do this as we don't * v | want to receive our own traffic * | | that we'll send out. We leave this * | | state by enabling the final flag * | | DL_PROMISC_FIXUPS. * | v * | +--------------------------+ * +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we * | +--------------------------+ enabled FIXUP promiscuous mode. * | | We specifically do this as we need * v | to ensure that traffic which is * | | received by being looped back to us * | | correctly has checksums fixed. We * | | leave this state by requesting the * | | dld/dls capabilities that we can * v | process. * | v * | +--------------------+ * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of * | +--------------------+ capabilities that dld advertised * | | and enable the ones that we currently * v | support for use. See the section * | | later on regarding capabilities * | | for more information. We leave this * | | state by sending an enable request. * v v * | +--------------------+ * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability * | +--------------------+ initialization. Once finished, we * | | transition to the next state. If * v | the dld fast path is not available, * | | we become a zombie. * | v * | +--------------+ * | | VNS_S_ONLINE | This is a vnd STREAMS device's * | +--------------+ steady state. It will normally * | | reside in this state while it is in * | | active use. It will only transition * v | to the next state when the STREAMS * | | device is closed by the character * | | device. In this state, all data * | | flows over the dld fast path. * | v * | +---------------------+ * +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of * | +---------------------+ disabling capabilities and * | | flushing all data. At this point * | | any additional data that we receive * | | will be dropped. We leave this * v | state by trying to remove multicast * | | promiscuity. * | | * | v * | +---------------------------------+ * +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have * | +---------------------------------+ successfully removed multicast * | | promiscuous mode. If we have * | | failed, we still carry on but only * | | warn. We leave this state by trying * | | to disable SAP level promiscuous * | | mode. * | v * | +---------------------------+ * +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have * | +---------------------------+ successfully removed SAP level * | | promiscuous mode. If we have * | | failed, we still carry on but only * | | warn. Note that we don't worry * | | about either of * | | DL_PROMISC_FIXUPS or * | | DL_PROMISC_RX_ONLY.
If these are * | | the only two entries left, then we * | | shouldn't have anything that MAC is * | | doing for us at this point, * | | therefore it's safe for us to * | | proceed to unbind, which is how we * | | leave this state via a * | v DL_UNBIND_REQ. * | +-------------------+ * +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind * | +-------------------+ request went. Regardless of its * | | success, we always transition to * | | a zombie state. * | v * | +--------------+ * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS * +--------------+ device is waiting to finish being * reaped. Because we have no more * ways to receive data it should be * safe to destroy all remaining data * structures. * * If the stream association fails for any reason, the state machine reaches * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the * STREAMS ioctl to the character device. That will fail the user ioctl and * propagate the vnd_errno_t back to userland. If, on the other hand, the * association succeeds, then the vnd STREAMS device will be fully plumbed up * and ready to transmit and receive message blocks. Consumers will be able to * start using the other cbops(9E) entry points once the attach has fully * finished, which will occur before the original user attach ioctl to the * character device returns. * * It's quite important that we end up sending the full series of STREAMS * messages when tearing down. While it's tempting to say that we should just * rely on the STREAMS device being closed to properly ensure that we have no * more additional data, that's not sufficient due to our use of direct * callbacks. DLS does not ensure that, by the time we change the direct * callback (vnd_mac_input), all callers to it will have been quiesced. * However, it does guarantee that if we disable promiscuous mode ourselves and * we turn off the main data path via DL_UNBIND_REQ that it will work. * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do * it as part of tearing down the STREAMS device. This ensures that we'll * quiesce all data before we destroy our data structures and thus we should * eliminate the race in changing the data function. * * -------------------- * General Architecture * -------------------- * * There are several different devices and structures in the vnd driver. There * is a per-netstack component, pieces related to the character device that * consumers see, the internal STREAMS device state, and the data queues * themselves. The following ASCII art picture describes their relationships and * some of the major pieces of data that contain them. These are not exhaustive, * e.g. synchronization primitives are left out. * * +----------------+ +-----------------+ * | global | | global | * | device list | | netstack list | * | vnd_dev_list | | vnd_nsd_list | * +----------------+ +-----------------+ * | | * | v * | +-------------------+ +-------------------+ * | | per-netstack data | ---> | per-netstack data | --> ... * | | vnd_pnsd_t | | vnd_pnsd_t | * | | | +-------------------+ * | | | * | | netstackid_t ---+----> Netstack ID * | | vnd_pnsd_flags_t -+----> Status flags * | | zoneid_t ---+----> Zone ID for this netstack * | | hook_family_t ---+----> VND IPv4 Hooks * | | hook_family_t ---+----> VND IPv6 Hooks * | | list_t ----+ | * | +------------+------+ * | | * | v * | +------------------+ +------------------+ * | | character device | ---> | character device | -> ...
* +---------->| vnd_dev_t | | vnd_dev_t | * | | +------------------+ * | | * | minor_t ---+--> device minor number * | ldi_handle_t ---+--> handle to /dev/net/%datalink * | vnd_dev_flags_t -+--> device flags, non-blocking, etc. * | char[] ---+--> name if linked * | vnd_str_t * -+ | * +--------------+---+ * | * v * +-------------------------+ * | STREAMS device | * | vnd_str_t | * | | * | vnd_str_state_t ---+---> State machine state * | gsqueue_t * ---+---> mblk_t Serialization queue * | vnd_str_stat_t ---+---> per-device kstats * | vnd_str_capab_t ---+----------------------------+ * | vnd_data_queue_t ---+ | | * | vnd_data_queue_t -+ | | v * +-------------------+-+---+ +---------------------+ * | | | Stream capabilities | * | | | vnd_str_capab_t | * | | | | * | | supported caps <--+-- vnd_capab_flags_t | * | | dld cap handle <--+-- void * | * | | direct tx func <--+-- vnd_dld_tx_t | * | | +---------------------+ * | | * +----------------+ +-------------+ * | | * v v * +-------------------+ +-------------------+ * | Read data queue | | Write data queue | * | vnd_data_queue_t | | vnd_data_queue_t | * | | | | * | size_t ----+--> Current size | size_t ----+--> Current size * | size_t ----+--> Max size | size_t ----+--> Max size * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail * +-------------------+ +-------------------+ * * * Globally, we maintain two lists. One list contains all of the character * device soft states. The other maintains a list of all our netstack soft * states. Each netstack maintains a list of active devices that have been * associated with a datalink in its netstack. * * Recall that a given minor instance of the character device exists in one of * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node, * or it can be associated with a given datalink. When minor instances are in * the former state, they do not exist in a given vnd_pnsd_t's list of devices. * As part of attaching to a datalink, the given vnd_dev_t will be inserted into * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a * vnd_str_t, to be created and associated with a vnd_dev_t. * * The character device, and its vnd_dev_t, is the interface to the rest of the * system. The vnd_dev_t keeps track of various aspects like whether various * operations, such as read, write and the frameio ioctls, are considered * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for * keeping track of things like the name of the device, if any, in /dev. The * vnd_str_t, on the other hand, manages aspects like buffer sizes and the actual * data queues. However, ioctls that manipulate these properties all go through * the vnd_dev_t to its associated vnd_str_t. * * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One * for frames to transmit (write queue) and one for frames received (read * queue). These data queues have a maximum size and attempting to add data * beyond that maximum size will result in data being dropped. The sizes are * configurable via the VND_IOC_SETTXBUF and VND_IOC_SETRXBUF ioctls. Data either sits * in those buffers or has a reservation in those buffers while they are in vnd * and waiting to be consumed by the user or by mac. * * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the * available, negotiated, and currently active features.
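 *
 * To make the containment concrete, a hypothetical consumer of these
 * structures might walk from a minor number down to the data queues along
 * the following lines (a sketch only; locking and error handling elided):
 *
 *	vnd_dev_t *vdp = vnd_dev_lookup(minor);
 *	vnd_str_t *vsp = vdp->vdd_str;
 *	vnd_data_queue_t *rqp = &vsp->vns_dq_read;
 *	vnd_data_queue_t *wqp = &vsp->vns_dq_write;
 *	vnd_dev_rele(vdp);
 *
 * vnd_dev_lookup() returns with a hold on the vnd_dev_t, which the final
 * vnd_dev_rele() drops; vdd_str is only valid while the device remains
 * attached.
 *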
* * ---------------------- * Data Path and gsqueues * ---------------------- * * There's a lot of plumbing in vnd to get to the point where we can send data, * but vnd's bread and butter is the data path, so it's worth diving into it in * more detail. Data enters and exits the system from two ends. * * The first end is the vnd consumer. This comes in the form of read and write * system calls as well as the frame I/O ioctls. The read and write system calls * operate on a single frame at a time. Think of a frame as a single message * that has come in off the wire, which may itself comprise multiple mblk_t's * linked together in the kernel. readv(2) and writev(2) have the same * limitations as read(2) and write(2). We enforce this as the system is * required to fill up every uio(9S) buffer before moving onto the next one. * This means that if you have an MTU sized buffer and two frames come in which * are less than half of the MTU, they must fill up the given iovec. Even if we * didn't want to do this, we have no way of informing the supplier of the * iovecs that they were only partially filled or where one frame ends and * another begins. That's life; as such, we have frame I/O, which solves this * problem. It allows for multiple frames to be consumed as well as for frames * to be broken down into multiple vector components. * * The second end is the mac direct calls. As part of negotiating capabilities * via dld, we give mac a function of ours to call when packets are received * [vnd_mac_input()] and a callback to indicate that flow has been restored * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can * transmit data with. As part of the contract with mac, mac is allowed to flow * control us by returning a cookie to the transmit function. When that happens, * all outbound traffic is halted until our callback function is called and we * can schedule drains. * * It's worth looking at these in further detail. We'll start with the rx path. * * * | * * . . . packets from gld * | * v * +-------------+ * | mac | * +-------------+ * | * v * +-------------+ * | dld | * +-------------+ * | * * . . . dld direct callback * | * v * +---------------+ * | vnd_mac_input | * +---------------+ * | * v * +---------+ +-------------+ * | dropped |<--*---------| vnd_hooks | * | by | . +-------------+ * | hooks | . drop probe | * +---------+ kstat bump * . . . Do we have free * | buffer space? * | * no . | . yes * . + . * +---*--+------*-------+ * | | * * . . drop probe * . . recv probe * | kstat bump | kstat bump * v | * +---------+ * . . fire pollin * | freemsg | v * +---------+ +-----------------------+ * | vnd_str_t`vns_dq_read | * +-----------------------+ * ^ ^ * +----------+ | | +---------+ * | read(9E) |-->-+ +--<--| frameio | * +----------+ +---------+ * * The rx path is rather linear. Packets come into us from mac. We always run * them through the various hooks, and if they come out of that, we inspect the * read data queue. If there is not enough space for a packet, we drop it. * Otherwise, we append it to the data queue, and fire read notifications * targeting anyone polling or doing blocking I/O on this device. Those * consumers then drain the head of the data queue. * * The tx path is more complicated due to mac flow control. After any call into * mac, we may have to potentially suspend writes and buffer data for an * arbitrary amount of time. As such, we need to carefully track the total * amount of outstanding data so that we don't waste kernel memory.
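 *
 * When mac does flow control us, the transmit capability hands back a
 * cookie that we must stash until the flow control callback fires. A
 * sketch using the capability members defined later in this file
 * (illustrative only; the hint and flag arguments are shown as zero):
 *
 *	vnd_mac_cookie_t cookie;
 *
 *	cookie = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, mp, 0, 0);
 *	if (cookie != 0) {
 *		mutex_enter(&vsp->vns_lock);
 *		vsp->vns_flags |= VNS_F_FLOW_CONTROLLED;
 *		vsp->vns_caps.vsc_fc_cookie = cookie;
 *		mutex_exit(&vsp->vns_lock);
 *	}
 *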
Matters are * further complicated by the fact that mac will asynchronously tell us when our * flow has been resumed. * * For data to be able to enter the system, it needs to be able to take a * reservation from the write data queue. Once the reservation has been * obtained, we enter the gsqueue so that we can actually append it. We use * gsqueues (serialization queues) to ensure that packets are manipulated in * order as we deal with draining and appending packets. We also leverage * its worker thread to help us do draining after mac has restored our flow. * * The following image describes the flow: * * +-----------+ +--------------+ +-------------------------+ +------+ * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done | * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+ * +-----------+ +--------------+ . +-------------------------+ * | ^ . * | | . reserve space from gsqueue * | | | * queue . . . * | space v * full | * . . . avail +------------------------+ * v | | vnd_squeue_tx_append() | * +--------+ +------------+ +------------------------+ * | EAGAIN |<--*------| Non-block? |<-+ | * +--------+ . +------------+ | v * . yes v | wait +--------------+ * no . .* * . . for | append chain | * +----+ space | to outgoing | * | mblk chain | * from gsqueue +--------------+ * | | * | +-------------------------------------------------+ * | | * | | yes . . . * v v . * +-----------------------+ +--------------+ . +------+ * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done | * +-----------------------+ +--------------+ +------+ * | | * +---------------------------------|---------------------+ * | | tx | * | no . . * queue . . * * | flow controlled . | empty * . fire pollout * | . v | if mblk_t's * +-------------+ . +---------------------+ | sent * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+ * | flags | +---------------------+ | * +-------------+ More data | | | More data | * and limit ^ v * . . and limit ^ * not reached . . * | | reached | * +----+ | | * v | * +----------+ +-------------+ +---------------------------+ * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with | * | control | | block flags | | vnd_squeue_tx_drain() and | * | callback | +-------------+ | GSQUEUE_FILL flag, iff | * +----------+ | not already scheduled | * +---------------------------+ * * The final path taken for a given write(9E)/frameio ioctl depends on whether * or not the vnd_dev_t is non-blocking. That controls the initial path of * trying to take a reservation in write data queue. If the device is in * non-blocking mode, we'll return EAGAIN when there is not enough space * available; otherwise, the calling thread blocks on the data queue. * * Today when we call into vnd_squeue_tx_drain() we will not try to drain the * entire queue, as that could be quite large and we don't want to needlessly * tie up the thread that's doing the drain until it has finished. Not only * could more data be coming in, but the draining thread could be a userland * thread that has more work to do. We have two limits today. There is an upper * bound on the total amount of data and the total number of mblk_t chains. If * we hit either limit, then we will schedule another drain in the gsqueue and * go from there. * * It's worth taking some time to describe how we interact with gsqueues. vnd * has a gsqueue_set_t for itself.
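 *
 * For reference, scheduling the drain described above amounts to a single
 * call; a sketch based on this driver's own usage (the exact gsqueue
 * signature is assumed from that usage):
 *
 *	gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk,
 *	    vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN);
 *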
It's important that vnd has its own set, as * the profile of work that vnd does is different from other sub-systems in the * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue. * Unlike TCP/IP, which uses a gsqueue per TCP connection, we end up * maintaining one for a given device. Because of that, we want to use a * pseudo-random one to try and spread out the load, and picking one at random * is likely to be just as good as any fancy algorithm we might come up with, * especially as any two devices could have radically different transmit * profiles. * * While some of the write path may seem complicated, it does allow us to * maintain an important property. Once we have acknowledged a write(9E) or * frameio ioctl, we will not drop the packet, excepting something like ipf via * the firewall hooks. * * There is one other source of flow control that can exist in the system which * is in the form of a barrier. The barrier is an internal mechanism used for * ensuring that a gsqueue is drained for a given device. We use this as part * of tearing down. Specifically we disable the write path so nothing new can be * inserted into the gsqueue and then insert a barrier block. Once the barrier * block comes out of the gsqueue, we know that nothing else in the gsqueue * can refer to the vnd_str_t that is being destroyed. * * --------------------- * vnd, zones, netstacks * --------------------- * * vnd devices are scoped to datalinks and datalinks are scoped to a netstack. * Because of that, vnd is also a netstack module. It registers with the * netstack sub-system and receives callbacks every time a netstack is created, * shut down, and destroyed. The netstack callbacks drive the creation and * destruction of the vnd_pnsd_t structures. * * Recall from the earlier architecture diagrams that every vnd device is scoped * to a netstack and known about by a given vnd_pnsd_t. When that netstack is * torn down, we also tear down any vnd devices that are hanging around. When * the netstack is torn down, we know that any zones that are scoped to that * netstack are being shut down and have no processes remaining. This is going * to be the case whether they are shared or exclusive stack zones. We have to * perform a careful dance. * * There are two different callbacks that happen on tear down: the first is a * shutdown callback, the second is a destroy callback. When the shutdown * callback is fired we need to prepare for the netstack to go away and ensure * that nothing can continue to persist. * * More specifically, when we get notice of a stack being shut down we first * remove the netstack from the global netstack list to ensure that no one new * can come in and find the netstack and get a reference to it. After that, we * notify the neti hooks that they're going away. Once that's all done, we get * to the heart of the matter. * * When shutting down there could be any number of outstanding contexts that * have a reference on the vnd_pnsd_t and on the individual links. However, we * know that no one new will be able to find the vnd_pnsd_t. To account for * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device * to the netstack's list. If this is set, then they must not append to it. * Once this is set, we know that the netstack's list of devices can never grow, * only shrink. * * Next, for each device we tag it with VND_D_ZONE_DYING.
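 *
 * The VND_NS_CONDEMNED check described above might look like the following
 * sketch in a path that wishes to append a device to the netstack's list
 * (the error value is illustrative):
 *
 *	mutex_enter(&nsp->vpnd_lock);
 *	if (nsp->vpnd_flags & VND_NS_CONDEMNED) {
 *		mutex_exit(&nsp->vpnd_lock);
 *		return (ESHUTDOWN);
 *	}
 *	list_insert_tail(&nsp->vpnd_dev_list, vdp);
 *	mutex_exit(&nsp->vpnd_lock);
 *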
VND_D_ZONE_DYING indicates that * the container for the device is being destroyed and that we should not allow * additional references to the device to be created, whether via open or * linking. The presence of this bit also allows things like the list ioctl and * sdev to know that they should ignore the device. At the conclusion of this being * set, we know that no one else should be able to obtain a new reference to the * device. * * Once that has been set for all devices, we go through and remove any existing * links that have been established in sdev. Because doing that may cause the * final reference on the device, which itself still holds a reference to * the netstack, to be dropped, we have to restart our walk due to dropped locks. We know that * this walk will eventually complete because the device cannot be relinked and * no new devices will be attached in this netstack due to VND_NS_CONDEMNED. * Once that's finished, the shutdown callback returns. * * When we reach the destroy callback, we simply wait for references on the * netstack to disappear. Because the zone has been shut down, all processes in * it that have open references have been terminated and reaped. Any threads * that are newly trying to reference it will fail. However, there is one thing * that can halt this that we have no control over, which is the global zone * holding open a reference to the device. In this case the zone halt will hang * in vnd_stack_destroy. Once the last reference is dropped, we finish destroying * the netinfo hooks and free the vnd_pnsd_t. * * ---- * sdev * ---- * * vnd registers an sdev plugin which allows it to dynamically fill out /dev/vnd * for both the global and non-global zones. In any given zone we always supply * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone * will also have an entry per link in that zone under /dev/vnd/%datalink, eg. * if a link was named net0, there would be a /dev/vnd/net0. The global zone can * also see every link for every zone, a la /dev/net, under * /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device * named net0, the global zone would have /dev/vnd/turin/net0. * * The sdev plugin has three interfaces that it supplies back to sdev. One is to * validate that a given node is still valid. The next is a callback from sdev * to say that it is no longer using the node. The third and final one is from * sdev where it asks us to fill a directory. All of the heavy lifting is done * in directory filling and in validation. We opt not to maintain a reference on * the device while there is an sdev node present. This makes the removal of * nodes much simpler and most of the possible failure modes shouldn't cause any * real problems. For example, the open path has to handle both dev_t's which no * longer exist and which are no longer linked. * * ----- * hooks * ----- * * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd * provides these for L3 IPv4 and IPv6 traffic. Each netstack provides these hooks * in a minimal fashion. While we will allow traffic to be filtered through the * hooks, we do not provide means for packet injection or additional inspection * at this time. There are a total of four different events created: * * o IPv4 physical in * o IPv4 physical out * o IPv6 physical in * o IPv6 physical out * * --------------- * Synchronization * --------------- * * To make our synchronization simpler, we've put more effort into making the * metadata/setup paths do more work.
That work allows the data paths to make * assumptions around synchronization that simplify the general case. Each major * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t, is * annotated with the protection that its members receive. The following * annotations are used: * * A Atomics; these values are only modified using atomic operations. * Currently this only applies to kstat values. * E Existence; no lock is needed to access this member, it does not * change while the structure is valid. * GL Global Lock; these members are protected by the global * vnd_dev_lock. * L Locked; access to the member is controlled by a lock that is in * the structure. * NSL netstack lock; this member is protected by the containing * netstack. This only applies to the vnd_dev_t`vdd_nslink. * X This member is special, and is discussed in this section. * * In addition to locking, we also have reference counts on the vnd_dev_t and * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure. * With rare exception, once a reference count is decremented, the consumer * should not assume that the data is valid any more. The only exception to this * is the case where we're removing an extant reference count from a link into * /devices or /dev. Reference counts are obtained on these structures as a part * of looking them up. * * # Global Lock Ordering * ###################### * * The following is the order that you must take locks in vnd: * * 1) vnd`vnd_dev_lock * 2) vnd_pnsd_t`vpnd_lock * 3) vnd_dev_t`vdd_lock * 4) vnd_str_t`vns_lock * 5) vnd_data_queue_t`vdq_lock * * One must adhere to the following rules: * * o You must acquire a lower numbered lock before a higher numbered lock. * o It is NOT legal to hold two locks of the same level concurrently, eg. you * can not hold two different vnd_dev_t's vdd_lock at the same time. * o You may release locks in any order. * o If you release a lock, you must honor the locking rules before acquiring * it again. * o You should not hold any locks when calling any of the rele functions. * * # Special Considerations * ######################## * * While most of the locking is what's expected, it's worth going into the * special nature that a few members hold. Today, only two structures have * special considerations: the vnd_dev_t and the vnd_str_t. All members with * special considerations have an additional annotation that describes how you * should interact with it. * * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is * attached or in the process of attaching. If a code path requires an attached * vnd_dev_t, eg. the data path and tear down path, then it * is always legal to dereference that member without a lock held. When they are * added to the system, it should be done under the vdd_lock as part * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the * lifetime of the vnd_dev_t. * * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it * always exists as it is a part of the structure. The only time that it's valid * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag * set or during tear down. Outside of those paths which are naturally * serialized, there is no explicit locking around the member. * * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not * initially set as part of creating the structure, but are set as part of * responding to the association ioctl.
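 *
 * As a sketch of the vnd_dev_t pattern just described (illustrative only;
 * a hold on credp is assumed to have been taken already):
 *
 *	mutex_enter(&vdp->vdd_lock);
 *	vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT;
 *	vdp->vdd_nsd = nsp;
 *	vdp->vdd_cr = credp;
 *	mutex_exit(&vdp->vdd_lock);
 *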
Anything in the data path or metadata * path that requires association may assume that vns_dev and vns_nsd exist, as we do not kick * off the state machine until they're set. * * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The * members are designed to be used as part of various operations with the * gsqueues. A lock isn't needed to use them, but to work with them, the * appropriate flag in the vnd_str_t`vns_flags must have been set by the current * thread. Otherwise, it is always fair game to refer to their addresses. Their * contents are ignored by vnd, but some members are manipulated by the gsqueue * subsystem. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Globals */ static dev_info_t *vnd_dip; static taskq_t *vnd_taskq; static kmem_cache_t *vnd_str_cache; static kmem_cache_t *vnd_dev_cache; static kmem_cache_t *vnd_pnsd_cache; static id_space_t *vnd_minors; static int vnd_list_init = 0; static sdev_plugin_hdl_t vnd_sdev_hdl; static gsqueue_set_t *vnd_sqset; static kmutex_t vnd_dev_lock; static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */ static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */ /* * STREAMS ioctls * * The STREAMS ioctls are internal to vnd. No one should be seeing them; as such, * they aren't a part of the header file. */ #define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80) /* * Private ioctl to associate a given STREAMS instance with a minor instance of * the character device. */ #define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1) typedef struct vnd_strioc_associate { minor_t vsa_minor; /* minor device node */ netstackid_t vsa_nsid; /* netstack id */ vnd_errno_t vsa_errno; /* errno */ } vnd_strioc_associate_t; typedef enum vnd_strioc_state { VSS_UNKNOWN = 0, VSS_COPYIN = 1, VSS_COPYOUT = 2, } vnd_strioc_state_t; typedef struct vnd_strioc { vnd_strioc_state_t vs_state; caddr_t vs_addr; } vnd_strioc_t; /* * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extent tags. Though * really, overlap is, at the end of the day, inevitable. */ #define VND_SQUEUE_TAG_TX_DRAIN 0x42 #define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43 #define VND_SQUEUE_TAG_VND_WRITE 0x44 #define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45 #define VND_SQUEUE_TAG_STRBARRIER 0x46 /* * vnd reserved names. These are names which are reserved by vnd and thus * shouldn't be used by some external program. */ static char *vnd_reserved_names[] = { "ctl", "zone", NULL }; /* * vnd's DTrace probe macros * * DTRACE_VND* are all for a stable provider. We also have an unstable internal * set of probes for reference count manipulation.
*/ #define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \ DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3); #define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ type4, arg4); #define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \ type4, arg4, type5, arg5) \ DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ type4, arg4, type5, arg5); #define DTRACE_VND_REFINC(vdp) \ DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref); #define DTRACE_VND_REFDEC(vdp) \ DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref); /* * Tunables */ size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */ size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */ /* * These numbers are designed as per-device tunables that are applied when a new * vnd device is attached. They're a rough stab at what may be a reasonable * amount of work to do in one burst in an squeue. */ size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */ size_t vnd_flush_nburst = 10; /* 10 frames */ /* * Constants related to our sdev plugins */ #define VND_SDEV_NAME "vnd" #define VND_SDEV_ROOT "/dev/vnd" #define VND_SDEV_ZROOT "/dev/vnd/zone" /* * vnd relies on privileges, not mode bits to limit access. As such, device * files are read-write to everyone. */ #define VND_SDEV_MODE (S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | \ S_IROTH | S_IWOTH) /* * Statistic macros */ #define VND_STAT_INC(vsp, field, val) \ atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val) #define VND_LATENCY_1MS 1000000 #define VND_LATENCY_10MS 10000000 #define VND_LATENCY_100MS 100000000 #define VND_LATENCY_1S 1000000000 #define VND_LATENCY_10S 10000000000 /* * Constants for vnd hooks */ static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; #define IPV4_MCAST_LEN 3 static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; #define IPV6_MCAST_LEN 2 static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 }; /* * vnd internal data structures and types */ struct vnd_str; struct vnd_dev; struct vnd_pnsd; /* * As part of opening the device stream we need to properly communicate with our * underlying stream. This is a bit of an asynchronous dance and we need to * properly work with dld to get everything set up. We have to initiate the * conversation with dld and as such we keep track of our state here. 
*/ typedef enum vnd_str_state { VNS_S_INITIAL = 0, VNS_S_INFO_SENT, VNS_S_EXCLUSIVE_SENT, VNS_S_ATTACH_SENT, VNS_S_BIND_SENT, VNS_S_SAP_PROMISC_SENT, VNS_S_MULTI_PROMISC_SENT, VNS_S_RX_ONLY_PROMISC_SENT, VNS_S_FIXUP_PROMISC_SENT, VNS_S_CAPAB_Q_SENT, VNS_S_CAPAB_E_SENT, VNS_S_ONLINE, VNS_S_SHUTTING_DOWN, VNS_S_MULTICAST_PROMISCOFF_SENT, VNS_S_SAP_PROMISCOFF_SENT, VNS_S_UNBIND_SENT, VNS_S_ZOMBIE } vnd_str_state_t; typedef enum vnd_str_flags { VNS_F_NEED_ZONE = 0x1, VNS_F_TASKQ_DISPATCHED = 0x2, VNS_F_CONDEMNED = 0x4, VNS_F_FLOW_CONTROLLED = 0x8, VNS_F_DRAIN_SCHEDULED = 0x10, VNS_F_BARRIER = 0x20, VNS_F_BARRIER_DONE = 0x40 } vnd_str_flags_t; typedef enum vnd_capab_flags { VNS_C_HCKSUM = 0x1, VNS_C_DLD = 0x2, VNS_C_DIRECT = 0x4, VNS_C_HCKSUM_BADVERS = 0x8 } vnd_capab_flags_t; /* * Definitions to interact with direct callbacks */ typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *, mac_header_info_t *); typedef uintptr_t vnd_mac_cookie_t; /* DLD Direct capability function */ typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t); /* DLD Direct tx function */ typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); /* DLD Direct function to set flow control callback */ typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t), void *); /* DLD Direct function to see if flow controlled still */ typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t); /* * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of. */ typedef struct vnd_str_capab { vnd_capab_flags_t vsc_flags; t_uscalar_t vsc_hcksum_opts; vnd_dld_cap_t vsc_capab_f; void *vsc_capab_hdl; vnd_dld_tx_t vsc_tx_f; void *vsc_tx_hdl; vnd_dld_set_fcb_t vsc_set_fcb_f; void *vsc_set_fcb_hdl; vnd_dld_is_fc_t vsc_is_fc_f; void *vsc_is_fc_hdl; vnd_mac_cookie_t vsc_fc_cookie; void *vsc_tx_fc_hdl; } vnd_str_capab_t; /* * The vnd_data_queue is a simple construct for storing a series of messages in * a queue. * * See synchronization section of the big theory statement for member * annotations. */ typedef struct vnd_data_queue { struct vnd_str *vdq_vns; /* E */ kmutex_t vdq_lock; kcondvar_t vdq_ready; /* Uses vdq_lock */ ssize_t vdq_max; /* L */ ssize_t vdq_cur; /* L */ mblk_t *vdq_head; /* L */ mblk_t *vdq_tail; /* L */ } vnd_data_queue_t; typedef struct vnd_str_stat { kstat_named_t vks_rbytes; kstat_named_t vks_rpackets; kstat_named_t vks_obytes; kstat_named_t vks_opackets; kstat_named_t vks_nhookindrops; kstat_named_t vks_nhookoutdrops; kstat_named_t vks_ndlpidrops; kstat_named_t vks_ndataindrops; kstat_named_t vks_ndataoutdrops; kstat_named_t vks_tdrops; kstat_named_t vks_linkname; kstat_named_t vks_zonename; kstat_named_t vks_nmacflow; kstat_named_t vks_tmacflow; kstat_named_t vks_mac_flow_1ms; kstat_named_t vks_mac_flow_10ms; kstat_named_t vks_mac_flow_100ms; kstat_named_t vks_mac_flow_1s; kstat_named_t vks_mac_flow_10s; } vnd_str_stat_t; /* * vnd stream structure * * See synchronization section of the big theory statement for member * annotations. 
*/ typedef struct vnd_str { kmutex_t vns_lock; kcondvar_t vns_cancelcv; /* Uses vns_lock */ kcondvar_t vns_barriercv; /* Uses vns_lock */ kcondvar_t vns_stcv; /* Uses vns_lock */ vnd_str_state_t vns_state; /* L */ vnd_str_state_t vns_laststate; /* L */ vnd_errno_t vns_errno; /* L */ vnd_str_flags_t vns_flags; /* L */ vnd_str_capab_t vns_caps; /* L */ taskq_ent_t vns_tqe; /* L */ vnd_data_queue_t vns_dq_read; /* E */ vnd_data_queue_t vns_dq_write; /* E */ mblk_t *vns_dlpi_inc; /* L */ queue_t *vns_rq; /* E */ queue_t *vns_wq; /* E */ queue_t *vns_lrq; /* E */ t_uscalar_t vns_dlpi_style; /* L */ t_uscalar_t vns_minwrite; /* L */ t_uscalar_t vns_maxwrite; /* L */ hrtime_t vns_fclatch; /* L */ hrtime_t vns_fcupdate; /* L */ kstat_t *vns_kstat; /* E */ gsqueue_t *vns_squeue; /* E */ mblk_t vns_drainblk; /* E + X */ mblk_t vns_barrierblk; /* E + X */ vnd_str_stat_t vns_ksdata; /* A */ size_t vns_nflush; /* L */ size_t vns_bsize; /* L */ struct vnd_dev *vns_dev; /* E + X */ struct vnd_pnsd *vns_nsd; /* E + X */ } vnd_str_t; typedef enum vnd_dev_flags { VND_D_ATTACH_INFLIGHT = 0x001, VND_D_ATTACHED = 0x002, VND_D_LINK_INFLIGHT = 0x004, VND_D_LINKED = 0x008, VND_D_CONDEMNED = 0x010, VND_D_ZONE_DYING = 0x020, VND_D_OPENED = 0x040 } vnd_dev_flags_t; /* * This represents the data associated with a minor device instance. * * See synchronization section of the big theory statement for member * annotations. */ typedef struct vnd_dev { kmutex_t vdd_lock; list_node_t vdd_link; /* GL */ list_node_t vdd_nslink; /* NSL */ int vdd_ref; /* L */ vnd_dev_flags_t vdd_flags; /* L */ minor_t vdd_minor; /* E */ dev_t vdd_devid; /* E */ ldi_ident_t vdd_ldiid; /* E */ ldi_handle_t vdd_ldih; /* X */ cred_t *vdd_cr; /* X */ vnd_str_t *vdd_str; /* L */ struct pollhead vdd_ph; /* E */ struct vnd_pnsd *vdd_nsd; /* E + X */ char vdd_datalink[VND_NAMELEN]; /* L */ char vdd_lname[VND_NAMELEN]; /* L */ } vnd_dev_t; typedef enum vnd_pnsd_flags { VND_NS_CONDEMNED = 0x1 } vnd_pnsd_flags_t; /* * Per netstack data structure. * * See synchronization section of the big theory statement for member * annotations. */ typedef struct vnd_pnsd { list_node_t vpnd_link; /* protected by global dev lock */ zoneid_t vpnd_zid; /* E */ netstackid_t vpnd_nsid; /* E */ boolean_t vpnd_hooked; /* E */ net_handle_t vpnd_neti_v4; /* E */ hook_family_t vpnd_family_v4; /* E */ hook_event_t vpnd_event_in_v4; /* E */ hook_event_t vpnd_event_out_v4; /* E */ hook_event_token_t vpnd_token_in_v4; /* E */ hook_event_token_t vpnd_token_out_v4; /* E */ net_handle_t vpnd_neti_v6; /* E */ hook_family_t vpnd_family_v6; /* E */ hook_event_t vpnd_event_in_v6; /* E */ hook_event_t vpnd_event_out_v6; /* E */ hook_event_token_t vpnd_token_in_v6; /* E */ hook_event_token_t vpnd_token_out_v6; /* E */ kmutex_t vpnd_lock; /* Protects remaining members */ kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */ int vpnd_ref; /* L */ vnd_pnsd_flags_t vpnd_flags; /* L */ list_t vpnd_dev_list; /* L */ } vnd_pnsd_t; static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *); /* * Drop function signature. 
*/ typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *); static void vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } static void vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_ndataindrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } static void vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_ndataoutdrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } static void vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_nhookindrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } static void vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_nhookoutdrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } /* ARGSUSED */ static void vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason) { panic("illegal vnd drop"); } /* ARGSUSED */ static void vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, mac_header_info_t *mhip) { mblk_t *mp; while (mp_chain != NULL) { mp = mp_chain; mp_chain = mp->b_next; vnd_drop_hook_in(vsp, mp, "stream not associated"); } } static vnd_pnsd_t * vnd_nsd_lookup(netstackid_t nsid) { vnd_pnsd_t *nsp; mutex_enter(&vnd_dev_lock); for (nsp = list_head(&vnd_nsd_list); nsp != NULL; nsp = list_next(&vnd_nsd_list, nsp)) { if (nsp->vpnd_nsid == nsid) { mutex_enter(&nsp->vpnd_lock); VERIFY(nsp->vpnd_ref >= 0); nsp->vpnd_ref++; mutex_exit(&nsp->vpnd_lock); break; } } mutex_exit(&vnd_dev_lock); return (nsp); } static vnd_pnsd_t * vnd_nsd_lookup_by_zid(zoneid_t zid) { netstack_t *ns; vnd_pnsd_t *nsp; ns = netstack_find_by_zoneid(zid); if (ns == NULL) return (NULL); nsp = vnd_nsd_lookup(ns->netstack_stackid); netstack_rele(ns); return (nsp); } static vnd_pnsd_t * vnd_nsd_lookup_by_zonename(char *zname) { zone_t *zonep; vnd_pnsd_t *nsp; zonep = zone_find_by_name(zname); if (zonep == NULL) return (NULL); nsp = vnd_nsd_lookup_by_zid(zonep->zone_id); zone_rele(zonep); return (nsp); } static void vnd_nsd_ref(vnd_pnsd_t *nsp) { mutex_enter(&nsp->vpnd_lock); /* * This can only be used on something that has been obtained through * some other means. As such, the caller should already have a reference * before adding another one. This function should not be used as a * means of creating the initial reference. 
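 *
 * For example, a caller that holds a reference from vnd_nsd_lookup() and
 * wants to hand the structure to a helper might do the following (a
 * sketch; helper() is hypothetical and calls vnd_nsd_rele() when done):
 *
 *	vnd_pnsd_t *nsp = vnd_nsd_lookup(nsid);
 *	if (nsp != NULL) {
 *		vnd_nsd_ref(nsp);
 *		helper(nsp);
 *		vnd_nsd_rele(nsp);
 *	}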
*/ VERIFY(nsp->vpnd_ref > 0); nsp->vpnd_ref++; mutex_exit(&nsp->vpnd_lock); cv_broadcast(&nsp->vpnd_ref_change); } static void vnd_nsd_rele(vnd_pnsd_t *nsp) { mutex_enter(&nsp->vpnd_lock); VERIFY(nsp->vpnd_ref > 0); nsp->vpnd_ref--; mutex_exit(&nsp->vpnd_lock); cv_broadcast(&nsp->vpnd_ref_change); } static vnd_dev_t * vnd_dev_lookup(minor_t m) { vnd_dev_t *vdp; mutex_enter(&vnd_dev_lock); for (vdp = list_head(&vnd_dev_list); vdp != NULL; vdp = list_next(&vnd_dev_list, vdp)) { if (vdp->vdd_minor == m) { mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_ref > 0); vdp->vdd_ref++; DTRACE_VND_REFINC(vdp); mutex_exit(&vdp->vdd_lock); break; } } mutex_exit(&vnd_dev_lock); return (vdp); } static void vnd_dev_free(vnd_dev_t *vdp) { /* * When the stream exists, we need to go through and make sure * communication gets torn down. As part of closing the stream, we * guarantee that nothing else should be able to enter the stream layer * at this point. That means no one should be able to call * read(), write(), or one of the frameio ioctls. */ if (vdp->vdd_flags & VND_D_ATTACHED) { (void) ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); crfree(vdp->vdd_cr); vdp->vdd_cr = NULL; /* * We have to remove ourselves from our parent's list now. It is * really quite important that we have already set the condemned * flag here so that our containing netstack basically knows * that we're on the way down and knows not to wait for us. It's * also important that we do that before we put a rele on the * device as that is the point at which it will check again. */ mutex_enter(&vdp->vdd_nsd->vpnd_lock); list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp); mutex_exit(&vdp->vdd_nsd->vpnd_lock); vnd_nsd_rele(vdp->vdd_nsd); vdp->vdd_nsd = NULL; } ASSERT(vdp->vdd_flags & VND_D_CONDEMNED); id_free(vnd_minors, vdp->vdd_minor); mutex_destroy(&vdp->vdd_lock); kmem_cache_free(vnd_dev_cache, vdp); } static void vnd_dev_ref(vnd_dev_t *vdp) { mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_ref > 0); vdp->vdd_ref++; DTRACE_VND_REFINC(vdp); mutex_exit(&vdp->vdd_lock); } /* * As part of releasing the hold on this we may tear down a given vnd_dev_t. As * such we need to make sure that we grab the list lock first before grabbing * the vnd_dev_t's lock to ensure proper lock ordering. */ static void vnd_dev_rele(vnd_dev_t *vdp) { mutex_enter(&vnd_dev_lock); mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_ref > 0); vdp->vdd_ref--; DTRACE_VND_REFDEC(vdp); if (vdp->vdd_ref > 0) { mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); return; } /* * Now that we've removed this from the list, we can go ahead and * drop the list lock. No one else can find this device and reference * it. As its reference count is zero, it by definition does not have * any remaining entries in /devices that could lead someone back to * this. */ vdp->vdd_flags |= VND_D_CONDEMNED; list_remove(&vnd_dev_list, vdp); mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); vnd_dev_free(vdp); } /* * Insert a message block chain if there's space, otherwise drop it. Return one * so the caller knows that anyone waiting for data should be woken up, eg. the * caller should consider a broadcast.
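 *
 * A sketch of typical use, pairing the push with the notifications the
 * caller is expected to fire (names illustrative):
 *
 *	mutex_enter(&vqp->vdq_lock);
 *	signal = vnd_dq_push(vqp, mp, B_FALSE, vnd_drop_in);
 *	mutex_exit(&vqp->vdq_lock);
 *	if (signal != 0) {
 *		cv_broadcast(&vqp->vdq_ready);
 *		pollwakeup(&vdp->vdd_ph, POLLIN | POLLRDNORM);
 *	}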
*/ static int vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved, vnd_dropper_f dropf) { size_t msize; ASSERT(MUTEX_HELD(&vqp->vdq_lock)); if (reserved == B_FALSE) { msize = msgsize(mp); if (vqp->vdq_cur + msize > vqp->vdq_max) { dropf(vqp->vdq_vns, mp, "buffer full"); return (0); } vqp->vdq_cur += msize; } if (vqp->vdq_head == NULL) { ASSERT(vqp->vdq_tail == NULL); vqp->vdq_head = mp; vqp->vdq_tail = mp; } else { vqp->vdq_tail->b_next = mp; vqp->vdq_tail = mp; } return (1); } /* * Remove a message block chain. If the amount of space in the buffer * has changed, we return 1. We have no way of knowing whether or not there is * enough space overall for a given writer who is blocked, so we always end up * having to return true and thus tell consumers that they should consider * signalling. */ static int vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp) { size_t msize; mblk_t *mp; ASSERT(MUTEX_HELD(&vqp->vdq_lock)); ASSERT(mpp != NULL); if (vqp->vdq_head == NULL) { ASSERT(vqp->vdq_tail == NULL); *mpp = NULL; return (0); } mp = vqp->vdq_head; msize = msgsize(mp); vqp->vdq_cur -= msize; if (mp->b_next == NULL) { vqp->vdq_head = NULL; vqp->vdq_tail = NULL; /* * We can't be certain that this is always going to be zero. * Someone may have basically taken a reservation of space on * the data queue, eg. claimed space but not yet pushed it * on. */ ASSERT(vqp->vdq_cur >= 0); } else { vqp->vdq_head = mp->b_next; ASSERT(vqp->vdq_cur > 0); } mp->b_next = NULL; *mpp = mp; return (1); } /* * Reserve space in the queue. This will bump up the size of the queue and * entitle the user to push something on later without bumping the space. */ static int vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size) { ASSERT(MUTEX_HELD(&vqp->vdq_lock)); ASSERT(size >= 0); if (size == 0) return (0); if (size + vqp->vdq_cur > vqp->vdq_max) return (0); vqp->vdq_cur += size; return (1); } static void vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size) { ASSERT(MUTEX_HELD(&vqp->vdq_lock)); ASSERT(size > 0); ASSERT(size <= vqp->vdq_cur); vqp->vdq_cur -= size; } static void vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf) { mblk_t *mp, *next; mutex_enter(&vqp->vdq_lock); for (mp = vqp->vdq_head; mp != NULL; mp = next) { next = mp->b_next; mp->b_next = NULL; dropf(vqp->vdq_vns, mp, "vnd_dq_flush"); } vqp->vdq_cur = 0; vqp->vdq_head = NULL; vqp->vdq_tail = NULL; mutex_exit(&vqp->vdq_lock); } static boolean_t vnd_dq_is_empty(vnd_data_queue_t *vqp) { boolean_t ret; mutex_enter(&vqp->vdq_lock); if (vqp->vdq_head == NULL) ret = B_TRUE; else ret = B_FALSE; mutex_exit(&vqp->vdq_lock); return (ret); } /* * Get a network uint16_t from the message and translate it into something the * host understands. */ static int vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out) { size_t mpsize; uint8_t *bp; mpsize = msgsize(mp); /* Check for overflow */ if (off + sizeof (uint16_t) > mpsize) return (1); mpsize = MBLKL(mp); while (off >= mpsize) { mp = mp->b_cont; off -= mpsize; mpsize = MBLKL(mp); } /* * Data is in network order. Note the second byte of data might be in * the next mp. */ bp = mp->b_rptr + off; *out = *bp << 8; if (off + 1 == mpsize) { mp = mp->b_cont; bp = mp->b_rptr; } else { bp++; } *out |= *bp; return (0); } /* * Given an mblk chain, find the mblk and address of a particular offset.
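 *
 * For example, to locate the start of the L3 header at a given offset (a
 * sketch; chain and off are the caller's, and the cast target is
 * illustrative):
 *
 *	mblk_t *mp;
 *	uintptr_t addr;
 *	ipha_t *ipha;
 *
 *	if (vnd_mbc_getoffset(chain, off, &mp, &addr) != 0)
 *		return (1);
 *	ipha = (ipha_t *)addr;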
*/ static int vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp) { size_t mpsize; if (off >= msgsize(mp)) return (1); mpsize = MBLKL(mp); while (off >= mpsize) { mp = mp->b_cont; off -= mpsize; mpsize = MBLKL(mp); } *mpp = mp; *offp = (uintptr_t)mp->b_rptr + off; return (0); } /* * Fetch the destination mac address. Set *dstpp to that mac address. If the data * is not contiguous in the first mblk_t, fill in datap and set *dstpp to it. */ static int vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap) { int i; if (MBLKL(mp) >= ETHERADDRL) { *dstpp = mp->b_rptr; return (0); } *dstpp = datap; for (i = 0; i < ETHERADDRL; i += 2, datap += 2) { if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0) return (1); } return (0); } static int vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4, hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6, hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop) { uint16_t etype; hook_pkt_event_t info; size_t offset, mblen; uint8_t *dstp; uint8_t dstaddr[6]; hook_event_t he; hook_event_token_t het; net_handle_t neti; /* * Before we can ask if we're interested we have to do enough work to * determine the ethertype. */ /* Byte 12 is either the VLAN tag or the ethertype */ if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) { ddrop(vsp, *mpp, "packet has incomplete ethernet header"); *mpp = NULL; return (1); } if (etype == ETHERTYPE_VLAN) { /* Actual ethertype is another four bytes in */ if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) { ddrop(vsp, *mpp, "packet has incomplete ethernet vlan header"); *mpp = NULL; return (1); } offset = sizeof (struct ether_vlan_header); } else { offset = sizeof (struct ether_header); } /* * At the moment we only hook on the kinds of things that the IP module * would normally handle. */ if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6) return (0); if (etype == ETHERTYPE_IP) { neti = netiv4; he = hev4; het = hetv4; } else { neti = netiv6; he = hev6; het = hetv6; } if (!he.he_interested) return (0); if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) { ddrop(vsp, *mpp, "packet has incomplete ethernet header"); *mpp = NULL; return (1); } /* * Now that we know we're interested, we have to do some additional * sanity checking for IPF's sake, ala ip_check_length(). Specifically * we need to check to make sure that the remaining packet size, * excluding MAC, is at least the size of an IP header. */ mblen = msgsize(*mpp); if ((etype == ETHERTYPE_IP && mblen - offset < IP_SIMPLE_HDR_LENGTH) || (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) { ddrop(vsp, *mpp, "packet has invalid IP header"); *mpp = NULL; return (1); } info.hpe_protocol = neti; info.hpe_ifp = (phy_if_t)vsp; info.hpe_ofp = (phy_if_t)vsp; info.hpe_mp = mpp; info.hpe_flags = 0; if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0) info.hpe_flags |= HPE_BROADCAST; else if (etype == ETHERTYPE_IP && bcmp(vnd_ipv4_mcast, dstp, IPV4_MCAST_LEN) == 0) info.hpe_flags |= HPE_MULTICAST; else if (etype == ETHERTYPE_IPV6 && bcmp(vnd_ipv6_mcast, dstp, IPV6_MCAST_LEN) == 0) info.hpe_flags |= HPE_MULTICAST; if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb, (uintptr_t *)&info.hpe_hdr) != 0) { ddrop(vsp, *mpp, "packet too small -- " "unable to find payload"); *mpp = NULL; return (1); } if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) { hdrop(vsp, *mpp, "dropped by hooks"); return (1); } return (0); } /* * This should not be used for DL_INFO_REQ.
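 * It always produces an M_PROTO message, while DL_INFO_REQ must be sent as
 * an M_PCPROTO message; vnd_st_sinfo() therefore builds that request by
 * hand. A sketch of typical use for other primitives:
 *
 *	mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
 *	if (mp == NULL) {
 *		vsp->vns_errno = VND_E_NOMEM;
 *		return (1);
 *	}
 *	putnext(vsp->vns_wq, mp);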
*/ static mblk_t * vnd_dlpi_alloc(size_t len, t_uscalar_t prim) { mblk_t *mp; mp = allocb(len, BPRI_MED); if (mp == NULL) return (NULL); mp->b_datap->db_type = M_PROTO; mp->b_wptr = mp->b_rptr + len; bzero(mp->b_rptr, len); ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; return (mp); } static void vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp) { mblk_t **mpp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); ASSERT(mp->b_next == NULL); mpp = &vsp->vns_dlpi_inc; while (*mpp != NULL) mpp = &((*mpp)->b_next); *mpp = mp; } static mblk_t * vnd_dlpi_inc_pop(vnd_str_t *vsp) { mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vsp->vns_dlpi_inc; if (mp != NULL) { VERIFY(mp->b_next == NULL || mp->b_next != mp); vsp->vns_dlpi_inc = mp->b_next; mp->b_next = NULL; } return (mp); } static int vnd_st_sinfo(vnd_str_t *vsp) { mblk_t *mp; dl_info_req_t *dlir; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), BPRI_HI); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } vsp->vns_state = VNS_S_INFO_SENT; cv_broadcast(&vsp->vns_stcv); mp->b_datap->db_type = M_PCPROTO; dlir = (dl_info_req_t *)mp->b_rptr; mp->b_wptr = (uchar_t *)&dlir[1]; dlir->dl_primitive = DL_INFO_REQ; putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_info(vnd_str_t *vsp) { dl_info_ack_t *dlia; mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); dlia = (dl_info_ack_t *)mp->b_rptr; vsp->vns_dlpi_style = dlia->dl_provider_style; vsp->vns_minwrite = dlia->dl_min_sdu; vsp->vns_maxwrite = dlia->dl_max_sdu; /* * At this time we only support DL_ETHER devices. */ if (dlia->dl_mac_type != DL_ETHER) { freemsg(mp); vsp->vns_errno = VND_E_NOTETHER; return (1); } /* * Because vnd operates on entire packets, we need to manually account * for the ethernet header information. We add the size of the * ether_vlan_header to account for this, regardless if it is using * vlans or not. */ vsp->vns_maxwrite += sizeof (struct ether_vlan_header); freemsg(mp); return (0); } static int vnd_st_sexclusive(vnd_str_t *vsp) { mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } vsp->vns_state = VNS_S_EXCLUSIVE_SENT; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_exclusive(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_exclusive"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (cprim != DL_EXCLUSIVE_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_exclusive: got ack/nack for wrong primitive"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (prim == DL_ERROR_ACK) vsp->vns_errno = VND_E_DLEXCL; freemsg(mp); return (prim == DL_ERROR_ACK); } /* * Send down a DLPI_ATTACH_REQ. 
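 *
 * Note that this is only done for DL_STYLE2 datalinks; style 1 links skip
 * straight from vnd_st_exclusive() to vnd_st_sbind() in
 * vnd_str_state_transition().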
*/ static int vnd_st_sattach(vnd_str_t *vsp) { mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0; vsp->vns_state = VNS_S_ATTACH_SENT; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_attach(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (cprim != DL_ATTACH_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_attach: Got ack/nack for wrong primitive"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (prim == DL_ERROR_ACK) vsp->vns_errno = VND_E_ATTACHFAIL; freemsg(mp); return (prim == DL_ERROR_ACK); } static int vnd_st_sbind(vnd_str_t *vsp) { mblk_t *mp; dl_bind_req_t *dbrp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), DL_BIND_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } dbrp = (dl_bind_req_t *)(mp->b_rptr); dbrp->dl_sap = 0; dbrp->dl_service_mode = DL_CLDLS; vsp->vns_state = VNS_S_BIND_SENT; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_bind(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (prim == DL_ERROR_ACK) vsp->vns_errno = VND_E_BINDFAIL; freemsg(mp); return (prim == DL_ERROR_ACK); } static int vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next) { mblk_t *mp; dl_promiscon_req_t *dprp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } dprp = (dl_promiscon_req_t *)mp->b_rptr; dprp->dl_level = type; vsp->vns_state = next; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_promisc(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_promisc"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (cprim != DL_PROMISCON_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_promisc: Got ack/nack for wrong primitive"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (prim == DL_ERROR_ACK) vsp->vns_errno = VND_E_PROMISCFAIL; freemsg(mp); return (prim == DL_ERROR_ACK); } static int vnd_st_scapabq(vnd_str_t *vsp) { mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } vsp->vns_state = VNS_S_CAPAB_Q_SENT; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } /* ARGSUSED */ static void vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, mac_header_info_t *mhip) { int signal = 0; mblk_t *mp; vnd_pnsd_t 
*nsp = vsp->vns_nsd; ASSERT(vsp != NULL); ASSERT(mp_chain != NULL); for (mp = mp_chain; mp != NULL; mp = mp_chain) { uint16_t vid; mp_chain = mp->b_next; mp->b_next = NULL; /* * If we were operating in a traditional dlpi context then we * would have enabled DLIOCRAW and rather than the fast path, we * would come through dld_str_rx_raw. That function does two * things that we have to consider doing ourselves. The first is * that it adjusts the b_rptr back to account for dld bumping us * past the mac header. It also tries to account for cases where * mac provides an illusion of the mac header. Fortunately, dld * only allows the fastpath when the media type is the same as * the native type. Therefore all we have to do here is adjust * the b_rptr. */ ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize); mp->b_rptr -= mhip->mhi_hdrsize; vid = VLAN_ID(mhip->mhi_tci); if (mhip->mhi_istagged && vid != VLAN_ID_NONE) { /* * This is an overlapping copy. Do not use bcopy(9F). */ (void) memmove(mp->b_rptr + 4, mp->b_rptr, 12); mp->b_rptr += 4; } if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4, nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6, nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0) continue; VND_STAT_INC(vsp, vks_rpackets, 1); VND_STAT_INC(vsp, vks_rbytes, msgsize(mp)); DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL, vnd_str_t *, vsp, mblk_t *, mp); mutex_enter(&vsp->vns_dq_read.vdq_lock); signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE, vnd_drop_in); mutex_exit(&vsp->vns_dq_read.vdq_lock); } if (signal != 0) { cv_broadcast(&vsp->vns_dq_read.vdq_ready); pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM); } } static void vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff) { VND_STAT_INC(vsp, vks_nmacflow, 1); VND_STAT_INC(vsp, vks_tmacflow, diff); if (diff >= VND_LATENCY_1MS) VND_STAT_INC(vsp, vks_mac_flow_1ms, 1); if (diff >= VND_LATENCY_10MS) VND_STAT_INC(vsp, vks_mac_flow_10ms, 1); if (diff >= VND_LATENCY_100MS) VND_STAT_INC(vsp, vks_mac_flow_100ms, 1); if (diff >= VND_LATENCY_1S) VND_STAT_INC(vsp, vks_mac_flow_1s, 1); if (diff >= VND_LATENCY_10S) VND_STAT_INC(vsp, vks_mac_flow_10s, 1); } /* * This is a callback from MAC that indicates that we are allowed to send * packets again. */ static void vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie) { vnd_str_t *vsp = arg; hrtime_t now; mutex_enter(&vsp->vns_lock); now = gethrtime(); /* * Check for the case that we beat vnd_squeue_tx_one to the punch. * There's also an additional case here that we got notified because * we're sharing a device that ran out of tx descriptors, even though it * wasn't because of us. */ if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) { vsp->vns_fcupdate = now; mutex_exit(&vsp->vns_lock); return; } ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED); ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie); vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED; vsp->vns_caps.vsc_fc_cookie = (vnd_mac_cookie_t)NULL; vsp->vns_fclatch = 0; DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, cookie); /* * If someone has asked to flush the squeue and thus inserted a barrier, * then we shouldn't schedule a drain.
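 *
 * That is safe because once vnd_strbarrier() completes, any further
 * transmit traffic re-enters the squeue via vnd_squeue_tx_append(), which
 * performs its own drain.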
*/ if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) { vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk, vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, VND_SQUEUE_TAG_MAC_FLOW_CONTROL); } mutex_exit(&vsp->vns_lock); } static void vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp) { ASSERT(MUTEX_HELD(&vsp->vns_lock)); VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0); } static void vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph) { ASSERT(MUTEX_HELD(&vsp->vns_lock)); VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0); } static int vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc) { int ret; dld_capab_direct_t d; mac_perim_handle_t mph; vnd_str_capab_t *c = &vsp->vns_caps; bzero(&d, sizeof (d)); d.di_rx_cf = (uintptr_t)rxfunc; d.di_rx_ch = vsp; d.di_flags = DI_DIRECT_RAW; vnd_mac_enter(vsp, &mph); /* * If we're coming in here for a second pass, we need to make sure that * we remove an existing flow control notification callback, otherwise * we'll create a duplicate that will remain with garbage data. */ if (c->vsc_tx_fc_hdl != NULL) { ASSERT(c->vsc_set_fcb_hdl != NULL); (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL, c->vsc_tx_fc_hdl); c->vsc_tx_fc_hdl = NULL; } if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl, DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) { c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df; c->vsc_tx_hdl = d.di_tx_dh; c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df; c->vsc_set_fcb_hdl = d.di_tx_cb_dh; c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df; c->vsc_is_fc_hdl = d.di_tx_fctl_dh; c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, vnd_mac_flow_control, vsp); c->vsc_flags |= VNS_C_DIRECT; ret = 0; } else { vsp->vns_errno = VND_E_DIRECTFAIL; ret = 1; } vnd_mac_exit(vsp, mph); return (ret); } static int vnd_st_capabq(vnd_str_t *vsp) { mblk_t *mp; dl_capability_ack_t *cap; dl_capability_sub_t *subp; dl_capab_hcksum_t *hck; dl_capab_dld_t *dld; unsigned char *rp; int ret = 0; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); rp = mp->b_rptr; cap = (dl_capability_ack_t *)rp; if (cap->dl_sub_length == 0) goto done; /* Don't try to process something too big */ if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) { VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); vsp->vns_errno = VND_E_CAPACKINVAL; ret = 1; goto done; } rp += cap->dl_sub_offset; while (cap->dl_sub_length > 0) { subp = (dl_capability_sub_t *)rp; /* Sanity check something crazy from down below */ if (subp->dl_length + sizeof (dl_capability_sub_t) > cap->dl_sub_length) { VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); vsp->vns_errno = VND_E_SUBCAPINVAL; ret = 1; goto done; } switch (subp->dl_cap) { case DL_CAPAB_HCKSUM: hck = (dl_capab_hcksum_t *)(rp + sizeof (dl_capability_sub_t)); if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) { vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS; break; } if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) != B_TRUE) { vsp->vns_errno = VND_E_CAPABPASS; ret = 1; goto done; } vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM; vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags; break; case DL_CAPAB_DLD: dld = (dl_capab_dld_t *)(rp + sizeof (dl_capability_sub_t)); if (dld->dld_version != DLD_CURRENT_VERSION) { vsp->vns_errno = VND_E_DLDBADVERS; ret = 1; goto done; } if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) != B_TRUE) { vsp->vns_errno = 
VND_E_CAPABPASS; ret = 1; goto done; } vsp->vns_caps.vsc_flags |= VNS_C_DLD; vsp->vns_caps.vsc_capab_f = (vnd_dld_cap_t)dld->dld_capab; vsp->vns_caps.vsc_capab_hdl = (void *)dld->dld_capab_handle; /* * At this point in time, we have to set up a direct * function that drops all input. This validates that * we'll be able to set up direct input and that we can * easily switch it over to the real data function * when we've plumbed everything up. */ if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) { /* vns_errno set by vnd_dld_cap_enable */ ret = 1; goto done; } break; default: /* Ignore unsupported cap */ break; } rp += sizeof (dl_capability_sub_t) + subp->dl_length; cap->dl_sub_length -= sizeof (dl_capability_sub_t) + subp->dl_length; } done: /* Make sure we enabled direct callbacks */ if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) { vsp->vns_errno = VND_E_DIRECTNOTSUP; ret = 1; } freemsg(mp); return (ret); } static void vnd_st_sonline(vnd_str_t *vsp) { VERIFY(MUTEX_HELD(&vsp->vns_lock)); vsp->vns_state = VNS_S_ONLINE; cv_broadcast(&vsp->vns_stcv); } static void vnd_st_shutdown(vnd_str_t *vsp) { mac_perim_handle_t mph; vnd_str_capab_t *vsc = &vsp->vns_caps; VERIFY(MUTEX_HELD(&vsp->vns_lock)); /* * At this point in time we know that there is no one transmitting as * our final reference has been torn down and that vnd_s_close inserted * a barrier to validate that everything is flushed. */ if (vsc->vsc_flags & VNS_C_DIRECT) { vnd_mac_enter(vsp, &mph); vsc->vsc_flags &= ~VNS_C_DIRECT; (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL, vsc->vsc_tx_fc_hdl); vsc->vsc_tx_fc_hdl = NULL; (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT, NULL, DLD_DISABLE); vnd_mac_exit(vsp, mph); } } static boolean_t vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next) { boolean_t ret = B_TRUE; mblk_t *mp; dl_promiscoff_req_t *dprp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCOFF_REQ); if (mp == NULL) { cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " "promiscoff request"); ret = B_FALSE; goto next; } dprp = (dl_promiscoff_req_t *)mp->b_rptr; dprp->dl_level = type; putnext(vsp->vns_wq, mp); next: vsp->vns_state = next; cv_broadcast(&vsp->vns_stcv); return (ret); } static void vnd_st_promiscoff(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); /* * Unlike other cases where we guard against the incoming packet being * NULL, during tear down we try to keep driving and therefore we may * have gotten here due to an earlier failure, so there's nothing to do.
*/ mp = vnd_dlpi_inc_pop(vsp); if (mp == NULL) return; prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_promiscoff"); return; } if (cprim != DL_PROMISCOFF_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_promiscoff: Got ack/nack for wrong primitive"); return; } if (prim == DL_ERROR_ACK) { cmn_err(CE_WARN, "!failed to disable promiscuous mode during " "vnd teardown"); } } static boolean_t vnd_st_sunbind(vnd_str_t *vsp) { mblk_t *mp; boolean_t ret = B_TRUE; mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); if (mp == NULL) { cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " "unbind request"); ret = B_FALSE; goto next; } putnext(vsp->vns_wq, mp); next: vsp->vns_state = VNS_S_UNBIND_SENT; cv_broadcast(&vsp->vns_stcv); return (ret); } static void vnd_st_unbind(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; /* * Unlike other cases where we guard against the incoming packet being * NULL, during tear down we try to keep driving and therefore we may * have gotten here due to an earlier failure, so there's nothing to do. */ mp = vnd_dlpi_inc_pop(vsp); if (mp == NULL) goto next; prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_unbind"); goto next; } if (cprim != DL_UNBIND_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_unbind: Got ack/nack for wrong primitive"); goto next; } if (prim == DL_ERROR_ACK) { cmn_err(CE_WARN, "!failed to unbind stream during vnd " "teardown"); } next: vsp->vns_state = VNS_S_ZOMBIE; cv_broadcast(&vsp->vns_stcv); } /* * Perform state transitions. This is a one way shot down the flow chart * described in the big theory statement. */ static void vnd_str_state_transition(void *arg) { boolean_t died = B_FALSE; vnd_str_t *vsp = arg; mblk_t *mp; mutex_enter(&vsp->vns_lock); if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL && vsp->vns_state != VNS_S_SHUTTING_DOWN)) { mutex_exit(&vsp->vns_lock); return; } /* * When trying to shut down, or unwinding from a failed enabling, rather * than immediately entering the ZOMBIE state, we may instead opt to try * and enter the next state in the progression. This is especially * important when trying to tear everything down.
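 *
 * Concretely, the teardown half of the switch below walks
 * VNS_S_SHUTTING_DOWN -> VNS_S_MULTICAST_PROMISCOFF_SENT ->
 * VNS_S_SAP_PROMISCOFF_SENT -> VNS_S_UNBIND_SENT -> VNS_S_ZOMBIE, and a
 * failed allocation in any of the "send" steps falls through (via the goto
 * below) to the next state rather than aborting the teardown.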
*/ loop: DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp, vnd_str_state_t, vsp->vns_state); switch (vsp->vns_state) { case VNS_S_INITIAL: VERIFY(vsp->vns_dlpi_inc == NULL); if (vnd_st_sinfo(vsp) != 0) died = B_TRUE; break; case VNS_S_INFO_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_info(vsp) == 0) { if (vnd_st_sexclusive(vsp) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_EXCLUSIVE_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_exclusive(vsp) == 0) { if (vsp->vns_dlpi_style == DL_STYLE2) { if (vnd_st_sattach(vsp) != 0) died = B_TRUE; } else { if (vnd_st_sbind(vsp) != 0) died = B_TRUE; } } else { died = B_TRUE; } break; case VNS_S_ATTACH_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_attach(vsp) == 0) { if (vnd_st_sbind(vsp) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_BIND_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_bind(vsp) == 0) { if (vnd_st_spromisc(vsp, DL_PROMISC_SAP, VNS_S_SAP_PROMISC_SENT) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_SAP_PROMISC_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_promisc(vsp) == 0) { if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI, VNS_S_MULTI_PROMISC_SENT) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_MULTI_PROMISC_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_promisc(vsp) == 0) { if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY, VNS_S_RX_ONLY_PROMISC_SENT) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_RX_ONLY_PROMISC_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_promisc(vsp) == 0) { if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS, VNS_S_FIXUP_PROMISC_SENT) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_FIXUP_PROMISC_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_promisc(vsp) == 0) { if (vnd_st_scapabq(vsp) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_CAPAB_Q_SENT: if (vnd_st_capabq(vsp) != 0) died = B_TRUE; else vnd_st_sonline(vsp); break; case VNS_S_SHUTTING_DOWN: vnd_st_shutdown(vsp); if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI, VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE) goto loop; break; case VNS_S_MULTICAST_PROMISCOFF_SENT: vnd_st_promiscoff(vsp); if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP, VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE) goto loop; break; case VNS_S_SAP_PROMISCOFF_SENT: vnd_st_promiscoff(vsp); if (vnd_st_sunbind(vsp) == B_FALSE) goto loop; break; case VNS_S_UNBIND_SENT: vnd_st_unbind(vsp); break; case VNS_S_ZOMBIE: while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) vnd_drop_ctl(vsp, mp, "vsp received data as a zombie"); break; default: panic("vnd_str_t entered an unknown state"); } if (died == B_TRUE) { ASSERT(vsp->vns_errno != VND_E_SUCCESS); vsp->vns_laststate = vsp->vns_state; vsp->vns_state = VNS_S_ZOMBIE; cv_broadcast(&vsp->vns_stcv); } mutex_exit(&vsp->vns_lock); } static void vnd_dlpi_taskq_dispatch(void *arg) { vnd_str_t *vsp = arg; int run = 1; while (run != 0) { vnd_str_state_transition(vsp); mutex_enter(&vsp->vns_lock); if (vsp->vns_flags & VNS_F_CONDEMNED || vsp->vns_dlpi_inc == NULL) { run = 0; vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED; } if (vsp->vns_flags & VNS_F_CONDEMNED) cv_signal(&vsp->vns_cancelcv); mutex_exit(&vsp->vns_lock); } } /* ARGSUSED */ static int vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len) { return (-1); } /* ARGSUSED */ static int vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) { return (-1); } /* ARGSUSED */ static int vnd_neti_getptmue(net_handle_t neti) { return (-1); } /* ARGSUSED */ 
static int vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, size_t nelem, net_ifaddr_t type[], void *storage) { return (-1); } /* ARGSUSED */ static int vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, zoneid_t *zid) { return (-1); } /* ARGSUSED */ static int vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, uint64_t *flags) { return (-1); } /* ARGSUSED */ static phy_if_t vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy) { return ((phy_if_t)-1); } /* ARGSUSED */ static phy_if_t vnd_neti_phylookup(net_handle_t neti, const char *name) { return ((phy_if_t)-1); } /* ARGSUSED */ static lif_if_t vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) { return (-1); } /* ARGSUSED */ static int vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet) { return (-1); } /* ARGSUSED */ static phy_if_t vnd_neti_route(net_handle_t neti, struct sockaddr *address, struct sockaddr *next) { return ((phy_if_t)-1); } /* ARGSUSED */ static int vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp) { return (-1); } /* ARGSUSED */ static int vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp) { return (-1); } static net_protocol_t vnd_neti_info_v4 = { NETINFO_VERSION, NHF_VND_INET, vnd_neti_getifname, vnd_neti_getmtu, vnd_neti_getptmue, vnd_neti_getlifaddr, vnd_neti_getlifzone, vnd_neti_getlifflags, vnd_neti_phygetnext, vnd_neti_phylookup, vnd_neti_lifgetnext, vnd_neti_inject, vnd_neti_route, vnd_neti_ispchksum, vnd_neti_isvchksum }; static net_protocol_t vnd_neti_info_v6 = { NETINFO_VERSION, NHF_VND_INET6, vnd_neti_getifname, vnd_neti_getmtu, vnd_neti_getptmue, vnd_neti_getlifaddr, vnd_neti_getlifzone, vnd_neti_getlifflags, vnd_neti_phygetnext, vnd_neti_phylookup, vnd_neti_lifgetnext, vnd_neti_inject, vnd_neti_route, vnd_neti_ispchksum, vnd_neti_isvchksum }; static int vnd_netinfo_init(vnd_pnsd_t *nsp) { nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid, &vnd_neti_info_v4); ASSERT(nsp->vpnd_neti_v4 != NULL); nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid, &vnd_neti_info_v6); ASSERT(nsp->vpnd_neti_v6 != NULL); nsp->vpnd_family_v4.hf_version = HOOK_VERSION; nsp->vpnd_family_v4.hf_name = "vnd_inet"; if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) { (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_family_v6.hf_version = HOOK_VERSION; nsp->vpnd_family_v6.hf_name = "vnd_inet6"; if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) { (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_event_in_v4.he_version = HOOK_VERSION; nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN; nsp->vpnd_event_in_v4.he_flags = 0; nsp->vpnd_event_in_v4.he_interested = B_FALSE; nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); if (nsp->vpnd_token_in_v4 == NULL) { (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: 
net_event_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_event_in_v6.he_version = HOOK_VERSION; nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN; nsp->vpnd_event_in_v6.he_flags = 0; nsp->vpnd_event_in_v6.he_interested = B_FALSE; nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); if (nsp->vpnd_token_in_v6 == NULL) { (void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_event_out_v4.he_version = HOOK_VERSION; nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT; nsp->vpnd_event_out_v4.he_flags = 0; nsp->vpnd_event_out_v4.he_interested = B_FALSE; nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); if (nsp->vpnd_token_out_v4 == NULL) { (void) net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); (void) net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); (void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_event_out_v6.he_version = HOOK_VERSION; nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT; nsp->vpnd_event_out_v6.he_flags = 0; nsp->vpnd_event_out_v6.he_interested = B_FALSE; nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); if (nsp->vpnd_token_out_v6 == NULL) { (void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); (void) net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); (void) net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); (void) net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); (void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } return (0); } static void vnd_netinfo_shutdown(vnd_pnsd_t *nsp) { int ret; ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); VERIFY(ret == 0); ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); VERIFY(ret == 0); ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); VERIFY(ret == 0); ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); VERIFY(ret == 0); } static void vnd_netinfo_fini(vnd_pnsd_t *nsp) { int ret; ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); VERIFY(ret == 0); ret = net_event_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_event_out_v4); VERIFY(ret == 0); ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); VERIFY(ret == 0); ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); VERIFY(ret == 0); ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); VERIFY(ret == 0); ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); VERIFY(ret == 0); ret = net_protocol_unregister(nsp->vpnd_neti_v4); VERIFY(ret == 0); ret = net_protocol_unregister(nsp->vpnd_neti_v6); VERIFY(ret == 0); } /* ARGSUSED */ static void vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy) { vnd_str_t *vsp = arg; VERIFY(bmp == &vsp->vns_barrierblk); mutex_enter(&vsp->vns_lock); VERIFY(vsp->vns_flags & VNS_F_BARRIER); VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE)); vsp->vns_flags |= VNS_F_BARRIER_DONE; mutex_exit(&vsp->vns_lock); /* * For better or worse, we have to broadcast here as we could have a * thread that's blocked for completion as well as one that's blocked * waiting to do a barrier itself. */ cv_broadcast(&vsp->vns_barriercv); } /* * This is a data barrier for the stream while it is in fastpath mode. It blocks * and ensures that there is nothing else in the squeue. */ static void vnd_strbarrier(vnd_str_t *vsp) { mutex_enter(&vsp->vns_lock); while (vsp->vns_flags & VNS_F_BARRIER) cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); vsp->vns_flags |= VNS_F_BARRIER; mutex_exit(&vsp->vns_lock); gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk, vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER); mutex_enter(&vsp->vns_lock); while (!(vsp->vns_flags & VNS_F_BARRIER_DONE)) cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); vsp->vns_flags &= ~VNS_F_BARRIER; vsp->vns_flags &= ~VNS_F_BARRIER_DONE; mutex_exit(&vsp->vns_lock); /* * We have to broadcast in case anyone is waiting for the barrier * themselves. */ cv_broadcast(&vsp->vns_barriercv); } /* * Based on the type of message that we're dealing with we're going to want to * do one of several things. Basically if it looks like it's something we know * about, we should probably handle it in one of our transition threads. * Otherwise, we should just simply putnext. */ static int vnd_s_rput(queue_t *q, mblk_t *mp) { t_uscalar_t prim; int dispatch = 0; vnd_str_t *vsp = q->q_ptr; switch (DB_TYPE(mp)) { case M_PROTO: case M_PCPROTO: if (MBLKL(mp) < sizeof (t_uscalar_t)) { vnd_drop_ctl(vsp, mp, "PROTO message too short"); break; } prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) { vnd_drop_ctl(vsp, mp, "received an unsupported dlpi DATA req"); break; } /* * Enqueue the entry and fire off a taskq dispatch.
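 *
 * At most one taskq entry is ever outstanding per stream:
 * VNS_F_TASKQ_DISPATCHED is set here under vns_lock and is only cleared by
 * vnd_dlpi_taskq_dispatch() once vns_dlpi_inc has been drained, so
 * concurrent calls simply append to the pending list.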
*/ mutex_enter(&vsp->vns_lock); vnd_dlpi_inc_push(vsp, mp); if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { dispatch = 1; vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; } mutex_exit(&vsp->vns_lock); if (dispatch != 0) taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, &vsp->vns_tqe); break; case M_DATA: vnd_drop_in(vsp, mp, "M_DATA via put(9E)"); break; default: putnext(vsp->vns_rq, mp); } return (0); } /* ARGSUSED */ static void vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp) { int error; vnd_strioc_t *visp; if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE || iocp->ioc_count != TRANSPARENT) { error = EINVAL; goto nak; } /* * All streams ioctls that we support must use kcred as a means to * distinguish that this is a layered open by the kernel as opposed to * one by a user who has done an I_PUSH of the module. */ if (iocp->ioc_cr != kcred) { error = EPERM; goto nak; } if (mp->b_cont == NULL) { error = EAGAIN; goto nak; } visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP); ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t)); visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr; visp->vs_state = VSS_COPYIN; mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL); qreply(q, mp); return; nak: if (mp->b_cont != NULL) { freemsg(mp->b_cont); mp->b_cont = NULL; } iocp->ioc_error = error; mp->b_datap->db_type = M_IOCNAK; iocp->ioc_count = 0; qreply(q, mp); } static void vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) { vnd_str_state_t state; struct copyreq *crp; vnd_strioc_associate_t *vss; vnd_dev_t *vdp = NULL; vnd_pnsd_t *nsp = NULL; char iname[2*VND_NAMELEN]; zone_t *zone; vnd_strioc_t *visp; visp = (vnd_strioc_t *)csp->cp_private; /* If it's not ours, it's not our problem */ if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { if (q->q_next != NULL) { putnext(q, mp); } else { VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); } kmem_free(visp, sizeof (vnd_strioc_t)); return; } /* The nak is already sent for us */ if (csp->cp_rval != 0) { vnd_drop_ctl(vsp, mp, "M_COPYIN failed"); kmem_free(visp, sizeof (vnd_strioc_t)); return; } /* Data is sitting for us in b_cont */ if (mp->b_cont == NULL || MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) { kmem_free(visp, sizeof (vnd_strioc_t)); miocnak(q, mp, 0, EINVAL); return; } vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr; vdp = vnd_dev_lookup(vss->vsa_minor); if (vdp == NULL) { vss->vsa_errno = VND_E_NODEV; goto nak; } nsp = vnd_nsd_lookup(vss->vsa_nsid); if (nsp == NULL) { vss->vsa_errno = VND_E_NONETSTACK; goto nak; } mutex_enter(&vsp->vns_lock); if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) { mutex_exit(&vsp->vns_lock); vss->vsa_errno = VND_E_ASSOCIATED; goto nak; } vsp->vns_nsd = nsp; vsp->vns_flags &= ~VNS_F_NEED_ZONE; vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; mutex_exit(&vsp->vns_lock); taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, &vsp->vns_tqe); /* At this point we need to wait until we have transitioned to ONLINE */ mutex_enter(&vsp->vns_lock); while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE) cv_wait(&vsp->vns_stcv, &vsp->vns_lock); state = vsp->vns_state; mutex_exit(&vsp->vns_lock); if (state == VNS_S_ZOMBIE) { vss->vsa_errno = vsp->vns_errno; goto nak; } mutex_enter(&vdp->vdd_lock); mutex_enter(&vsp->vns_lock); VERIFY(vdp->vdd_str == NULL); /* * Now initialize the remaining kstat properties and let's go ahead and * create it.
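 *
 * The instance name encodes the zone and minor number, e.g. minor 5 in
 * zone 3 becomes "z3_5" under the "vnd" module. The kstat is created in
 * the global zone so that it remains observable system-wide.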
*/ (void) snprintf(iname, sizeof (iname), "z%d_%d", vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor); vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net", KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); if (vsp->vns_kstat == NULL) { vss->vsa_errno = VND_E_KSTATCREATE; mutex_exit(&vsp->vns_lock); mutex_exit(&vdp->vdd_lock); goto nak; } vdp->vdd_str = vsp; vsp->vns_dev = vdp; /* * Now, it's time to do the last thing that can fail, changing out the * input function. After this we know that we can receive data, so we * should make sure that we're ready. */ if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) { vss->vsa_errno = VND_E_DIRECTFAIL; vdp->vdd_str = NULL; vsp->vns_dev = NULL; mutex_exit(&vsp->vns_lock); mutex_exit(&vdp->vdd_lock); goto nak; } zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid); ASSERT(zone != NULL); vsp->vns_kstat->ks_data = &vsp->vns_ksdata; /* Account for zone name */ vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1; /* Account for eventual link name */ vsp->vns_kstat->ks_data_size += VND_NAMELEN; kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name); kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, vdp->vdd_lname); zone_rele(zone); kstat_install(vsp->vns_kstat); mutex_exit(&vsp->vns_lock); mutex_exit(&vdp->vdd_lock); /* * Note that the vnd_str_t does not keep a permanent hold on the * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what * the netstack goes through to take care of everything. */ vss->vsa_errno = VND_E_SUCCESS; nak: if (vdp != NULL) vnd_dev_rele(vdp); if (nsp != NULL) vnd_nsd_rele(nsp); /* * Change the copyin request to a copyout. Note that we can't use * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's * okay, as the copyin vs. copyout is basically the same.
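 *
 * Both requests share the struct copyreq layout, so the retyping below
 * amounts to setting DB_TYPE to M_COPYOUT, pointing cq_addr back at the
 * user address saved in vs_addr, and sizing the transfer for the
 * vnd_strioc_associate_t we are handing back.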
*/ DB_TYPE(mp) = M_COPYOUT; visp->vs_state = VSS_COPYOUT; crp = (struct copyreq *)mp->b_rptr; crp->cq_private = (void *)visp; crp->cq_addr = visp->vs_addr; crp->cq_size = sizeof (vnd_strioc_associate_t); qreply(q, mp); } static void vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) { ASSERT(csp->cp_private != NULL); kmem_free(csp->cp_private, sizeof (vnd_strioc_t)); if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { if (q->q_next != NULL) { putnext(q, mp); } else { VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); } return; } /* The nak is already sent for us */ if (csp->cp_rval != 0) { vnd_drop_ctl(vsp, mp, "M_COPYOUT failed"); return; } /* Ack and let's be done with it all */ miocack(q, mp, 0, 0); } static int vnd_s_wput(queue_t *q, mblk_t *mp) { vnd_str_t *vsp = q->q_ptr; struct copyresp *crp; vnd_strioc_state_t vstate; vnd_strioc_t *visp; switch (DB_TYPE(mp)) { case M_IOCTL: vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr); return (0); case M_IOCDATA: crp = (struct copyresp *)mp->b_rptr; ASSERT(crp->cp_private != NULL); visp = (vnd_strioc_t *)crp->cp_private; vstate = visp->vs_state; ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT); if (vstate == VSS_COPYIN) vnd_striocdata(q, vsp, mp, (struct copyresp *)mp->b_rptr); else vnd_stroutdata(q, vsp, mp, (struct copyresp *)mp->b_rptr); return (0); default: break; } if (q->q_next != NULL) putnext(q, mp); else vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput"); return (0); } /* ARGSUSED */ static int vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp) { vnd_str_t *vsp; uint_t rand; if (q->q_ptr != NULL) return (EINVAL); if (!(sflag & MODOPEN)) return (ENXIO); if (credp != kcred) return (EPERM); vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP); bzero(vsp, sizeof (*vsp)); mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL); cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL); cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL); vsp->vns_state = VNS_S_INITIAL; mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL); mutex_enter(&vnd_dev_lock); vsp->vns_dq_read.vdq_max = vnd_vdq_default_size; vsp->vns_dq_read.vdq_vns = vsp; vsp->vns_dq_write.vdq_max = vnd_vdq_default_size; vsp->vns_dq_write.vdq_vns = vsp; mutex_exit(&vnd_dev_lock); vsp->vns_rq = q; vsp->vns_wq = WR(q); q->q_ptr = WR(q)->q_ptr = vsp; vsp->vns_flags = VNS_F_NEED_ZONE; vsp->vns_nflush = vnd_flush_nburst; vsp->vns_bsize = vnd_flush_burst_size; (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand); /* * We create our kstat and initialize all of its fields now, but we * don't install it until we actually do the zone association so we can * get everything.
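 *
 * Until kstat_install() runs in vnd_striocdata(), the kstat is not visible
 * to any consumer, so the named entries below can be initialized without
 * additional locking.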
*/ kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname", KSTAT_DATA_STRING); kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms, "flowcontrol_100ms", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s", KSTAT_DATA_UINT64); qprocson(q); /* * Now that we've called qprocson, grab the lower module for making sure * that we don't have any pass through modules. */ vsp->vns_lrq = RD(vsp->vns_wq->q_next); return (0); } /* ARGSUSED */ static int vnd_s_close(queue_t *q, int flag, cred_t *credp) { vnd_str_t *vsp; mblk_t *mp; VERIFY(WR(q)->q_next != NULL); vsp = q->q_ptr; ASSERT(vsp != NULL); /* * We need to transition ourselves down. This means that we have a few * important different things to do in the process of tearing down our * input and output buffers, making sure we've drained the current * squeue, and disabling the fast path. Before we disable the fast path, * we should make sure the squeue is drained. Because we're in streams * close, we know that no packets can come into us from userland, but we * can receive more. As such, the following is the exact order of things * that we do: * * 1) flush the vns_dq_read * 2) Insert the drain mblk * 3) When it's been received, tear down the fast path by kicking * off the state machine. 
* 4) One final flush of both the vns_dq_read and vns_dq_write */ vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); vnd_strbarrier(vsp); mutex_enter(&vsp->vns_lock); vsp->vns_state = VNS_S_SHUTTING_DOWN; if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, &vsp->vns_tqe); } while (vsp->vns_state != VNS_S_ZOMBIE) cv_wait(&vsp->vns_stcv, &vsp->vns_lock); mutex_exit(&vsp->vns_lock); qprocsoff(q); mutex_enter(&vsp->vns_lock); vsp->vns_flags |= VNS_F_CONDEMNED; while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock); while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) vnd_drop_ctl(vsp, mp, "vnd_s_close"); mutex_exit(&vsp->vns_lock); q->q_ptr = NULL; vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out); mutex_destroy(&vsp->vns_dq_read.vdq_lock); mutex_destroy(&vsp->vns_dq_write.vdq_lock); if (vsp->vns_kstat != NULL) kstat_delete(vsp->vns_kstat); mutex_destroy(&vsp->vns_lock); cv_destroy(&vsp->vns_stcv); cv_destroy(&vsp->vns_barriercv); cv_destroy(&vsp->vns_cancelcv); kmem_cache_free(vnd_str_cache, vsp); return (0); } static vnd_mac_cookie_t vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp) { hrtime_t txtime; vnd_mac_cookie_t vc; VND_STAT_INC(vsp, vks_opackets, 1); VND_STAT_INC(vsp, vks_obytes, msgsize(mp)); DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL, vnd_str_t *, vsp, mblk_t *, mp); /* Actually tx now */ txtime = gethrtime(); vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, mp, 0, MAC_DROP_ON_NO_DESC); /* * We need to check two different conditions before we immediately set * the flow control flag. The first thing that we need to do is verify * that this is an instance of hard flow control, so to say. The flow * control callbacks won't always fire in cases where we still get a * cookie returned. The explicit check for flow control will guarantee * us that we'll get a subsequent notification callback. * * The second case comes about because we do not hold the * vnd_str_t`vns_lock across calls to tx; we need to determine if a flow * control notification already came across for us in a different thread * calling vnd_mac_flow_control(). To deal with this, we record a * timestamp every time that we change the flow control state. We grab * txtime here before we transmit because that guarantees that the * hrtime_t of the call to vnd_mac_flow_control() will be after txtime. * * If the flow control notification beat us to the punch, the value of * vns_fcupdate will be larger than the value of txtime, and we should * just record the statistics. However, if we didn't beat it to the * punch (txtime > vns_fcupdate), then we know that it's safe to wait * for a notification.
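 *
 * As an illustrative timeline: if we record txtime at t=100, the transmit
 * consumes the last descriptor, and vnd_mac_flow_control() fires at t=150
 * (updating vns_fcupdate) before we reacquire vns_lock below, then
 * vns_fcupdate - txtime > 0 and we only record statistics. Otherwise the
 * difference is negative and we latch VNS_F_FLOW_CONTROLLED, trusting the
 * pending notification to clear it.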
*/ if (vc != (vnd_mac_cookie_t)NULL) { hrtime_t diff; if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl, vc) == 0) return ((vnd_mac_cookie_t)NULL); mutex_enter(&vsp->vns_lock); diff = vsp->vns_fcupdate - txtime; if (diff > 0) { mutex_exit(&vsp->vns_lock); vnd_mac_flow_control_stat(vsp, diff); return ((vnd_mac_cookie_t)NULL); } vsp->vns_flags |= VNS_F_FLOW_CONTROLLED; vsp->vns_caps.vsc_fc_cookie = vc; vsp->vns_fclatch = txtime; vsp->vns_fcupdate = txtime; DTRACE_VND3(flow__blocked, vnd_str_t *, vsp, uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc); mutex_exit(&vsp->vns_lock); } return (vc); } /* ARGSUSED */ static void vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) { mblk_t *mp; int nmps; size_t mptot, nflush, bsize; boolean_t blocked, empty; vnd_data_queue_t *vqp; vnd_str_t *vsp = arg; mutex_enter(&vsp->vns_lock); /* * We either enter here via a squeue or via vnd_squeue_tx_append(). In * the former case we need to mark that there is no longer an active * user of the drain block. */ if (drain_mp != NULL) { VERIFY(drain_mp == &vsp->vns_drainblk); VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED); vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED; } /* * If we're still flow controlled or under a flush barrier, nothing to * do. */ if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) { mutex_exit(&vsp->vns_lock); return; } nflush = vsp->vns_nflush; bsize = vsp->vns_bsize; mutex_exit(&vsp->vns_lock); /* * We're potentially going deep into the networking layer; make sure the * guest can't run concurrently. */ smt_begin_unsafe(); nmps = 0; mptot = 0; blocked = B_FALSE; vqp = &vsp->vns_dq_write; while (nmps < nflush && mptot <= bsize) { mutex_enter(&vqp->vdq_lock); if (vnd_dq_pop(vqp, &mp) == 0) { mutex_exit(&vqp->vdq_lock); break; } mutex_exit(&vqp->vdq_lock); nmps++; mptot += msgsize(mp); if (vnd_squeue_tx_one(vsp, mp) != (vnd_mac_cookie_t)NULL) { blocked = B_TRUE; break; } } smt_end_unsafe(); empty = vnd_dq_is_empty(&vsp->vns_dq_write); /* * If the queue is not empty, we're not blocked, and there isn't a drain * scheduled, put it into the squeue with the drain block and * GSQUEUE_FILL. */ if (blocked == B_FALSE && empty == B_FALSE) { mutex_enter(&vsp->vns_lock); if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) { mblk_t *mp = &vsp->vns_drainblk; vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; gsqueue_enter_one(vsp->vns_squeue, mp, vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN); } mutex_exit(&vsp->vns_lock); } /* * If we drained some amount of data, we need to signal the data queue. */ if (nmps > 0) { cv_broadcast(&vsp->vns_dq_write.vdq_ready); pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT); } } /* ARGSUSED */ static void vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy) { vnd_str_t *vsp = arg; vnd_data_queue_t *vqp = &vsp->vns_dq_write; vnd_pnsd_t *nsp = vsp->vns_nsd; size_t len = msgsize(mp); /* * Before we append this packet, we should run it through the firewall * rules. */ if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6, nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out, vnd_drop_out) != 0) { /* * Because we earlier reserved space for this packet and it's * not making the cut, we need to go through and unreserve that * space. Also note that the message block will likely be freed * by the time we return from vnd_hook so we cannot rely on it.
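 *
 * For reference, the reservation protocol at work here, in sketch form:
 * the producer does
 *
 *	mutex_enter(&vqp->vdq_lock);
 *	ok = vnd_dq_reserve(vqp, msgsize(mp));
 *	mutex_exit(&vqp->vdq_lock);
 *
 * before entering the squeue, and this function then either pushes the
 * packet with reserved == B_TRUE (skipping the accounting that was already
 * done) or, as in this failure path, returns the space with
 * vnd_dq_unreserve().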
*/ mutex_enter(&vqp->vdq_lock); vnd_dq_unreserve(vqp, len); mutex_exit(&vqp->vdq_lock); return; } /* * We earlier reserved space for this packet. So for now simply append * it and call drain. We know that no other drain can be going on right * now thanks to the squeue. */ mutex_enter(&vqp->vdq_lock); (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic); mutex_exit(&vqp->vdq_lock); vnd_squeue_tx_drain(vsp, NULL, NULL, NULL); } /* * We need to see if this is a valid name of sorts for us. That means a few * things. First off, we can't assume that what we've been given has actually * been null terminated. More importantly, that it's a valid name as far as * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We * further constrain ourselves to simply alphanumeric characters and a few * additional ones, ':', '-', and '_'. */ static int vnd_validate_name(const char *buf, size_t buflen) { int i, len; /* First make sure a null terminator exists */ for (i = 0; i < buflen; i++) if (buf[i] == '\0') break; len = i; if (i == 0 || i == buflen) return (0); for (i = 0; i < len; i++) if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' && buf[i] != '_') return (0); return (1); } static int vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag) { vnd_ioc_attach_t via; vnd_strioc_associate_t vss; vnd_pnsd_t *nsp; zone_t *zonep; zoneid_t zid; char buf[2*VND_NAMELEN]; int ret, rp; if (secpolicy_net_config(credp, B_FALSE) != 0) return (EPERM); if (secpolicy_net_rawaccess(credp) != 0) return (EPERM); if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0) return (EFAULT); via.via_errno = VND_E_SUCCESS; if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) { via.via_errno = VND_E_BADNAME; ret = EIO; goto errcopyout; } /* * Only the global zone can request to create a device in a different * zone. */ zid = crgetzoneid(credp); if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 && zid != via.via_zoneid) { via.via_errno = VND_E_PERM; ret = EIO; goto errcopyout; } if (via.via_zoneid == -1) via.via_zoneid = zid; /* * Establish the name we'll use now. We want to be extra paranoid about * the device we're opening so check that now. */ if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) { zonep = zone_find_by_id(via.via_zoneid); if (zonep == NULL) { via.via_errno = VND_E_NOZONE; ret = EIO; goto errcopyout; } if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name, via.via_name) >= sizeof (buf)) { zone_rele(zonep); via.via_errno = VND_E_BADNAME; ret = EIO; goto errcopyout; } (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s", zonep->zone_name, via.via_name); zone_rele(zonep); zonep = NULL; } else { if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >= sizeof (buf)) { via.via_errno = VND_E_BADNAME; ret = EIO; goto errcopyout; } (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name); } /* * If our zone is dying then the netstack will have been removed from * this list. */ nsp = vnd_nsd_lookup_by_zid(via.via_zoneid); if (nsp == NULL) { via.via_errno = VND_E_NOZONE; ret = EIO; goto errcopyout; } /* * Note we set the attached handle even though we haven't actually * finished the process of attaching the ldi handle. 
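 *
 * The VND_D_ATTACH_INFLIGHT flag set below provides that exclusion: a
 * competing attach of this vnd_dev_t will fail with VND_E_ATTACHED until
 * we either complete and set VND_D_ATTACHED or unwind in the error path.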
*/ mutex_enter(&vdp->vdd_lock); if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) { mutex_exit(&vdp->vdd_lock); vnd_nsd_rele(nsp); via.via_errno = VND_E_ATTACHED; ret = EIO; goto errcopyout; } vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT; ASSERT(vdp->vdd_cr == NULL); crhold(credp); vdp->vdd_cr = credp; ASSERT(vdp->vdd_nsd == NULL); vdp->vdd_nsd = nsp; mutex_exit(&vdp->vdd_lock); /* * Place an additional hold on the vnd_pnsd_t as we go through and do * all of the rest of our work. This will be the hold that we keep for * as long as this thing is attached. */ vnd_nsd_ref(nsp); ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr, &vdp->vdd_ldih, vdp->vdd_ldiid); if (ret != 0) { if (ret == ENODEV) via.via_errno = VND_E_NODATALINK; goto err; } /* * Unfortunately the I_PUSH interface doesn't allow us a way to detect * whether or not we're coming in from a layered device. We really want * to make sure that a normal user can't push on our streams module. * Currently the only idea I have for this is to make sure that the * credp is kcred which is really terrible. */ ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL, kcred, &rp); if (ret != 0) { rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); VERIFY(rp == 0); via.via_errno = VND_E_STRINIT; ret = EIO; goto err; } vss.vsa_minor = vdp->vdd_minor; vss.vsa_nsid = nsp->vpnd_nsid; ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss, FKIOCTL, kcred, &rp); if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) { rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); VERIFY(rp == 0); if (ret == 0) { via.via_errno = vss.vsa_errno; ret = EIO; } goto err; } mutex_enter(&vdp->vdd_nsd->vpnd_lock); /* * There's a chance that our netstack was condemned while we've had a * hold on it. As such we need to check and if so, error out. */ if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) { mutex_exit(&vdp->vdd_nsd->vpnd_lock); rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); VERIFY(rp == 0); ret = EIO; via.via_errno = VND_E_NOZONE; goto err; } mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_str != NULL); vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; vdp->vdd_flags |= VND_D_ATTACHED; (void) strlcpy(vdp->vdd_datalink, via.via_name, sizeof (vdp->vdd_datalink)); list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp); mutex_exit(&vdp->vdd_lock); mutex_exit(&vdp->vdd_nsd->vpnd_lock); vnd_nsd_rele(nsp); return (0); err: mutex_enter(&vdp->vdd_lock); vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; crfree(vdp->vdd_cr); vdp->vdd_cr = NULL; vdp->vdd_nsd = NULL; mutex_exit(&vdp->vdd_lock); /* * We have two holds to drop here. One for our original reference and * one for the hold this operation would have represented. 
vnd_nsd_rele(nsp); vnd_nsd_rele(nsp); errcopyout: if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0) ret = EFAULT; return (ret); } static int vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) { int ret = 0; vnd_ioc_link_t vil; char mname[2*VND_NAMELEN]; char **c; vnd_dev_t *v; zoneid_t zid; /* Not anyone can link something */ if (secpolicy_net_config(credp, B_FALSE) != 0) return (EPERM); if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0) return (EFAULT); if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) { ret = EIO; vil.vil_errno = VND_E_BADNAME; goto errcopyout; } c = vnd_reserved_names; while (*c != NULL) { if (strcmp(vil.vil_name, *c) == 0) { ret = EIO; vil.vil_errno = VND_E_BADNAME; goto errcopyout; } c++; } mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); ret = EIO; vil.vil_errno = VND_E_NOTATTACHED; goto errcopyout; } if (vdp->vdd_flags & VND_D_ZONE_DYING) { mutex_exit(&vdp->vdd_lock); ret = EIO; vil.vil_errno = VND_E_NOZONE; goto errcopyout; } if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) { mutex_exit(&vdp->vdd_lock); ret = EIO; vil.vil_errno = VND_E_LINKED; goto errcopyout; } vdp->vdd_flags |= VND_D_LINK_INFLIGHT; zid = vdp->vdd_nsd->vpnd_zid; mutex_exit(&vdp->vdd_lock); if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >= sizeof (mname)) { ret = EIO; vil.vil_errno = VND_E_BADNAME; goto errcopyout; } mutex_enter(&vnd_dev_lock); for (v = list_head(&vnd_dev_list); v != NULL; v = list_next(&vnd_dev_list, v)) { if (!(v->vdd_flags & VND_D_LINKED)) continue; if (v->vdd_nsd->vpnd_zid == zid && strcmp(v->vdd_lname, vil.vil_name) == 0) { mutex_exit(&vnd_dev_lock); ret = EIO; vil.vil_errno = VND_E_LINKEXISTS; goto error; } } /* * We set the name and mark ourselves attached while holding the list * lock to ensure that no other user can mistakenly find our name. */ (void) snprintf(mname, sizeof (mname), "z%d:%s", zid, vil.vil_name); mutex_enter(&vdp->vdd_lock); /* * Because we dropped our lock, we need to double check whether or not * the zone was marked as dying while we were here. If it hasn't, then * it's safe for us to link it in. */ if (vdp->vdd_flags & VND_D_ZONE_DYING) { mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); ret = EIO; vil.vil_errno = VND_E_NOZONE; goto error; } (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname)); if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor, DDI_PSEUDO, 0) != DDI_SUCCESS) { ret = EIO; vil.vil_errno = VND_E_MINORNODE; } else { vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; vdp->vdd_flags |= VND_D_LINKED; kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, vdp->vdd_lname); ret = 0; } mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); if (ret == 0) { /* * Add a reference to represent that this device is linked into * the file system name space to ensure that it doesn't * disappear. */ vnd_dev_ref(vdp); return (0); } error: mutex_enter(&vdp->vdd_lock); vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; vdp->vdd_lname[0] = '\0'; mutex_exit(&vdp->vdd_lock); errcopyout: if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0) return (EFAULT); return (ret); } /* * Common unlink function. This is used both from the ioctl path and from the * netstack shutdown path. The caller is required to hold the mutex on the * vnd_dev_t, but they basically will have it relinquished for them. The only * thing the caller is allowed to do afterward is to potentially rele the * vnd_dev_t if they have their own hold.
Note that only the ioctl path has its * own hold. */ static void vnd_dev_unlink(vnd_dev_t *vdp) { char mname[2*VND_NAMELEN]; ASSERT(MUTEX_HELD(&vdp->vdd_lock)); (void) snprintf(mname, sizeof (mname), "z%d:%s", vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname); ddi_remove_minor_node(vnd_dip, mname); vdp->vdd_lname[0] = '\0'; vdp->vdd_flags &= ~VND_D_LINKED; kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, vdp->vdd_lname); mutex_exit(&vdp->vdd_lock); /* * This rele corresponds to the reference that we took in * vnd_ioctl_link. */ vnd_dev_rele(vdp); } static int vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) { int ret; zoneid_t zid; vnd_ioc_unlink_t viu; /* Not anyone can unlink something */ if (secpolicy_net_config(credp, B_FALSE) != 0) return (EPERM); zid = crgetzoneid(credp); if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0) return (EFAULT); viu.viu_errno = VND_E_SUCCESS; mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_LINKED)) { mutex_exit(&vdp->vdd_lock); ret = EIO; viu.viu_errno = VND_E_NOTLINKED; goto err; } VERIFY(vdp->vdd_flags & VND_D_ATTACHED); if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { mutex_exit(&vdp->vdd_lock); ret = EIO; viu.viu_errno = VND_E_PERM; goto err; } /* vnd_dev_unlink releases the vdp mutex for us */ vnd_dev_unlink(vdp); ret = 0; err: if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0) return (EFAULT); return (ret); } static int vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { int ret; vnd_ioc_buf_t vib; if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) return (EFAULT); mutex_enter(&vnd_dev_lock); if (vib.vib_size > vnd_vdq_hard_max) { mutex_exit(&vnd_dev_lock); vib.vib_errno = VND_E_BUFTOOBIG; ret = EIO; goto err; } mutex_exit(&vnd_dev_lock); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_NOTATTACHED; ret = EIO; goto err; } mutex_enter(&vdp->vdd_str->vns_lock); if (vib.vib_size < vdp->vdd_str->vns_minwrite) { mutex_exit(&vdp->vdd_str->vns_lock); mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_BUFTOOSMALL; ret = EIO; goto err; } mutex_exit(&vdp->vdd_str->vns_lock); mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); vdp->vdd_str->vns_dq_read.vdq_max = (size_t)vib.vib_size; mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); mutex_exit(&vdp->vdd_lock); ret = 0; err: if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (ret); } static int vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { int ret; vnd_ioc_buf_t vib; mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_NOTATTACHED; ret = EIO; goto err; } mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max; mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); mutex_exit(&vdp->vdd_lock); ret = 0; err: if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (ret); } /* ARGSUSED */ static int vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { vnd_ioc_buf_t vib; mutex_enter(&vnd_dev_lock); vib.vib_size = vnd_vdq_hard_max; mutex_exit(&vnd_dev_lock); if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (0); } static int vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { int ret; vnd_ioc_buf_t vib; mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vib.vib_errno = 
VND_E_NOTATTACHED; ret = EIO; goto err; } mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max; mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); mutex_exit(&vdp->vdd_lock); ret = 0; err: if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (ret); } static int vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { int ret; vnd_ioc_buf_t vib; if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) return (EFAULT); mutex_enter(&vnd_dev_lock); if (vib.vib_size > vnd_vdq_hard_max) { mutex_exit(&vnd_dev_lock); vib.vib_errno = VND_E_BUFTOOBIG; ret = EIO; goto err; } mutex_exit(&vnd_dev_lock); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_NOTATTACHED; ret = EIO; goto err; } mutex_enter(&vdp->vdd_str->vns_lock); if (vib.vib_size < vdp->vdd_str->vns_minwrite) { mutex_exit(&vdp->vdd_str->vns_lock); mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_BUFTOOSMALL; ret = EIO; goto err; } mutex_exit(&vdp->vdd_str->vns_lock); mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); vdp->vdd_str->vns_dq_write.vdq_max = (size_t)vib.vib_size; mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); mutex_exit(&vdp->vdd_lock); ret = 0; err: if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (ret); } static int vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min) { vnd_ioc_buf_t vib; vib.vib_errno = 0; mutex_enter(&vdp->vdd_lock); if (vdp->vdd_flags & VND_D_ATTACHED) { mutex_enter(&vdp->vdd_str->vns_lock); if (min == B_TRUE) vib.vib_size = vdp->vdd_str->vns_minwrite; else vib.vib_size = vdp->vdd_str->vns_maxwrite; mutex_exit(&vdp->vdd_str->vns_lock); } else { vib.vib_errno = VND_E_NOTATTACHED; } mutex_exit(&vdp->vdd_lock); if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0) return (EFAULT); return (0); } static int vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode) { int ret, nonblock, nwrite; frameio_t *fio; vnd_data_queue_t *vqp; mblk_t *mp; fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); if (fio == NULL) return (EAGAIN); ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr, mode); if (ret != 0) { frameio_free(fio); return (ret); } mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); frameio_free(fio); return (ENXIO); } mutex_exit(&vdp->vdd_lock); nonblock = mode & (FNONBLOCK | FNDELAY); vqp = &vdp->vdd_str->vns_dq_read; mutex_enter(&vqp->vdq_lock); /* Check empty case */ if (vqp->vdq_cur == 0) { if (nonblock != 0) { mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (EWOULDBLOCK); } while (vqp->vdq_cur == 0) { if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (EINTR); } } } ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head, &nwrite, mode & FKIOCTL); if (ret != 0) { mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (ret); } ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode); if (ret != 0) { mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (ret); } while (nwrite > 0) { (void) vnd_dq_pop(vqp, &mp); freemsg(mp); nwrite--; } mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (0); } static int vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode) { frameio_t *fio; int ret, nonblock, nframes, i, nread; size_t maxwrite, minwrite, total, flen; mblk_t *mp_chain, *mp, *nmp; vnd_data_queue_t *vqp; fio = 
frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
	if (fio == NULL)
		return (EAGAIN);

	ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode);
	if (ret != 0) {
		frameio_free(fio);
		return (ret);
	}

	mutex_enter(&vdp->vdd_lock);
	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
		mutex_exit(&vdp->vdd_lock);
		frameio_free(fio);
		return (ENXIO);
	}
	mutex_exit(&vdp->vdd_lock);

	nonblock = mode & (FNONBLOCK | FNDELAY);

	/*
	 * Make sure no single frame is larger than we can accept.
	 */
	mutex_enter(&vdp->vdd_str->vns_lock);
	minwrite = vdp->vdd_str->vns_minwrite;
	maxwrite = vdp->vdd_str->vns_maxwrite;
	mutex_exit(&vdp->vdd_str->vns_lock);

	/* The total vector count divided by vectors-per-frame gives frames. */
	nframes = fio->fio_nvecs / fio->fio_nvpf;
	total = 0;
	for (i = 0; i < nframes; i++) {
		flen = frameio_frame_length(fio,
		    &fio->fio_vecs[i*fio->fio_nvpf]);
		if (flen < minwrite || flen > maxwrite) {
			frameio_free(fio);
			return (ERANGE);
		}
		total += flen;
	}

	vqp = &vdp->vdd_str->vns_dq_write;
	mutex_enter(&vqp->vdq_lock);
	while (vnd_dq_reserve(vqp, total) == 0) {
		if (nonblock != 0) {
			mutex_exit(&vqp->vdq_lock);
			frameio_free(fio);
			return (EAGAIN);
		}
		if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
			mutex_exit(&vqp->vdq_lock);
			frameio_free(fio);
			return (EINTR);
		}
	}
	mutex_exit(&vqp->vdq_lock);

	/*
	 * We've reserved our space, let's copyin and go from here.
	 */
	ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL);
	if (ret != 0) {
		frameio_free(fio);
		vnd_dq_unreserve(vqp, total);
		cv_broadcast(&vqp->vdq_ready);
		pollwakeup(&vdp->vdd_ph, POLLOUT);
		return (ret);
	}

	for (mp = mp_chain; mp != NULL; mp = nmp) {
		nmp = mp->b_next;
		mp->b_next = NULL;
		gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
		    vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
		    VND_SQUEUE_TAG_VND_WRITE);
	}

	/*
	 * Update the frameio structure to indicate that we wrote those frames.
	 */
	frameio_mark_consumed(fio, nread);
	ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode);
	frameio_free(fio);
	return (ret);
}

static int
vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode)
{
	const char *link;
	uint32_t vers = 1;

	ASSERT(MUTEX_HELD(&vdp->vdd_lock));

	/*
	 * Copy all of the members out to userland.
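	 *
	 * Note that this is done member by member with ddi_copyout() rather
	 * than as one bulk structure copy: arg points directly at the
	 * consumer's vnd_ioc_info_t, so we only ever store to the fields we
	 * fill in, and passing mode & FKIOCTL lets the same path serve both
	 * user and kernel callers.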
*/ if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t), mode & FKIOCTL) != 0) return (EFAULT); if (vdp->vdd_flags & VND_D_LINKED) link = vdp->vdd_lname; else link = ""; if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name), mode & FKIOCTL) != 0) return (EFAULT); if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink, sizeof (arg->vii_datalink), mode & FKIOCTL) != 0) return (EFAULT); if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone, sizeof (zoneid_t), mode & FKIOCTL) != 0) return (EFAULT); return (0); } static int vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode) { vnd_ioc_list_t vl; vnd_ioc_list32_t vl32; zoneid_t zid; vnd_dev_t *vdp; vnd_ioc_info_t *vip; int found, cancopy, ret; if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t), mode & FKIOCTL) != 0) return (EFAULT); vl.vl_nents = vl32.vl_nents; vl.vl_actents = vl32.vl_actents; vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents; } else { if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t), mode & FKIOCTL) != 0) return (EFAULT); } cancopy = vl.vl_nents; vip = vl.vl_ents; found = 0; zid = crgetzoneid(credp); mutex_enter(&vnd_dev_lock); for (vdp = list_head(&vnd_dev_list); vdp != NULL; vdp = list_next(&vnd_dev_list, vdp)) { mutex_enter(&vdp->vdd_lock); if (vdp->vdd_flags & VND_D_ATTACHED && !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) && (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) { found++; if (cancopy > 0) { ret = vnd_ioctl_list_copy_info(vdp, vip, mode); if (ret != 0) { mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); return (ret); } cancopy--; vip++; } } mutex_exit(&vdp->vdd_lock); } mutex_exit(&vnd_dev_lock); if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents, sizeof (uint_t), mode & FKIOCTL) != 0) return (EFAULT); return (0); } /* ARGSUSED */ static int vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { int ret; minor_t m; vnd_dev_t *vdp; m = getminor(dev); ASSERT(m != 0); /* * Make sure no one has come in on an ioctl from the strioc case. */ if ((cmd & VND_STRIOC) == VND_STRIOC) return (ENOTTY); /* * Like close, seems like if this minor isn't found, it's a programmer * error somehow. 
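	 *
	 * Each command below is additionally gated on the FREAD/FWRITE mode
	 * bits of the descriptor: ioctls that only observe state require
	 * FREAD, ioctls that change state require FWRITE, and anything else
	 * fails with EBADF.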
 */
	vdp = vnd_dev_lookup(m);
	if (vdp == NULL)
		return (ENXIO);

	switch (cmd) {
	case VND_IOC_ATTACH:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_attach(vdp, arg, credp, mode);
		break;
	case VND_IOC_LINK:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_link(vdp, arg, credp, mode);
		break;
	case VND_IOC_UNLINK:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_unlink(vdp, arg, credp, mode);
		break;
	case VND_IOC_GETRXBUF:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_getrxbuf(vdp, arg, mode);
		break;
	case VND_IOC_SETRXBUF:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_setrxbuf(vdp, arg, mode);
		break;
	case VND_IOC_GETTXBUF:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_gettxbuf(vdp, arg, mode);
		break;
	case VND_IOC_SETTXBUF:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_settxbuf(vdp, arg, mode);
		break;
	case VND_IOC_GETMAXBUF:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		if (crgetzoneid(credp) != GLOBAL_ZONEID) {
			ret = EPERM;
			break;
		}
		ret = vnd_ioctl_getmaxbuf(vdp, arg, mode);
		break;
	case VND_IOC_GETMINTU:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE);
		break;
	case VND_IOC_GETMAXTU:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE);
		break;
	case VND_IOC_FRAMEIO_READ:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_frameio_read(vdp, arg, mode);
		break;
	case VND_IOC_FRAMEIO_WRITE:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_frameio_write(vdp, arg, mode);
		break;
	case VND_IOC_LIST:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_list(arg, credp, mode);
		break;
	default:
		ret = ENOTTY;
		break;
	}

	vnd_dev_rele(vdp);
	return (ret);
}

static int
vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	vnd_dev_t *vdp;
	minor_t m;
	zoneid_t zid;

	if (flag & (FEXCL | FNDELAY))
		return (ENOTSUP);

	if (otyp & OTYP_BLK)
		return (ENOTSUP);

	zid = crgetzoneid(credp);
	m = getminor(*devp);

	/*
	 * If we have an open of a non-zero instance then we need to look that
	 * up in our list of entries.
	 */
	if (m != 0) {
		/*
		 * We don't check for rawaccess globally as a user could be
		 * doing a list ioctl on the control node which doesn't require
		 * this privilege.
		 */
		if (secpolicy_net_rawaccess(credp) != 0)
			return (EPERM);

		vdp = vnd_dev_lookup(m);
		if (vdp == NULL)
			return (ENOENT);

		/*
		 * We need to check to make sure that the user is allowed to
		 * open this node. At this point it should be an attached
		 * handle as that's all we're allowed to access.
		 */
		mutex_enter(&vdp->vdd_lock);
		if (!(vdp->vdd_flags & VND_D_LINKED)) {
			mutex_exit(&vdp->vdd_lock);
			vnd_dev_rele(vdp);
			return (ENOENT);
		}

		if (vdp->vdd_flags & VND_D_ZONE_DYING) {
			mutex_exit(&vdp->vdd_lock);
			vnd_dev_rele(vdp);
			return (ENOENT);
		}

		if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) {
			mutex_exit(&vdp->vdd_lock);
			vnd_dev_rele(vdp);
			return (ENOENT);
		}

		if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) {
			mutex_exit(&vdp->vdd_lock);
			vnd_dev_rele(vdp);
			return (EBUSY);
		}

		if (!(vdp->vdd_flags & VND_D_OPENED)) {
			vdp->vdd_flags |= VND_D_OPENED;
			vdp->vdd_ref++;
			DTRACE_VND_REFINC(vdp);
		}
		mutex_exit(&vdp->vdd_lock);
		vnd_dev_rele(vdp);

		return (0);
	}

	if (flag & FEXCL)
		return (ENOTSUP);

	/*
	 * We need to clone ourselves and set up a new state.
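	 *
	 * From a consumer's perspective, this is the traditional self-cloning
	 * pattern (a rough illustration, not part of the driver):
	 *
	 *	fd = open("/dev/vnd/ctl", O_RDWR);
	 *
	 * lands here with a minor number of zero. We allocate a fresh
	 * vnd_dev_t, assign it a new minor from the vnd_minors id space, and
	 * rewrite *devp so that the consumer's descriptor refers to the new
	 * instance.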
*/ vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP); bzero(vdp, sizeof (vnd_dev_t)); if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) { kmem_cache_free(vnd_dev_cache, vdp); return (EINVAL); } vdp->vdd_minor = id_alloc(vnd_minors); mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL); list_link_init(&vdp->vdd_link); vdp->vdd_ref = 1; *devp = makedevice(getmajor(*devp), vdp->vdd_minor); vdp->vdd_devid = *devp; DTRACE_VND_REFINC(vdp); vdp->vdd_flags |= VND_D_OPENED; mutex_enter(&vnd_dev_lock); list_insert_head(&vnd_dev_list, vdp); mutex_exit(&vnd_dev_lock); return (0); } /* ARGSUSED */ static int vnd_close(dev_t dev, int flag, int otyp, cred_t *credp) { minor_t m; vnd_dev_t *vdp; m = getminor(dev); if (m == 0) return (ENXIO); vdp = vnd_dev_lookup(m); if (vdp == NULL) return (ENXIO); mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_flags & VND_D_OPENED); vdp->vdd_flags &= ~VND_D_OPENED; mutex_exit(&vdp->vdd_lock); /* Remove the hold from the previous open. */ vnd_dev_rele(vdp); /* And now from lookup */ vnd_dev_rele(vdp); return (0); } /* ARGSUSED */ static int vnd_read(dev_t dev, struct uio *uiop, cred_t *credp) { int nonblock, error = 0; size_t mpsize; vnd_dev_t *vdp; vnd_data_queue_t *vqp; mblk_t *mp = NULL; offset_t u_loffset; /* * If we have more than one uio we refuse to do anything. That's for * frameio. */ if (uiop->uio_iovcnt > 1) return (EINVAL); vdp = vnd_dev_lookup(getminor(dev)); if (vdp == NULL) return (ENXIO); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vnd_dev_rele(vdp); return (ENXIO); } mutex_exit(&vdp->vdd_lock); nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); vqp = &vdp->vdd_str->vns_dq_read; mutex_enter(&vqp->vdq_lock); /* Check empty case */ if (vqp->vdq_cur == 0) { if (nonblock != 0) { error = EWOULDBLOCK; goto err; } while (vqp->vdq_cur == 0) { if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { error = EINTR; goto err; } } } /* Ensure our buffer is big enough */ mp = vqp->vdq_head; ASSERT(mp != NULL); mpsize = msgsize(mp); if (mpsize > uiop->uio_resid) { error = EOVERFLOW; goto err; } u_loffset = uiop->uio_loffset; while (mp != NULL) { if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) { error = EFAULT; uiop->uio_loffset = u_loffset; mp = NULL; goto err; } mpsize -= MBLKL(mp); mp = mp->b_cont; } ASSERT(mpsize == 0); (void) vnd_dq_pop(vqp, &mp); freemsg(mp); err: mutex_exit(&vqp->vdq_lock); vnd_dev_rele(vdp); return (error); } /* ARGSUSED */ static int vnd_write(dev_t dev, struct uio *uiop, cred_t *credp) { int nonblock, error; vnd_dev_t *vdp; mblk_t *mp; ssize_t iosize, origsize; vnd_data_queue_t *vqp; if (uiop->uio_iovcnt > 1) return (EINVAL); vdp = vnd_dev_lookup(getminor(dev)); if (vdp == NULL) return (ENXIO); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vnd_dev_rele(vdp); return (ENXIO); } mutex_exit(&vdp->vdd_lock); nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); mutex_enter(&vdp->vdd_str->vns_lock); if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite || uiop->uio_resid < vdp->vdd_str->vns_minwrite) { mutex_exit(&vdp->vdd_str->vns_lock); vnd_dev_rele(vdp); return (ERANGE); } mutex_exit(&vdp->vdd_str->vns_lock); VERIFY(vdp->vdd_str != NULL); /* * Reserve space in the data queue if we can. If we can't, block or * return EAGAIN. If we can, go and squeue_enter. 
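	 *
	 * In outline (a sketch of the flow below, not additional logic):
	 *
	 *	vnd_dq_reserve() fails
	 *	    -> non-blocking: return EAGAIN
	 *	    -> blocking: cv_wait_sig() and retry
	 *	vnd_dq_reserve() succeeds
	 *	    -> strmakedata() copies the data into an mblk chain
	 *	    -> gsqueue_enter_one() hands it to the transmit path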
*/ vqp = &vdp->vdd_str->vns_dq_write; mutex_enter(&vqp->vdq_lock); while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) { if (nonblock != 0) { mutex_exit(&vqp->vdq_lock); vnd_dev_rele(vdp); return (EAGAIN); } if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { mutex_exit(&vqp->vdq_lock); vnd_dev_rele(vdp); return (EINTR); } } mutex_exit(&vqp->vdq_lock); /* * Now that we've reserved the space, try to allocate kernel space for * and copy in the block. To take care of all this we use the * strmakedata subroutine for now. */ origsize = iosize = uiop->uio_resid; error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0, &mp); /* * strmakedata() will return an error or it may only consume a portion * of the data. */ if (error != 0 || uiop->uio_resid != 0) { vnd_dq_unreserve(vqp, origsize); cv_broadcast(&vqp->vdq_ready); pollwakeup(&vdp->vdd_ph, POLLOUT); vnd_dev_rele(vdp); return (ENOSR); } gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, VND_SQUEUE_TAG_VND_WRITE); vnd_dev_rele(vdp); return (0); } static int vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { short ready = 0; vnd_dev_t *vdp; vnd_data_queue_t *vqp; vdp = vnd_dev_lookup(getminor(dev)); if (vdp == NULL) return (ENXIO); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vnd_dev_rele(vdp); return (ENXIO); } mutex_exit(&vdp->vdd_lock); if ((events & POLLIN) || (events & POLLRDNORM)) { vqp = &vdp->vdd_str->vns_dq_read; mutex_enter(&vqp->vdq_lock); if (vqp->vdq_head != NULL) ready |= events & (POLLIN | POLLRDNORM); mutex_exit(&vqp->vdq_lock); } if (events & POLLOUT) { vqp = &vdp->vdd_str->vns_dq_write; mutex_enter(&vqp->vdq_lock); if (vqp->vdq_cur != vqp->vdq_max) ready |= POLLOUT; mutex_exit(&vqp->vdq_lock); } if ((ready == 0 && !anyyet) || (events & POLLET)) { *phpp = &vdp->vdd_ph; } *reventsp = ready; vnd_dev_rele(vdp); return (0); } /* ARGSUSED */ static void * vnd_stack_init(netstackid_t stackid, netstack_t *ns) { vnd_pnsd_t *nsp; nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP); bzero(nsp, sizeof (*nsp)); nsp->vpnd_nsid = stackid; nsp->vpnd_zid = netstackid_to_zoneid(stackid); nsp->vpnd_flags = 0; mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL); list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t), offsetof(vnd_dev_t, vdd_nslink)); if (vnd_netinfo_init(nsp) == 0) nsp->vpnd_hooked = B_TRUE; mutex_enter(&vnd_dev_lock); list_insert_tail(&vnd_nsd_list, nsp); mutex_exit(&vnd_dev_lock); return (nsp); } /* ARGSUSED */ static void vnd_stack_shutdown(netstackid_t stackid, void *arg) { vnd_pnsd_t *nsp = arg; vnd_dev_t *vdp; ASSERT(nsp != NULL); /* * After shut down no one should be able to find their way to this * netstack again. */ mutex_enter(&vnd_dev_lock); list_remove(&vnd_nsd_list, nsp); mutex_exit(&vnd_dev_lock); /* * Make sure hooks know that they're going away. */ if (nsp->vpnd_hooked == B_TRUE) vnd_netinfo_shutdown(nsp); /* * Now we need to go through and notify each zone that they are in * teardown phase. See the big theory statement section on vnd, zones, * netstacks, and sdev for more information about this. 
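	 *
	 * In rough outline, the teardown work is split across the two
	 * netstack callbacks:
	 *
	 *	shutdown (here): condemn the netstack, mark each device
	 *			 VND_D_ZONE_DYING, and forcibly unlink it
	 *	destroy:	 wait for vpnd_ref to drain to zero, then
	 *			 free the per-netstack state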
 */
	mutex_enter(&nsp->vpnd_lock);
	nsp->vpnd_flags |= VND_NS_CONDEMNED;
	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
		mutex_enter(&vdp->vdd_lock);
		if (!(vdp->vdd_flags & VND_D_CONDEMNED))
			vdp->vdd_flags |= VND_D_ZONE_DYING;
		mutex_exit(&vdp->vdd_lock);
	}
	mutex_exit(&nsp->vpnd_lock);

	/*
	 * Next we remove all the links as we know nothing new can be added to
	 * the list and that none of the extant devices can obtain additional
	 * links.
	 */
restart:
	mutex_enter(&nsp->vpnd_lock);
	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
		mutex_enter(&vdp->vdd_lock);
		if ((vdp->vdd_flags & VND_D_CONDEMNED) ||
		    !(vdp->vdd_flags & VND_D_LINKED)) {
			mutex_exit(&vdp->vdd_lock);
			continue;
		}

		/*
		 * We drop our lock here and restart afterwards. Note that as
		 * part of unlinking we end up doing a rele of the vnd_dev_t.
		 * If this is the final hold on the vnd_dev_t then it might try
		 * and remove itself. Our locking rules require that we not
		 * hold any locks when we call any of the rele functions.
		 *
		 * Note that the unlink function requires holders to call into
		 * it with the vnd_dev_t->vdd_lock held and will take care of
		 * it for us. Because we don't have a hold on it, we're done at
		 * this point.
		 */
		mutex_exit(&nsp->vpnd_lock);
		/* Forcibly unlink */
		vnd_dev_unlink(vdp);
		goto restart;
	}
	mutex_exit(&nsp->vpnd_lock);
}

/* ARGSUSED */
static void
vnd_stack_destroy(netstackid_t stackid, void *arg)
{
	vnd_pnsd_t *nsp = arg;

	ASSERT(nsp != NULL);

	/*
	 * Now that we've unlinked everything, we just have to wait for things
	 * to finish exiting. Since it's no longer the kernel itself holding
	 * things open, we only need to wait for our reference count to reach
	 * zero, and then we're free. If the global zone is holding open a
	 * reference to a vnd device for another zone, that's bad, but there's
	 * nothing much we can do. See the section on 'vnd, zones, netstacks'
	 * in the big theory statement for more information.
	 */
	mutex_enter(&nsp->vpnd_lock);
	while (nsp->vpnd_ref != 0)
		cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock);
	mutex_exit(&nsp->vpnd_lock);

	/*
	 * We removed ourselves from the list during shutdown, and now that we
	 * have no more references we can safely destroy everything that we
	 * had sitting around.
	 */
	if (nsp->vpnd_hooked == B_TRUE)
		vnd_netinfo_fini(nsp);

	mutex_destroy(&nsp->vpnd_lock);
	list_destroy(&nsp->vpnd_dev_list);
	kmem_cache_free(vnd_pnsd_cache, nsp);
}

/*
 * Convert a node with a name of the form /dev/vnd/zone/%zonename or
 * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack.
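 *
 * For example (with a hypothetical zone "testzone" and link "net0"):
 *
 *	/dev/vnd/zone/testzone		(VDIR) -> look up "testzone"
 *	/dev/vnd/zone/testzone/net0	(VCHR) -> strip "/net0" first, then
 *					look up "testzone"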
 */
static vnd_pnsd_t *
vnd_sdev_ctx_to_ns(sdev_ctx_t ctx)
{
	enum vtype vt;
	const char *path = sdev_ctx_path(ctx);
	char *zstart, *dup;
	size_t duplen;
	vnd_pnsd_t *nsp;

	vt = sdev_ctx_vtype(ctx);
	ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0);

	if (vt == VDIR) {
		zstart = strrchr(path, '/');
		ASSERT(zstart != NULL);
		zstart++;
		return (vnd_nsd_lookup_by_zonename(zstart));
	}

	ASSERT(vt == VCHR);

	dup = strdup(path);
	duplen = strlen(dup) + 1;
	zstart = strrchr(dup, '/');
	*zstart = '\0';
	zstart--;
	zstart = strrchr(dup, '/');
	zstart++;
	nsp = vnd_nsd_lookup_by_zonename(zstart);
	kmem_free(dup, duplen);

	return (nsp);
}

static sdev_plugin_validate_t
vnd_sdev_validate_dir(sdev_ctx_t ctx)
{
	vnd_pnsd_t *nsp;

	if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0)
		return (SDEV_VTOR_VALID);

	if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) {
		ASSERT(getzoneid() == GLOBAL_ZONEID);
		ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
		return (SDEV_VTOR_VALID);
	}

	nsp = vnd_sdev_ctx_to_ns(ctx);
	if (nsp == NULL)
		return (SDEV_VTOR_INVALID);
	vnd_nsd_rele(nsp);

	return (SDEV_VTOR_VALID);
}

static sdev_plugin_validate_t
vnd_sdev_validate(sdev_ctx_t ctx)
{
	enum vtype vt;
	vnd_dev_t *vdp;
	minor_t minor;

	vt = sdev_ctx_vtype(ctx);
	if (vt == VDIR)
		return (vnd_sdev_validate_dir(ctx));
	ASSERT(vt == VCHR);

	if (strcmp("ctl", sdev_ctx_name(ctx)) == 0)
		return (SDEV_VTOR_VALID);

	if (sdev_ctx_minor(ctx, &minor) != 0)
		return (SDEV_VTOR_STALE);

	vdp = vnd_dev_lookup(minor);
	if (vdp == NULL)
		return (SDEV_VTOR_STALE);

	mutex_enter(&vdp->vdd_lock);
	if (!(vdp->vdd_flags & VND_D_LINKED) ||
	    (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
		mutex_exit(&vdp->vdd_lock);
		vnd_dev_rele(vdp);
		return (SDEV_VTOR_STALE);
	}

	if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) {
		mutex_exit(&vdp->vdd_lock);
		vnd_dev_rele(vdp);
		return (SDEV_VTOR_STALE);
	}

	mutex_exit(&vdp->vdd_lock);
	vnd_dev_rele(vdp);
	return (SDEV_VTOR_VALID);
}

/*
 * This function is a no-op. sdev never has holds on our devices as they can go
 * away at any time and specfs has to deal with that fact.
 */
/* ARGSUSED */
static void
vnd_sdev_inactive(sdev_ctx_t ctx)
{
}

static int
vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx)
{
	int ret;
	vnd_dev_t *vdp;

	mutex_enter(&nsp->vpnd_lock);
	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
		mutex_enter(&vdp->vdd_lock);
		if ((vdp->vdd_flags & VND_D_LINKED) &&
		    !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
			ret = sdev_plugin_mknod(ctx, vdp->vdd_lname,
			    VND_SDEV_MODE, vdp->vdd_devid);
			if (ret != 0 && ret != EEXIST) {
				mutex_exit(&vdp->vdd_lock);
				mutex_exit(&nsp->vpnd_lock);
				vnd_nsd_rele(nsp);
				return (ret);
			}
		}
		mutex_exit(&vdp->vdd_lock);
	}
	mutex_exit(&nsp->vpnd_lock);

	return (0);
}

static int
vnd_sdev_filldir_root(sdev_ctx_t ctx)
{
	zoneid_t zid;
	vnd_pnsd_t *nsp;
	int ret;

	zid = getzoneid();
	nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid));
	ASSERT(nsp != NULL);
	ret = vnd_sdev_fillzone(nsp, ctx);
	vnd_nsd_rele(nsp);
	if (ret != 0)
		return (ret);

	/*
	 * Checking the zone id is not sufficient as the global zone could be
	 * reaching down into a non-global zone's mounted /dev.
	 */
	if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) {
		ret = sdev_plugin_mkdir(ctx, "zone");
		if (ret != 0 && ret != EEXIST)
			return (ret);
	}

	/*
	 * Always add an entry for the control node. There's no need to take a
	 * hold on it since it always exists and is always what we clone from.
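	 *
	 * Minor number 0 is reserved for the control node itself, which is
	 * why it is created with makedevice(..., 0) below and why vnd_open()
	 * treats an open of minor zero as a request to clone a new instance.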
*/ ret = sdev_plugin_mknod(ctx, "ctl", VND_SDEV_MODE, makedevice(ddi_driver_major(vnd_dip), 0)); if (ret != 0 && ret != EEXIST) return (ret); return (0); } static int vnd_sdev_filldir_zroot(sdev_ctx_t ctx) { int ret; vnd_pnsd_t *nsp; zone_t *zonep; ASSERT(getzoneid() == GLOBAL_ZONEID); ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); mutex_enter(&vnd_dev_lock); for (nsp = list_head(&vnd_nsd_list); nsp != NULL; nsp = list_next(&vnd_nsd_list, nsp)) { mutex_enter(&nsp->vpnd_lock); if (list_is_empty(&nsp->vpnd_dev_list)) { mutex_exit(&nsp->vpnd_lock); continue; } mutex_exit(&nsp->vpnd_lock); zonep = zone_find_by_id(nsp->vpnd_zid); /* * This zone must be being torn down, so skip it. */ if (zonep == NULL) continue; ret = sdev_plugin_mkdir(ctx, zonep->zone_name); zone_rele(zonep); if (ret != 0 && ret != EEXIST) { mutex_exit(&vnd_dev_lock); return (ret); } } mutex_exit(&vnd_dev_lock); return (0); } static int vnd_sdev_filldir(sdev_ctx_t ctx) { int ret; vnd_pnsd_t *nsp; ASSERT(sdev_ctx_vtype(ctx) == VDIR); if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0) return (vnd_sdev_filldir_root(ctx)); if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0) return (vnd_sdev_filldir_zroot(ctx)); ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx), strlen(VND_SDEV_ZROOT)) == 0); nsp = vnd_sdev_ctx_to_ns(ctx); if (nsp == NULL) return (0); ret = vnd_sdev_fillzone(nsp, ctx); vnd_nsd_rele(nsp); return (ret); } static sdev_plugin_ops_t vnd_sdev_ops = { SDEV_PLUGIN_VERSION, SDEV_PLUGIN_SUBDIR, vnd_sdev_validate, vnd_sdev_filldir, vnd_sdev_inactive }; static int vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { int errp = 0; if (cmd != DDI_ATTACH) return (DDI_FAILURE); /* * Only allow one instance. */ if (vnd_dip != NULL) return (DDI_FAILURE); vnd_dip = dip; if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { vnd_dip = NULL; return (DDI_FAILURE); } if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { ddi_remove_minor_node(vnd_dip, NULL); vnd_dip = NULL; return (DDI_FAILURE); } vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops, &errp); if (vnd_sdev_hdl == (sdev_plugin_hdl_t)NULL) { ddi_remove_minor_node(vnd_dip, NULL); ddi_prop_remove_all(vnd_dip); vnd_dip = NULL; return (DDI_FAILURE); } vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY); return (DDI_SUCCESS); } /* ARGSUSED */ static int vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { if (cmd != DDI_DETACH) return (DDI_FAILURE); mutex_enter(&vnd_dev_lock); if (!list_is_empty(&vnd_dev_list)) { mutex_exit(&vnd_dev_lock); return (DDI_FAILURE); } mutex_exit(&vnd_dev_lock); return (DDI_FAILURE); } /* ARGSUSED */ static int vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) { int error; switch (cmd) { case DDI_INFO_DEVT2DEVINFO: *result = (void *)vnd_dip; error = DDI_SUCCESS; break; case DDI_INFO_DEVT2INSTANCE: *result = (void *)0; error = DDI_SUCCESS; break; default: error = DDI_FAILURE; break; } return (error); } static void vnd_ddi_fini(void) { netstack_unregister(NS_VND); if (vnd_taskq != NULL) taskq_destroy(vnd_taskq); if (vnd_str_cache != NULL) kmem_cache_destroy(vnd_str_cache); if (vnd_dev_cache != NULL) kmem_cache_destroy(vnd_dev_cache); if (vnd_pnsd_cache != NULL) kmem_cache_destroy(vnd_pnsd_cache); if (vnd_minors != NULL) id_space_destroy(vnd_minors); if (vnd_list_init != 0) { list_destroy(&vnd_nsd_list); list_destroy(&vnd_dev_list); mutex_destroy(&vnd_dev_lock); vnd_list_init = 0; } frameio_fini(); } static int vnd_ddi_init(void) { 
	if (frameio_init() != 0)
		return (DDI_FAILURE);

	vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	if (vnd_str_cache == NULL) {
		frameio_fini();
		return (DDI_FAILURE);
	}

	vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	if (vnd_dev_cache == NULL) {
		kmem_cache_destroy(vnd_str_cache);
		frameio_fini();
		return (DDI_FAILURE);
	}

	vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache",
	    sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (vnd_pnsd_cache == NULL) {
		kmem_cache_destroy(vnd_dev_cache);
		kmem_cache_destroy(vnd_str_cache);
		frameio_fini();
		return (DDI_FAILURE);
	}

	vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0);
	if (vnd_taskq == NULL) {
		kmem_cache_destroy(vnd_pnsd_cache);
		kmem_cache_destroy(vnd_dev_cache);
		kmem_cache_destroy(vnd_str_cache);
		frameio_fini();
		return (DDI_FAILURE);
	}

	vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX);
	if (vnd_minors == NULL) {
		taskq_destroy(vnd_taskq);
		kmem_cache_destroy(vnd_pnsd_cache);
		kmem_cache_destroy(vnd_dev_cache);
		kmem_cache_destroy(vnd_str_cache);
		frameio_fini();
		return (DDI_FAILURE);
	}

	mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL);
	list_create(&vnd_dev_list, sizeof (vnd_dev_t),
	    offsetof(vnd_dev_t, vdd_link));
	list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t),
	    offsetof(vnd_pnsd_t, vpnd_link));
	vnd_list_init = 1;

	netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown,
	    vnd_stack_destroy);

	return (DDI_SUCCESS);
}

static struct module_info vnd_minfo = {
	0,		/* module id */
	"vnd",		/* module name */
	1,		/* smallest packet size */
	INFPSZ,		/* largest packet size (infinite) */
	1,		/* high watermark */
	0		/* low watermark */
};

static struct qinit vnd_r_qinit = {
	vnd_s_rput,
	NULL,
	vnd_s_open,
	vnd_s_close,
	NULL,
	&vnd_minfo,
	NULL
};

static struct qinit vnd_w_qinit = {
	vnd_s_wput,
	NULL,
	NULL,
	NULL,
	NULL,
	&vnd_minfo,
	NULL
};

static struct streamtab vnd_strtab = {
	&vnd_r_qinit,
	&vnd_w_qinit,
	NULL,
	NULL
};

static struct cb_ops vnd_cb_ops = {
	vnd_open,		/* open */
	vnd_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	vnd_read,		/* read */
	vnd_write,		/* write */
	vnd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	vnd_chpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* streamtab */
	D_MP			/* Driver compatibility flag */
};

static struct dev_ops vnd_dev_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	vnd_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	vnd_attach,		/* attach */
	vnd_detach,		/* detach */
	nodev,			/* reset */
	&vnd_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed	/* quiesce */
};

static struct modldrv vnd_modldrv = {
	&mod_driverops,
	"Virtual Networking Datapath Driver",
	&vnd_dev_ops
};

static struct fmodsw vnd_fmodfsw = {
	"vnd",
	&vnd_strtab,
	D_NEW | D_MP
};

static struct modlstrmod vnd_modlstrmod = {
	&mod_strmodops,
	"Virtual Networking Datapath Driver",
	&vnd_fmodfsw
};

static struct modlinkage vnd_modlinkage = {
	MODREV_1,
	&vnd_modldrv,
	&vnd_modlstrmod,
	NULL
};

int
_init(void)
{
	int error;

	/*
	 * We need to do all of our global initialization in _init as opposed
	 * to attach and detach. The problem here is that because vnd can be
	 * used from a stream context while being detached, we cannot rely on
	 * having run attach to create everything, alas. So it goes in _init,
	 * just like our friend ip.
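	 *
	 * The resulting pairing is simply:
	 *
	 *	_init:	vnd_ddi_init(), then mod_install(), undoing the
	 *		former if the latter fails
	 *	_fini:	mod_remove(), then vnd_ddi_fini() only once the
	 *		module is known to be unreferenced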
*/ if ((error = vnd_ddi_init()) != DDI_SUCCESS) return (error); error = mod_install((&vnd_modlinkage)); if (error != 0) vnd_ddi_fini(); return (error); } int _info(struct modinfo *modinfop) { return (mod_info(&vnd_modlinkage, modinfop)); } int _fini(void) { int error; error = mod_remove(&vnd_modlinkage); if (error == 0) vnd_ddi_fini(); return (error); }