/* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. */ /* * Copyright 2019 Joyent, Inc. */ /* * vnd - virtual (machine) networking datapath * * vnd's purpose is to provide a highly performant data path for Layer 2 network * traffic and exist side by side with an active IP netstack, each servicing * different datalinks. vnd provides many of the same capabilities as the * current TCP/IP stack does and some specific to layer two. Specifically: * * o Use of the DLD fastpath * o Packet capture hooks * o Ability to use hardware capabilities * o Useful interfaces for handling multiple frames * * The following image shows where vnd fits into today's networking stack: * * +---------+----------+----------+ * | libdlpi | libvnd | libsocket| * +---------+----------+----------+ * | · · VFS | * | VFS · VFS +----------+ * | · | sockfs | * +---------+----------+----------+ * | | VND | IP | * | +----------+----------+ * | DLD/DLS | * +-------------------------------+ * | MAC | * +-------------------------------+ * | GLDv3 | * +-------------------------------+ * * ----------------------------------------- * A Tale of Two Devices - DDI Device Basics * ----------------------------------------- * * vnd presents itself to userland as a character device; however, it also is a * STREAMS device so that it can interface with dld and the rest of the * networking stack. Users never interface with the STREAMS devices directly; * they are purely an implementation detail of vnd. Opening the STREAMS device * requires kcred and as such userland cannot interact with it or push it onto * the stream head. * * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every * clone gets its own minor number; however, minor nodes are not created in the * devices tree for these instances. In this state a user may do two different * things. They may issue ioctls that affect global state or they may issue * ioctls that try to attach it to a given datalink. Once a minor device has * been attached to a datalink, all operations on it are scoped to that context; * therefore, subsequent global operations are not permitted. * * A given device can be linked into the /devices and /dev name space via a link * ioctl. That ioctl causes a minor node to be created in /devices and then it * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar * to, but simpler than, IP's persistence mechanism. * * --------------------- * Binding to a datalink * --------------------- * * Datalinks are backed by the dld (datalink device) and dls (datalink services) * drivers. These drivers provide a STREAMS device for datalinks on the system * which are exposed through /dev/net. Userland generally manipulates datalinks * through libdlpi. When an IP interface is being plumbed up, what actually * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may * then negotiate with dld and dls to obtain access to various capabilities * and fast paths via a series of STREAMS messages. * * In vnd, we do the same thing, but we leave our STREAMS module as an * implementation detail of the system.
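 *
 * As an illustration, the conventional sequence just described, as IP
 * performs it from userland, might look roughly like the following sketch
 * (error handling elided; "net0" is a hypothetical link name):
 *
 *	dlpi_handle_t dh;
 *
 *	if (dlpi_open("net0", &dh, 0) != DLPI_SUCCESS)
 *		return (-1);
 *	if (ioctl(dlpi_fd(dh), I_PUSH, "ip") == -1)
 *		return (-1);
 *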
We don't want users to be able to * arbitrarily push the vnd STREAMS module onto any stream, so we explicitly require * kcred to manipulate it. Thus, when a user issues a request to attach a * datalink to a minor instance of the character device, that vnd minor instance * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink. * vnd does that open using the passed in credentials from the ioctl, not kcred. * This ensures that users who don't have permission to open the device * cannot. Once that's been opened, we push on the vnd STREAMS module. * * Once the vnd STREAMS instance has been created for this device, ie. the * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices. * This association begins the STREAMS device's initialization. We start up an * asynchronous state machine that takes care of all the different aspects of * plumbing up the device with dld and dls and enabling the MAC fast path. We * need to guarantee to consumers of the character device that by the time their * ioctl returns, the data path has been fully initialized. * * The state progression is fairly linear. There are two general steady states. * The first is VNS_S_ONLINE, which means that everything is jacked up and good * to go. The alternative is VNS_S_ZOMBIE, which means that the STREAMS device * encountered an error or we have finished tearing it down and the character * device can clean it up. The following is our state progression and the * meaning of each state: * * | * | * V * +---------------+ * | VNS_S_INITIAL | This is our initial state. Every * +---------------+ vnd STREAMS device starts here. * | While in this state, only dlpi * | M_PROTO and M_IOCTL messages can be * | sent or received. All STREAMS based * | data messages are dropped. * | We transition out of this state by * | sending a DL_INFO_REQ to obtain * | information about the underlying * | link. * v * +-----------------+ * +--<-| VNS_S_INFO_SENT | In this state, we verify and * | +-----------------+ record information about the * | | underlying device. If the device is * | | not suitable, eg. not of type * v | DL_ETHER, then we immediately * | | become a ZOMBIE. To leave this * | | state we request exclusive active * | | access to the device via * v | DL_EXCLUSIVE_REQ. * | v * | +----------------------+ * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether * | +----------------------+ or not we were able to obtain * | | | exclusive access to the device. If * | | | we were not able to, then we leave, * v | | as that means that something like * | | | IP is already plumbed up on top of * | | | the datalink. We leave this state * | | | by progressing through to the * | | | appropriate DLPI primitive, either * v | | DL_ATTACH_REQ or DL_BIND_REQ * | | | depending on the style of the * | | | datalink. * | | v * | | +-------------------+ * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were * | | +-------------------+ able to perform a standard DLPI * | | | attach and if so, go ahead and * v | | send a DL_BIND_REQ. * | v v * | +-------------------+ * +--<-| VNS_S_BIND_SENT | In this state we see the result of * | +-------------------+ our attempt to bind to PPA 0 of the * v | underlying device. Because we're * | | trying to be a layer two datapath, * | | the specific attachment point isn't * | | too important as we're going to * v | have to enable promiscuous mode.
We * | | transition out of this by sending * | | the first of our four promiscuous mode * | | requests. * v v * | +------------------------+ * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we * | +------------------------+ were able to enable promiscuous * | | mode at the physical level. We * | | transition out of this by enabling * | | multicast and broadcast promiscuous * v | mode. * | v * | +--------------------------+ * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we * | +--------------------------+ have enabled DL_PROMISC_MULTI and * v | move on to the next promiscuous * | | mode request. * | v * | +----------------------------+ * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we * | +----------------------------+ enabled RX_ONLY promiscuous mode. * | | We specifically do this as we don't * v | want to receive our own traffic * | | that we'll send out. We leave this * | | state by enabling the final flag * | | DL_PROMISC_FIXUPS. * | v * | +--------------------------+ * +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we * | +--------------------------+ enabled FIXUP promiscuous mode. * | | We specifically do this as we need * v | to ensure that traffic which is * | | received by being looped back to us * | | correctly has checksums fixed. We * | | leave this state by requesting the * | | dld/dls capabilities that we can * v | process. * | v * | +--------------------+ * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of * | +--------------------+ capabilities that dld advertised * | | and enable the ones that we currently * v | support for use. See the section * | | later on regarding capabilities * | | for more information. We leave this * | | state by sending an enable request. * v v * | +--------------------+ * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability * | +--------------------+ initialization. Once finished, we * | | transition to the next state. If * v | the dld fast path is not available, * | | we become a zombie. * | v * | +--------------+ * | | VNS_S_ONLINE | This is a vnd STREAMS device's * | +--------------+ steady state. It will normally * | | reside in this state while it is in * | | active use. It will only transition * v | to the next state when the STREAMS * | | device is closed by the character * | | device. In this state, all data * | | flows over the dld fast path. * | v * | +---------------------+ * +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of * | +---------------------+ disabling capabilities and * | | flushing all data. At this point * | | any additional data that we receive * | | will be dropped. We leave this * v | state by trying to remove multicast * | | promiscuity. * | | * | v * | +---------------------------------+ * +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have * | +---------------------------------+ successfully removed multicast * | | promiscuous mode. If we have * | | failed, we still carry on but only * | | warn. We leave this state by trying * | | to disable SAP level promiscuous * | | mode. * | v * | +---------------------------+ * +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have * | +---------------------------+ successfully removed SAP level * | | promiscuous mode. If we have * | | failed, we still carry on but only * | | warn. Note that we don't worry * | | about either of * | | DL_PROMISC_FIXUPS or * | | DL_PROMISC_RX_ONLY.
If these are * | | the only two entries left, then we * | | shouldn't have anything that MAC is * | | doing for us at this point, * | | therefore it's safe for us to * | | proceed to unbind, which is how we * | | leave this state via a * | v DL_UNBIND_REQ. * | +-------------------+ * +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind * | +-------------------+ request went. Regardless of its * | | success, we always transition to * | | a zombie state. * | v * | +--------------+ * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS * +--------------+ device is waiting to finish being * reaped. Because we have no more * ways to receive data it should be * safe to destroy all remaining data * structures. * * If the stream association fails for any reason, the state machine reaches * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the * STREAMS ioctl to the character device. That will fail the user ioctl and * propagate the vnd_errno_t back to userland. If, on the other hand, the * association succeeds, then the vnd STREAMS device will be fully plumbed up * and ready to transmit and receive message blocks. Consumers will be able to * start using the other cbops(9E) entry points once the attach has fully * finished, which will occur before the original user attach ioctl to the * character device returns. * * It's quite important that we end up sending the full series of STREAMS * messages when tearing down. While it's tempting to say that we should just * rely on the STREAMS device being closed to properly ensure that we have no * more additional data, that's not sufficient due to our use of direct * callbacks. DLS does not ensure that, by the time we change the direct * callback (vnd_mac_input), all callers to it will have been quiesced. * However, it does guarantee that if we disable promiscuous mode ourselves and * we turn off the main data path via DL_UNBIND_REQ that it will work. * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do * it as part of tearing down the STREAMS device. This ensures that we'll * quiesce all data before we destroy our data structures and thus we should * eliminate the race in changing the data function. * * -------------------- * General Architecture * -------------------- * * There are several different devices and structures in the vnd driver. There * is a per-netstack component, pieces related to the character device that * consumers see, the internal STREAMS device state, and the data queues * themselves. The following ASCII art picture describes their relationships and * some of the major pieces of data that contain them. These are not exhaustive, * e.g. synchronization primitives are left out. * * +----------------+ +-----------------+ * | global | | global | * | device list | | netstack list | * | vnd_dev_list | | vnd_nsd_list | * +----------------+ +-----------------+ * | | * | v * | +-------------------+ +-------------------+ * | | per-netstack data | ---> | per-netstack data | --> ... * | | vnd_pnsd_t | | vnd_pnsd_t | * | | | +-------------------+ * | | | * | | netstackid_t ---+----> Netstack ID * | | vnd_pnsd_flags_t -+----> Status flags * | | zoneid_t ---+----> Zone ID for this netstack * | | hook_family_t ---+----> VND IPv4 Hooks * | | hook_family_t ---+----> VND IPv6 Hooks * | | list_t ----+ | * | +------------+------+ * | | * | v * | +------------------+ +------------------+ * | | character device | ---> | character device | -> ...
* +---------->| vnd_dev_t | | vnd_dev_t | * | | +------------------+ * | | * | minor_t ---+--> device minor number * | ldi_handle_t ---+--> handle to /dev/net/%datalink * | vnd_dev_flags_t -+--> device flags, non-blocking, etc. * | char[] ---+--> name if linked * | vnd_str_t * -+ | * +--------------+---+ * | * v * +-------------------------+ * | STREAMS device | * | vnd_str_t | * | | * | vnd_str_state_t ---+---> State machine state * | gsqueue_t * ---+---> mblk_t Serialization queue * | vnd_str_stat_t ---+---> per-device kstats * | vnd_str_capab_t ---+----------------------------+ * | vnd_data_queue_t ---+ | | * | vnd_data_queue_t -+ | | v * +-------------------+-+---+ +---------------------+ * | | | Stream capabilities | * | | | vnd_str_capab_t | * | | | | * | | supported caps <--+-- vnd_capab_flags_t | * | | dld cap handle <--+-- void * | * | | direct tx func <--+-- vnd_dld_tx_t | * | | +---------------------+ * | | * +----------------+ +-------------+ * | | * v v * +-------------------+ +-------------------+ * | Read data queue | | Write data queue | * | vnd_data_queue_t | | vnd_data_queue_t | * | | | | * | size_t ----+--> Current size | size_t ----+--> Current size * | size_t ----+--> Max size | size_t ----+--> Max size * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail * +-------------------+ +-------------------+ * * * Globally, we maintain two lists. One list contains all of the character * device soft states. The other maintains a list of all our netstack soft * states. Each netstack maintains a list of active devices that have been * associated with a datalink in its netstack. * * Recall that a given minor instance of the character device exists in one of * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node, * or it can be associated with a given datalink. When minor instances are in * the former state, they do not exist in a given vnd_pnsd_t's list of devices. * As part of attaching to a datalink, the given vnd_dev_t will be inserted into * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a * vnd_str_t, to be created and associated with a vnd_dev_t. * * The character device, and its vnd_dev_t, is the interface to the rest of the * system. The vnd_dev_t keeps track of various aspects like whether various * operations, such as read, write and the frameio ioctls, are considered * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for * keeping track of things like the name of the device, if any, in /dev. The * vnd_str_t, on the other hand, manages aspects like buffer sizes and the actual * data queues. However, ioctls that manipulate these properties all go through * the vnd_dev_t to its associated vnd_str_t. * * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One * for frames to transmit (write queue) and one for frames received (read * queue). These data queues have a maximum size and attempting to add data * beyond that maximum size will result in data being dropped. The sizes are * configurable via the VND_IOC_SETTXBUF and VND_IOC_SETRXBUF ioctls. Data either sits * in those buffers or has a reservation in those buffers while they are in vnd * and waiting to be consumed by the user or by mac. * * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the * available, negotiated, and currently active features.
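 *
 * To make the containment concrete, a hypothetical consumer of these
 * structures might walk from a minor number down to the data queues along
 * the following lines (a sketch only; locking and error handling elided):
 *
 *	vnd_dev_t *vdp = vnd_dev_lookup(minor);
 *	vnd_str_t *vsp = vdp->vdd_str;
 *	vnd_data_queue_t *rqp = &vsp->vns_dq_read;
 *	vnd_data_queue_t *wqp = &vsp->vns_dq_write;
 *	vnd_dev_rele(vdp);
 *
 * vnd_dev_lookup() returns with a hold on the vnd_dev_t, which the final
 * vnd_dev_rele() drops; vdd_str is only valid while the device remains
 * attached.
 *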
* * ---------------------- * Data Path and gsqueues * ---------------------- * * There's a lot of plumbing in vnd to get to the point where we can send data, * but vnd's bread and butter is the data path, so it's worth diving into it in * more detail. Data enters and exits the system from two ends. * * The first end is the vnd consumer. This comes in the form of read and write * system calls as well as the frame I/O ioctls. The read and write system calls * operate on a single frame at a time. Think of a frame as a single message * that has come in off the wire, which may itself comprise multiple mblk_t's * linked together in the kernel. readv(2) and writev(2) have the same * limitations as read(2) and write(2). We enforce this as the system is * required to fill up every uio(9S) buffer before moving onto the next one. * This means that if you have an MTU sized buffer and two frames come in which * are less than half of the MTU, they must fill up the given iovec. Even if we * didn't want to do this, we have no way of informing the supplier of the * iovecs that they were only partially filled or where one frame ends and * another begins. That's life; as such, we have frame I/O, which solves this * problem. It allows for multiple frames to be consumed as well as for frames * to be broken down into multiple vector components. * * The second end is the mac direct calls. As part of negotiating capabilities * via dld, we give mac a function of ours to call when packets are received * [vnd_mac_input()] and a callback to indicate that flow has been restored * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can * transmit data with. As part of the contract with mac, mac is allowed to flow * control us by returning a cookie to the transmit function. When that happens, * all outbound traffic is halted until our callback function is called and we * can schedule drains. * * It's worth looking at these in further detail. We'll start with the rx path. * * * | * * . . . packets from gld * | * v * +-------------+ * | mac | * +-------------+ * | * v * +-------------+ * | dld | * +-------------+ * | * * . . . dld direct callback * | * v * +---------------+ * | vnd_mac_input | * +---------------+ * | * v * +---------+ +-------------+ * | dropped |<--*---------| vnd_hooks | * | by | . +-------------+ * | hooks | . drop probe | * +---------+ kstat bump * . . . Do we have free * | buffer space? * | * no . | . yes * . + . * +---*--+------*-------+ * | | * * . . drop probe * . . recv probe * | kstat bump | kstat bump * v | * +---------+ * . . fire pollin * | freemsg | v * +---------+ +-----------------------+ * | vnd_str_t`vns_dq_read | * +-----------------------+ * ^ ^ * +----------+ | | +---------+ * | read(9E) |-->-+ +--<--| frameio | * +----------+ +---------+ * * The rx path is rather linear. Packets come into us from mac. We always run * them through the various hooks, and if they come out of that, we inspect the * read data queue. If there is not enough space for a packet, we drop it. * Otherwise, we append it to the data queue, and fire read notifications * targeting anyone polling or doing blocking I/O on this device. Those * consumers then drain the head of the data queue. * * The tx path is more complicated due to mac flow control. After any call into * mac, we may have to potentially suspend writes and buffer data for an * arbitrary amount of time. As such, we need to carefully track the total * amount of outstanding data so that we don't waste kernel memory.
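 *
 * When mac does flow control us, the transmit capability hands back a
 * cookie that we must stash until the flow control callback fires. A
 * sketch using the capability members defined later in this file
 * (illustrative only; the hint and flag arguments are shown as zero):
 *
 *	vnd_mac_cookie_t cookie;
 *
 *	cookie = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, mp, 0, 0);
 *	if (cookie != 0) {
 *		mutex_enter(&vsp->vns_lock);
 *		vsp->vns_flags |= VNS_F_FLOW_CONTROLLED;
 *		vsp->vns_caps.vsc_fc_cookie = cookie;
 *		mutex_exit(&vsp->vns_lock);
 *	}
 *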
Matters are * further complicated by the fact that mac will asynchronously tell us when our * flow has been resumed. * * For data to be able to enter the system, it needs to be able to take a * reservation from the write data queue. Once the reservation has been * obtained, we enter the gsqueue so that we can actually append it. We use * gsqueues (serialization queues) to ensure that packets are manipulated in * order as we deal with draining and appending packets. We also leverage * its worker thread to help us do draining after mac has restored our flow. * * The following image describes the flow: * * +-----------+ +--------------+ +-------------------------+ +------+ * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done | * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+ * +-----------+ +--------------+ . +-------------------------+ * | ^ . * | | . reserve space from gsqueue * | | | * queue . . . * | space v * full | * . . . avail +------------------------+ * v | | vnd_squeue_tx_append() | * +--------+ +------------+ +------------------------+ * | EAGAIN |<--*------| Non-block? |<-+ | * +--------+ . +------------+ | v * . yes v | wait +--------------+ * no . .* * . . for | append chain | * +----+ space | to outgoing | * | mblk chain | * from gsqueue +--------------+ * | | * | +-------------------------------------------------+ * | | * | | yes . . . * v v . * +-----------------------+ +--------------+ . +------+ * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done | * +-----------------------+ +--------------+ +------+ * | | * +---------------------------------|---------------------+ * | | tx | * | no . . * queue . . * * | flow controlled . | empty * . fire pollout * | . v | if mblk_t's * +-------------+ . +---------------------+ | sent * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+ * | flags | +---------------------+ | * +-------------+ More data | | | More data | * and limit ^ v * . . and limit ^ * not reached . . * | | reached | * +----+ | | * v | * +----------+ +-------------+ +---------------------------+ * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with | * | control | | block flags | | vnd_squeue_tx_drain() and | * | callback | +-------------+ | GSQUEUE_FILL flag, iff | * +----------+ | not already scheduled | * +---------------------------+ * * The final path taken for a given write(9E)/frameio ioctl depends on whether * or not the vnd_dev_t is non-blocking. That controls the initial path of * trying to take a reservation in write data queue. If the device is in * non-blocking mode, we'll return EAGAIN when there is not enough space * available; otherwise, the calling thread blocks on the data queue. * * Today when we call into vnd_squeue_tx_drain() we will not try to drain the * entire queue, as that could be quite large and we don't want to needlessly * tie up the thread that's doing the drain until it has finished. Not only * could more data be coming in, but the draining thread could be a userland * thread that has more work to do. We have two limits today. There is an upper * bound on the total amount of data and the total number of mblk_t chains. If * we hit either limit, then we will schedule another drain in the gsqueue and * go from there. * * It's worth taking some time to describe how we interact with gsqueues. vnd * has a gsqueue_set_t for itself.
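 *
 * For reference, scheduling the drain described above amounts to a single
 * call; a sketch based on this driver's own usage (the exact gsqueue
 * signature is assumed from that usage):
 *
 *	gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk,
 *	    vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN);
 *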
It's important that vnd has its own set, as * the profile of work that vnd does is different from other sub-systems in the * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue. * Unlike TCP/IP, which uses a gsqueue per TCP connection, we end up * maintaining one for a given device. Because of that, we want to use a * pseudo-random one to try and spread out the load, and picking one at random * is likely to be just as good as any fancy algorithm we might come up with, * especially as any two devices could have radically different transmit * profiles. * * While some of the write path may seem complicated, it does allow us to * maintain an important property. Once we have acknowledged a write(9E) or * frameio ioctl, we will not drop the packet, excepting something like ipf via * the firewall hooks. * * There is one other source of flow control that can exist in the system which * is in the form of a barrier. The barrier is an internal mechanism used for * ensuring that a gsqueue is drained for a given device. We use this as part * of tearing down. Specifically we disable the write path so nothing new can be * inserted into the gsqueue and then insert a barrier block. Once the barrier * block comes out of the gsqueue, we know that nothing else in the gsqueue * can refer to the vnd_str_t that is being destroyed. * * --------------------- * vnd, zones, netstacks * --------------------- * * vnd devices are scoped to datalinks and datalinks are scoped to a netstack. * Because of that, vnd is also a netstack module. It registers with the * netstack sub-system and receives callbacks every time a netstack is created, * shut down, and destroyed. The netstack callbacks drive the creation and * destruction of the vnd_pnsd_t structures. * * Recall from the earlier architecture diagrams that every vnd device is scoped * to a netstack and known about by a given vnd_pnsd_t. When that netstack is * torn down, we also tear down any vnd devices that are hanging around. When * the netstack is torn down, we know that any zones that are scoped to that * netstack are being shut down and have no processes remaining. This is going * to be the case whether they are shared or exclusive stack zones. We have to * perform a careful dance. * * There are two different callbacks that happen on tear down: the first is a * shutdown callback, the second is a destroy callback. When the shutdown * callback is fired we need to prepare for the netstack to go away and ensure * that nothing can continue to persist. * * More specifically, when we get notice of a stack being shut down we first * remove the netstack from the global netstack list to ensure that no one new * can come in and find the netstack and get a reference to it. After that, we * notify the neti hooks that they're going away. Once that's all done, we get * to the heart of the matter. * * When shutting down there could be any number of outstanding contexts that * have a reference on the vnd_pnsd_t and on the individual links. However, we * know that no one new will be able to find the vnd_pnsd_t. To account for * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device * to the netstack's list. If this is set, then they must not append to it. * Once this is set, we know that the netstack's list of devices can never grow, * only shrink. * * Next, for each device we tag it with VND_D_ZONE_DYING.
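 *
 * The VND_NS_CONDEMNED check described above might look like the following
 * sketch in a path that wishes to append a device to the netstack's list
 * (the error value is illustrative):
 *
 *	mutex_enter(&nsp->vpnd_lock);
 *	if (nsp->vpnd_flags & VND_NS_CONDEMNED) {
 *		mutex_exit(&nsp->vpnd_lock);
 *		return (ESHUTDOWN);
 *	}
 *	list_insert_tail(&nsp->vpnd_dev_list, vdp);
 *	mutex_exit(&nsp->vpnd_lock);
 *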
VND_D_ZONE_DYING indicates that * the container for the device is being destroyed and that we should not allow * additional references to the device to be created, whether via open or * linking. The presence of this bit also allows things like the list ioctl and * sdev to know that they should ignore the device. At the conclusion of this being * set, we know that no one else should be able to obtain a new reference to the * device. * * Once that has been set for all devices, we go through and remove any existing * links that have been established in sdev. Because doing that may cause the * final reference on the device, which itself still holds a reference to * the netstack, to be dropped, we have to restart our walk due to dropped locks. We know that * this walk will eventually complete because the device cannot be relinked and * no new devices will be attached in this netstack due to VND_NS_CONDEMNED. * Once that's finished, the shutdown callback returns. * * When we reach the destroy callback, we simply wait for references on the * netstack to disappear. Because the zone has been shut down, all processes in * it that have open references have been terminated and reaped. Any threads * that are newly trying to reference it will fail. However, there is one thing * that can halt this that we have no control over, which is the global zone * holding open a reference to the device. In this case the zone halt will hang * in vnd_stack_destroy. Once the last reference is dropped, we finish destroying * the netinfo hooks and free the vnd_pnsd_t. * * ---- * sdev * ---- * * vnd registers an sdev plugin which allows it to dynamically fill out /dev/vnd * for both the global and non-global zones. In any given zone we always supply * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone * will also have an entry per link in that zone under /dev/vnd/%datalink, eg. * if a link was named net0, there would be a /dev/vnd/net0. The global zone can * also see every link for every zone, a la /dev/net, under * /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin' had a vnd device * named net0, the global zone would have /dev/vnd/turin/net0. * * The sdev plugin has three interfaces that it supplies back to sdev. One is to * validate that a given node is still valid. The next is a callback from sdev * to say that it is no longer using the node. The third and final one is from * sdev where it asks us to fill a directory. All of the heavy lifting is done * in directory filling and in validation. We opt not to maintain a reference on * the device while there is an sdev node present. This makes the removal of * nodes much simpler and most of the possible failure modes shouldn't cause any * real problems. For example, the open path has to handle both dev_t's which no * longer exist and which are no longer linked. * * ----- * hooks * ----- * * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd * provides these for L3 IPv4 and IPv6 traffic. Each netstack provides these hooks * in a minimal fashion. While we will allow traffic to be filtered through the * hooks, we do not provide means for packet injection or additional inspection * at this time. There are a total of four different events created: * * o IPv4 physical in * o IPv4 physical out * o IPv6 physical in * o IPv6 physical out * * --------------- * Synchronization * --------------- * * To make our synchronization simpler, we've put more effort into making the * metadata/setup paths do more work.
That work allows the data paths to make * assumptions around synchronization that simplify the general case. Each major * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t, is * annotated with the protection that its members receive. The following * annotations are used: * * A Atomics; these values are only modified using atomic operations. * Currently this only applies to kstat values. * E Existence; no lock is needed to access this member, it does not * change while the structure is valid. * GL Global Lock; these members are protected by the global * vnd_dev_lock. * L Locked; access to the member is controlled by a lock that is in * the structure. * NSL netstack lock; this member is protected by the containing * netstack. This only applies to the vnd_dev_t`vdd_nslink. * X This member is special, and is discussed in this section. * * In addition to locking, we also have reference counts on the vnd_dev_t and * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure. * With rare exception, once a reference count is decremented, the consumer * should not assume that the data is valid any more. The only exception to this * is the case where we're removing an extant reference count from a link into * /devices or /dev. Reference counts are obtained on these structures as a part * of looking them up. * * # Global Lock Ordering * ###################### * * The following is the order that you must take locks in vnd: * * 1) vnd`vnd_dev_lock * 2) vnd_pnsd_t`vpnd_lock * 3) vnd_dev_t`vdd_lock * 4) vnd_str_t`vns_lock * 5) vnd_data_queue_t`vdq_lock * * One must adhere to the following rules: * * o You must acquire a lower numbered lock before a higher numbered lock. * o It is NOT legal to hold two locks of the same level concurrently, eg. you * can not hold two different vnd_dev_t's vdd_lock at the same time. * o You may release locks in any order. * o If you release a lock, you must honor the locking rules before acquiring * it again. * o You should not hold any locks when calling any of the rele functions. * * # Special Considerations * ######################## * * While most of the locking is what's expected, it's worth going into the * special nature that a few members hold. Today, only two structures have * special considerations: the vnd_dev_t and the vnd_str_t. All members with * special considerations have an additional annotation that describes how you * should interact with it. * * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is * attached or in the process of attaching. If a code path requires an attached * vnd_dev_t, eg. the data path and tear down path, then it * is always legal to dereference that member without a lock held. When they are * added to the system, it should be done under the vdd_lock as part * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the * lifetime of the vnd_dev_t. * * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it * always exists as it is a part of the structure. The only time that it's valid * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag * set or during tear down. Outside of those paths which are naturally * serialized, there is no explicit locking around the member. * * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not * initially set as part of creating the structure, but are set as part of * responding to the association ioctl.
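 *
 * As a sketch of the vnd_dev_t pattern just described (illustrative only;
 * a hold on credp is assumed to have been taken already):
 *
 *	mutex_enter(&vdp->vdd_lock);
 *	vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT;
 *	vdp->vdd_nsd = nsp;
 *	vdp->vdd_cr = credp;
 *	mutex_exit(&vdp->vdd_lock);
 *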
Anything in the data path or metadata * path that requires association may assume that vns_dev and vns_nsd exist, as we do not kick * off the state machine until they're set. * * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The * members are designed to be used as part of various operations with the * gsqueues. A lock isn't needed to use them, but to work with them, the * appropriate flag in the vnd_str_t`vns_flags must have been set by the current * thread. Otherwise, it is always fair game to refer to their addresses. Their * contents are ignored by vnd, but some members are manipulated by the gsqueue * subsystem. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Globals */ static dev_info_t *vnd_dip; static taskq_t *vnd_taskq; static kmem_cache_t *vnd_str_cache; static kmem_cache_t *vnd_dev_cache; static kmem_cache_t *vnd_pnsd_cache; static id_space_t *vnd_minors; static int vnd_list_init = 0; static sdev_plugin_hdl_t vnd_sdev_hdl; static gsqueue_set_t *vnd_sqset; static kmutex_t vnd_dev_lock; static list_t vnd_dev_list; /* Protected by the vnd_dev_lock */ static list_t vnd_nsd_list; /* Protected by the vnd_dev_lock */ /* * STREAMS ioctls * * The STREAMS ioctls are internal to vnd. No one should be seeing them; as such, * they aren't a part of the header file. */ #define VND_STRIOC (('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80) /* * Private ioctl to associate a given STREAMS instance with a minor instance of * the character device. */ #define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1) typedef struct vnd_strioc_associate { minor_t vsa_minor; /* minor device node */ netstackid_t vsa_nsid; /* netstack id */ vnd_errno_t vsa_errno; /* errno */ } vnd_strioc_associate_t; typedef enum vnd_strioc_state { VSS_UNKNOWN = 0, VSS_COPYIN = 1, VSS_COPYOUT = 2, } vnd_strioc_state_t; typedef struct vnd_strioc { vnd_strioc_state_t vs_state; caddr_t vs_addr; } vnd_strioc_t; /* * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extent tags. Though * really, overlap is, at the end of the day, inevitable. */ #define VND_SQUEUE_TAG_TX_DRAIN 0x42 #define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43 #define VND_SQUEUE_TAG_VND_WRITE 0x44 #define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45 #define VND_SQUEUE_TAG_STRBARRIER 0x46 /* * vnd reserved names. These are names which are reserved by vnd and thus * shouldn't be used by some external program. */ static char *vnd_reserved_names[] = { "ctl", "zone", NULL }; /* * vnd's DTrace probe macros * * DTRACE_VND* are all for a stable provider. We also have an unstable internal * set of probes for reference count manipulation.
*/ #define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \ DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3); #define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ type4, arg4); #define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \ type4, arg4, type5, arg5) \ DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ type4, arg4, type5, arg5); #define DTRACE_VND_REFINC(vdp) \ DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref); #define DTRACE_VND_REFDEC(vdp) \ DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref); /* * Tunables */ size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */ size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */ /* * These numbers are designed as per-device tunables that are applied when a new * vnd device is attached. They're a rough stab at what may be a reasonable * amount of work to do in one burst in an squeue. */ size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */ size_t vnd_flush_nburst = 10; /* 10 frames */ /* * Constants related to our sdev plugins */ #define VND_SDEV_NAME "vnd" #define VND_SDEV_ROOT "/dev/vnd" #define VND_SDEV_ZROOT "/dev/vnd/zone" /* * vnd relies on privileges, not mode bits to limit access. As such, device * files are read-write to everyone. */ #define VND_SDEV_MODE (S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | \ S_IROTH | S_IWOTH) /* * Statistic macros */ #define VND_STAT_INC(vsp, field, val) \ atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val) #define VND_LATENCY_1MS 1000000 #define VND_LATENCY_10MS 10000000 #define VND_LATENCY_100MS 100000000 #define VND_LATENCY_1S 1000000000 #define VND_LATENCY_10S 10000000000 /* * Constants for vnd hooks */ static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; #define IPV4_MCAST_LEN 3 static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; #define IPV6_MCAST_LEN 2 static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 }; /* * vnd internal data structures and types */ struct vnd_str; struct vnd_dev; struct vnd_pnsd; /* * As part of opening the device stream we need to properly communicate with our * underlying stream. This is a bit of an asynchronous dance and we need to * properly work with dld to get everything set up. We have to initiate the * conversation with dld and as such we keep track of our state here. 
*/ typedef enum vnd_str_state { VNS_S_INITIAL = 0, VNS_S_INFO_SENT, VNS_S_EXCLUSIVE_SENT, VNS_S_ATTACH_SENT, VNS_S_BIND_SENT, VNS_S_SAP_PROMISC_SENT, VNS_S_MULTI_PROMISC_SENT, VNS_S_RX_ONLY_PROMISC_SENT, VNS_S_FIXUP_PROMISC_SENT, VNS_S_CAPAB_Q_SENT, VNS_S_CAPAB_E_SENT, VNS_S_ONLINE, VNS_S_SHUTTING_DOWN, VNS_S_MULTICAST_PROMISCOFF_SENT, VNS_S_SAP_PROMISCOFF_SENT, VNS_S_UNBIND_SENT, VNS_S_ZOMBIE } vnd_str_state_t; typedef enum vnd_str_flags { VNS_F_NEED_ZONE = 0x1, VNS_F_TASKQ_DISPATCHED = 0x2, VNS_F_CONDEMNED = 0x4, VNS_F_FLOW_CONTROLLED = 0x8, VNS_F_DRAIN_SCHEDULED = 0x10, VNS_F_BARRIER = 0x20, VNS_F_BARRIER_DONE = 0x40 } vnd_str_flags_t; typedef enum vnd_capab_flags { VNS_C_HCKSUM = 0x1, VNS_C_DLD = 0x2, VNS_C_DIRECT = 0x4, VNS_C_HCKSUM_BADVERS = 0x8 } vnd_capab_flags_t; /* * Definitions to interact with direct callbacks */ typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *, mac_header_info_t *); typedef uintptr_t vnd_mac_cookie_t; /* DLD Direct capability function */ typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t); /* DLD Direct tx function */ typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); /* DLD Direct function to set flow control callback */ typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t), void *); /* DLD Direct function to see if flow controlled still */ typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t); /* * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of. */ typedef struct vnd_str_capab { vnd_capab_flags_t vsc_flags; t_uscalar_t vsc_hcksum_opts; vnd_dld_cap_t vsc_capab_f; void *vsc_capab_hdl; vnd_dld_tx_t vsc_tx_f; void *vsc_tx_hdl; vnd_dld_set_fcb_t vsc_set_fcb_f; void *vsc_set_fcb_hdl; vnd_dld_is_fc_t vsc_is_fc_f; void *vsc_is_fc_hdl; vnd_mac_cookie_t vsc_fc_cookie; void *vsc_tx_fc_hdl; } vnd_str_capab_t; /* * The vnd_data_queue is a simple construct for storing a series of messages in * a queue. * * See synchronization section of the big theory statement for member * annotations. */ typedef struct vnd_data_queue { struct vnd_str *vdq_vns; /* E */ kmutex_t vdq_lock; kcondvar_t vdq_ready; /* Uses vdq_lock */ ssize_t vdq_max; /* L */ ssize_t vdq_cur; /* L */ mblk_t *vdq_head; /* L */ mblk_t *vdq_tail; /* L */ } vnd_data_queue_t; typedef struct vnd_str_stat { kstat_named_t vks_rbytes; kstat_named_t vks_rpackets; kstat_named_t vks_obytes; kstat_named_t vks_opackets; kstat_named_t vks_nhookindrops; kstat_named_t vks_nhookoutdrops; kstat_named_t vks_ndlpidrops; kstat_named_t vks_ndataindrops; kstat_named_t vks_ndataoutdrops; kstat_named_t vks_tdrops; kstat_named_t vks_linkname; kstat_named_t vks_zonename; kstat_named_t vks_nmacflow; kstat_named_t vks_tmacflow; kstat_named_t vks_mac_flow_1ms; kstat_named_t vks_mac_flow_10ms; kstat_named_t vks_mac_flow_100ms; kstat_named_t vks_mac_flow_1s; kstat_named_t vks_mac_flow_10s; } vnd_str_stat_t; /* * vnd stream structure * * See synchronization section of the big theory statement for member * annotations. 
*/ typedef struct vnd_str { kmutex_t vns_lock; kcondvar_t vns_cancelcv; /* Uses vns_lock */ kcondvar_t vns_barriercv; /* Uses vns_lock */ kcondvar_t vns_stcv; /* Uses vns_lock */ vnd_str_state_t vns_state; /* L */ vnd_str_state_t vns_laststate; /* L */ vnd_errno_t vns_errno; /* L */ vnd_str_flags_t vns_flags; /* L */ vnd_str_capab_t vns_caps; /* L */ taskq_ent_t vns_tqe; /* L */ vnd_data_queue_t vns_dq_read; /* E */ vnd_data_queue_t vns_dq_write; /* E */ mblk_t *vns_dlpi_inc; /* L */ queue_t *vns_rq; /* E */ queue_t *vns_wq; /* E */ queue_t *vns_lrq; /* E */ t_uscalar_t vns_dlpi_style; /* L */ t_uscalar_t vns_minwrite; /* L */ t_uscalar_t vns_maxwrite; /* L */ hrtime_t vns_fclatch; /* L */ hrtime_t vns_fcupdate; /* L */ kstat_t *vns_kstat; /* E */ gsqueue_t *vns_squeue; /* E */ mblk_t vns_drainblk; /* E + X */ mblk_t vns_barrierblk; /* E + X */ vnd_str_stat_t vns_ksdata; /* A */ size_t vns_nflush; /* L */ size_t vns_bsize; /* L */ struct vnd_dev *vns_dev; /* E + X */ struct vnd_pnsd *vns_nsd; /* E + X */ } vnd_str_t; typedef enum vnd_dev_flags { VND_D_ATTACH_INFLIGHT = 0x001, VND_D_ATTACHED = 0x002, VND_D_LINK_INFLIGHT = 0x004, VND_D_LINKED = 0x008, VND_D_CONDEMNED = 0x010, VND_D_ZONE_DYING = 0x020, VND_D_OPENED = 0x040 } vnd_dev_flags_t; /* * This represents the data associated with a minor device instance. * * See synchronization section of the big theory statement for member * annotations. */ typedef struct vnd_dev { kmutex_t vdd_lock; list_node_t vdd_link; /* GL */ list_node_t vdd_nslink; /* NSL */ int vdd_ref; /* L */ vnd_dev_flags_t vdd_flags; /* L */ minor_t vdd_minor; /* E */ dev_t vdd_devid; /* E */ ldi_ident_t vdd_ldiid; /* E */ ldi_handle_t vdd_ldih; /* X */ cred_t *vdd_cr; /* X */ vnd_str_t *vdd_str; /* L */ struct pollhead vdd_ph; /* E */ struct vnd_pnsd *vdd_nsd; /* E + X */ char vdd_datalink[VND_NAMELEN]; /* L */ char vdd_lname[VND_NAMELEN]; /* L */ } vnd_dev_t; typedef enum vnd_pnsd_flags { VND_NS_CONDEMNED = 0x1 } vnd_pnsd_flags_t; /* * Per netstack data structure. * * See synchronization section of the big theory statement for member * annotations. */ typedef struct vnd_pnsd { list_node_t vpnd_link; /* protected by global dev lock */ zoneid_t vpnd_zid; /* E */ netstackid_t vpnd_nsid; /* E */ boolean_t vpnd_hooked; /* E */ net_handle_t vpnd_neti_v4; /* E */ hook_family_t vpnd_family_v4; /* E */ hook_event_t vpnd_event_in_v4; /* E */ hook_event_t vpnd_event_out_v4; /* E */ hook_event_token_t vpnd_token_in_v4; /* E */ hook_event_token_t vpnd_token_out_v4; /* E */ net_handle_t vpnd_neti_v6; /* E */ hook_family_t vpnd_family_v6; /* E */ hook_event_t vpnd_event_in_v6; /* E */ hook_event_t vpnd_event_out_v6; /* E */ hook_event_token_t vpnd_token_in_v6; /* E */ hook_event_token_t vpnd_token_out_v6; /* E */ kmutex_t vpnd_lock; /* Protects remaining members */ kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */ int vpnd_ref; /* L */ vnd_pnsd_flags_t vpnd_flags; /* L */ list_t vpnd_dev_list; /* L */ } vnd_pnsd_t; static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *); /* * Drop function signature. 
*/ typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *); static void vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } static void vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_ndataindrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } static void vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_ndataoutdrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } static void vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_nhookindrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } static void vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) { DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, mp, const char *, reason); if (mp != NULL) { freemsg(mp); } VND_STAT_INC(vsp, vks_nhookoutdrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); } /* ARGSUSED */ static void vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason) { panic("illegal vnd drop"); } /* ARGSUSED */ static void vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, mac_header_info_t *mhip) { mblk_t *mp; while (mp_chain != NULL) { mp = mp_chain; mp_chain = mp->b_next; vnd_drop_hook_in(vsp, mp, "stream not associated"); } } static vnd_pnsd_t * vnd_nsd_lookup(netstackid_t nsid) { vnd_pnsd_t *nsp; mutex_enter(&vnd_dev_lock); for (nsp = list_head(&vnd_nsd_list); nsp != NULL; nsp = list_next(&vnd_nsd_list, nsp)) { if (nsp->vpnd_nsid == nsid) { mutex_enter(&nsp->vpnd_lock); VERIFY(nsp->vpnd_ref >= 0); nsp->vpnd_ref++; mutex_exit(&nsp->vpnd_lock); break; } } mutex_exit(&vnd_dev_lock); return (nsp); } static vnd_pnsd_t * vnd_nsd_lookup_by_zid(zoneid_t zid) { netstack_t *ns; vnd_pnsd_t *nsp; ns = netstack_find_by_zoneid(zid); if (ns == NULL) return (NULL); nsp = vnd_nsd_lookup(ns->netstack_stackid); netstack_rele(ns); return (nsp); } static vnd_pnsd_t * vnd_nsd_lookup_by_zonename(char *zname) { zone_t *zonep; vnd_pnsd_t *nsp; zonep = zone_find_by_name(zname); if (zonep == NULL) return (NULL); nsp = vnd_nsd_lookup_by_zid(zonep->zone_id); zone_rele(zonep); return (nsp); } static void vnd_nsd_ref(vnd_pnsd_t *nsp) { mutex_enter(&nsp->vpnd_lock); /* * This can only be used on something that has been obtained through * some other means. As such, the caller should already have a reference * before adding another one. This function should not be used as a * means of creating the initial reference. 
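 *
 * For example, a caller that holds a reference from vnd_nsd_lookup() and
 * wants to hand the structure to a helper might do the following (a
 * sketch; helper() is hypothetical and calls vnd_nsd_rele() when done):
 *
 *	vnd_pnsd_t *nsp = vnd_nsd_lookup(nsid);
 *	if (nsp != NULL) {
 *		vnd_nsd_ref(nsp);
 *		helper(nsp);
 *		vnd_nsd_rele(nsp);
 *	}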
*/ VERIFY(nsp->vpnd_ref > 0); nsp->vpnd_ref++; mutex_exit(&nsp->vpnd_lock); cv_broadcast(&nsp->vpnd_ref_change); } static void vnd_nsd_rele(vnd_pnsd_t *nsp) { mutex_enter(&nsp->vpnd_lock); VERIFY(nsp->vpnd_ref > 0); nsp->vpnd_ref--; mutex_exit(&nsp->vpnd_lock); cv_broadcast(&nsp->vpnd_ref_change); } static vnd_dev_t * vnd_dev_lookup(minor_t m) { vnd_dev_t *vdp; mutex_enter(&vnd_dev_lock); for (vdp = list_head(&vnd_dev_list); vdp != NULL; vdp = list_next(&vnd_dev_list, vdp)) { if (vdp->vdd_minor == m) { mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_ref > 0); vdp->vdd_ref++; DTRACE_VND_REFINC(vdp); mutex_exit(&vdp->vdd_lock); break; } } mutex_exit(&vnd_dev_lock); return (vdp); } static void vnd_dev_free(vnd_dev_t *vdp) { /* * When the stream exists, we need to go through and make sure * communication gets torn down. As part of closing the stream, we * guarantee that nothing else should be able to enter the stream layer * at this point. That means no one should be able to call * read(), write(), or one of the frameio ioctls. */ if (vdp->vdd_flags & VND_D_ATTACHED) { (void) ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); crfree(vdp->vdd_cr); vdp->vdd_cr = NULL; /* * We have to remove ourselves from our parent's list now. It is * really quite important that we have already set the condemned * flag here so that our containing netstack basically knows * that we're on the way down and knows not to wait for us. It's * also important that we do that before we put a rele on the * device as that is the point at which it will check again. */ mutex_enter(&vdp->vdd_nsd->vpnd_lock); list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp); mutex_exit(&vdp->vdd_nsd->vpnd_lock); vnd_nsd_rele(vdp->vdd_nsd); vdp->vdd_nsd = NULL; } ASSERT(vdp->vdd_flags & VND_D_CONDEMNED); id_free(vnd_minors, vdp->vdd_minor); mutex_destroy(&vdp->vdd_lock); kmem_cache_free(vnd_dev_cache, vdp); } static void vnd_dev_ref(vnd_dev_t *vdp) { mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_ref > 0); vdp->vdd_ref++; DTRACE_VND_REFINC(vdp); mutex_exit(&vdp->vdd_lock); } /* * As part of releasing the hold on this we may tear down a given vnd_dev_t. As * such we need to make sure that we grab the list lock first before grabbing * the vnd_dev_t's lock to ensure proper lock ordering. */ static void vnd_dev_rele(vnd_dev_t *vdp) { mutex_enter(&vnd_dev_lock); mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_ref > 0); vdp->vdd_ref--; DTRACE_VND_REFDEC(vdp); if (vdp->vdd_ref > 0) { mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); return; } /* * Now that we've removed this from the list, we can go ahead and * drop the list lock. No one else can find this device and reference * it. As its reference count is zero, it by definition does not have * any remaining entries in /devices that could lead someone back to * this. */ vdp->vdd_flags |= VND_D_CONDEMNED; list_remove(&vnd_dev_list, vdp); mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); vnd_dev_free(vdp); } /* * Insert a message block chain if there's space, otherwise drop it. Return one * so the caller knows that anyone waiting for data should be woken up, eg. the * caller should consider a broadcast.
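 *
 * A sketch of typical use, pairing the push with the notifications the
 * caller is expected to fire (names illustrative):
 *
 *	mutex_enter(&vqp->vdq_lock);
 *	signal = vnd_dq_push(vqp, mp, B_FALSE, vnd_drop_in);
 *	mutex_exit(&vqp->vdq_lock);
 *	if (signal != 0) {
 *		cv_broadcast(&vqp->vdq_ready);
 *		pollwakeup(&vdp->vdd_ph, POLLIN | POLLRDNORM);
 *	}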
*/ static int vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved, vnd_dropper_f dropf) { size_t msize; ASSERT(MUTEX_HELD(&vqp->vdq_lock)); if (reserved == B_FALSE) { msize = msgsize(mp); if (vqp->vdq_cur + msize > vqp->vdq_max) { dropf(vqp->vdq_vns, mp, "buffer full"); return (0); } vqp->vdq_cur += msize; } if (vqp->vdq_head == NULL) { ASSERT(vqp->vdq_tail == NULL); vqp->vdq_head = mp; vqp->vdq_tail = mp; } else { vqp->vdq_tail->b_next = mp; vqp->vdq_tail = mp; } return (1); } /* * Remove a message block chain. If the amount of space in the buffer * has changed, we return 1. We have no way of knowing whether or not there is * enough space overall for a given writer who is blocked, so we always end up * having to return true and thus tell consumers that they should consider * signalling. */ static int vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp) { size_t msize; mblk_t *mp; ASSERT(MUTEX_HELD(&vqp->vdq_lock)); ASSERT(mpp != NULL); if (vqp->vdq_head == NULL) { ASSERT(vqp->vdq_tail == NULL); *mpp = NULL; return (0); } mp = vqp->vdq_head; msize = msgsize(mp); vqp->vdq_cur -= msize; if (mp->b_next == NULL) { vqp->vdq_head = NULL; vqp->vdq_tail = NULL; /* * We can't be certain that this is always going to be zero. * Someone may have basically taken a reservation of space on * the data queue, eg. claimed space but not yet pushed it * on. */ ASSERT(vqp->vdq_cur >= 0); } else { vqp->vdq_head = mp->b_next; ASSERT(vqp->vdq_cur > 0); } mp->b_next = NULL; *mpp = mp; return (1); } /* * Reserve space in the queue. This will bump up the size of the queue and * entitle the user to push something on later without bumping the space. */ static int vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size) { ASSERT(MUTEX_HELD(&vqp->vdq_lock)); ASSERT(size >= 0); if (size == 0) return (0); if (size + vqp->vdq_cur > vqp->vdq_max) return (0); vqp->vdq_cur += size; return (1); } static void vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size) { ASSERT(MUTEX_HELD(&vqp->vdq_lock)); ASSERT(size > 0); ASSERT(size <= vqp->vdq_cur); vqp->vdq_cur -= size; } static void vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf) { mblk_t *mp, *next; mutex_enter(&vqp->vdq_lock); for (mp = vqp->vdq_head; mp != NULL; mp = next) { next = mp->b_next; mp->b_next = NULL; dropf(vqp->vdq_vns, mp, "vnd_dq_flush"); } vqp->vdq_cur = 0; vqp->vdq_head = NULL; vqp->vdq_tail = NULL; mutex_exit(&vqp->vdq_lock); } static boolean_t vnd_dq_is_empty(vnd_data_queue_t *vqp) { boolean_t ret; mutex_enter(&vqp->vdq_lock); if (vqp->vdq_head == NULL) ret = B_TRUE; else ret = B_FALSE; mutex_exit(&vqp->vdq_lock); return (ret); } /* * Get a network uint16_t from the message and translate it into something the * host understands. */ static int vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out) { size_t mpsize; uint8_t *bp; mpsize = msgsize(mp); /* Check for overflow */ if (off + sizeof (uint16_t) > mpsize) return (1); mpsize = MBLKL(mp); while (off >= mpsize) { mp = mp->b_cont; off -= mpsize; mpsize = MBLKL(mp); } /* * Data is in network order. Note the second byte of data might be in * the next mp. */ bp = mp->b_rptr + off; *out = *bp << 8; if (off + 1 == mpsize) { mp = mp->b_cont; bp = mp->b_rptr; } else { bp++; } *out |= *bp; return (0); } /* * Given an mblk chain, find the mblk and address of a particular offset.
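 *
 * For example, to locate the start of the L3 header at a given offset (a
 * sketch; chain and off are the caller's, and the cast target is
 * illustrative):
 *
 *	mblk_t *mp;
 *	uintptr_t addr;
 *	ipha_t *ipha;
 *
 *	if (vnd_mbc_getoffset(chain, off, &mp, &addr) != 0)
 *		return (1);
 *	ipha = (ipha_t *)addr;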
*/ static int vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp) { size_t mpsize; if (off >= msgsize(mp)) return (1); mpsize = MBLKL(mp); while (off >= mpsize) { mp = mp->b_cont; off -= mpsize; mpsize = MBLKL(mp); } *mpp = mp; *offp = (uintptr_t)mp->b_rptr + off; return (0); } /* * Fetch the destination mac address. Set *dstpp to that mac address. If the data * is not contiguous in the first mblk_t, fill in datap and set *dstpp to it. */ static int vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap) { int i; if (MBLKL(mp) >= ETHERADDRL) { *dstpp = mp->b_rptr; return (0); } *dstpp = datap; for (i = 0; i < ETHERADDRL; i += 2, datap += 2) { if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0) return (1); } return (0); } static int vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4, hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6, hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop) { uint16_t etype; hook_pkt_event_t info; size_t offset, mblen; uint8_t *dstp; uint8_t dstaddr[6]; hook_event_t he; hook_event_token_t het; net_handle_t neti; /* * Before we can ask if we're interested we have to do enough work to * determine the ethertype. */ /* Byte 12 is either the VLAN tag or the ethertype */ if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) { ddrop(vsp, *mpp, "packet has incomplete ethernet header"); *mpp = NULL; return (1); } if (etype == ETHERTYPE_VLAN) { /* Actual ethertype is another four bytes in */ if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) { ddrop(vsp, *mpp, "packet has incomplete ethernet vlan header"); *mpp = NULL; return (1); } offset = sizeof (struct ether_vlan_header); } else { offset = sizeof (struct ether_header); } /* * At the moment we only hook on the kinds of things that the IP module * would normally handle. */ if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6) return (0); if (etype == ETHERTYPE_IP) { neti = netiv4; he = hev4; het = hetv4; } else { neti = netiv6; he = hev6; het = hetv6; } if (!he.he_interested) return (0); if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) { ddrop(vsp, *mpp, "packet has incomplete ethernet header"); *mpp = NULL; return (1); } /* * Now that we know we're interested, we have to do some additional * sanity checking for IPF's sake, ala ip_check_length(). Specifically * we need to check to make sure that the remaining packet size, * excluding MAC, is at least the size of an IP header. */ mblen = msgsize(*mpp); if ((etype == ETHERTYPE_IP && mblen - offset < IP_SIMPLE_HDR_LENGTH) || (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) { ddrop(vsp, *mpp, "packet has invalid IP header"); *mpp = NULL; return (1); } info.hpe_protocol = neti; info.hpe_ifp = (phy_if_t)vsp; info.hpe_ofp = (phy_if_t)vsp; info.hpe_mp = mpp; info.hpe_flags = 0; if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0) info.hpe_flags |= HPE_BROADCAST; else if (etype == ETHERTYPE_IP && bcmp(vnd_ipv4_mcast, dstp, IPV4_MCAST_LEN) == 0) info.hpe_flags |= HPE_MULTICAST; else if (etype == ETHERTYPE_IPV6 && bcmp(vnd_ipv6_mcast, dstp, IPV6_MCAST_LEN) == 0) info.hpe_flags |= HPE_MULTICAST; if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb, (uintptr_t *)&info.hpe_hdr) != 0) { ddrop(vsp, *mpp, "packet too small -- " "unable to find payload"); *mpp = NULL; return (1); } if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) { hdrop(vsp, *mpp, "dropped by hooks"); return (1); } return (0); } /* * This should not be used for DL_INFO_REQ.
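 * It always produces an M_PROTO message, while DL_INFO_REQ must be sent as
 * an M_PCPROTO message; vnd_st_sinfo() therefore builds that request by
 * hand. A sketch of typical use for other primitives:
 *
 *	mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
 *	if (mp == NULL) {
 *		vsp->vns_errno = VND_E_NOMEM;
 *		return (1);
 *	}
 *	putnext(vsp->vns_wq, mp);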
*/ static mblk_t * vnd_dlpi_alloc(size_t len, t_uscalar_t prim) { mblk_t *mp; mp = allocb(len, BPRI_MED); if (mp == NULL) return (NULL); mp->b_datap->db_type = M_PROTO; mp->b_wptr = mp->b_rptr + len; bzero(mp->b_rptr, len); ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; return (mp); } static void vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp) { mblk_t **mpp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); ASSERT(mp->b_next == NULL); mpp = &vsp->vns_dlpi_inc; while (*mpp != NULL) mpp = &((*mpp)->b_next); *mpp = mp; } static mblk_t * vnd_dlpi_inc_pop(vnd_str_t *vsp) { mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vsp->vns_dlpi_inc; if (mp != NULL) { VERIFY(mp->b_next == NULL || mp->b_next != mp); vsp->vns_dlpi_inc = mp->b_next; mp->b_next = NULL; } return (mp); } static int vnd_st_sinfo(vnd_str_t *vsp) { mblk_t *mp; dl_info_req_t *dlir; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), BPRI_HI); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } vsp->vns_state = VNS_S_INFO_SENT; cv_broadcast(&vsp->vns_stcv); mp->b_datap->db_type = M_PCPROTO; dlir = (dl_info_req_t *)mp->b_rptr; mp->b_wptr = (uchar_t *)&dlir[1]; dlir->dl_primitive = DL_INFO_REQ; putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_info(vnd_str_t *vsp) { dl_info_ack_t *dlia; mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); dlia = (dl_info_ack_t *)mp->b_rptr; vsp->vns_dlpi_style = dlia->dl_provider_style; vsp->vns_minwrite = dlia->dl_min_sdu; vsp->vns_maxwrite = dlia->dl_max_sdu; /* * At this time we only support DL_ETHER devices. */ if (dlia->dl_mac_type != DL_ETHER) { freemsg(mp); vsp->vns_errno = VND_E_NOTETHER; return (1); } /* * Because vnd operates on entire packets, we need to manually account * for the ethernet header information. We add the size of the * ether_vlan_header to account for this, regardless if it is using * vlans or not. */ vsp->vns_maxwrite += sizeof (struct ether_vlan_header); freemsg(mp); return (0); } static int vnd_st_sexclusive(vnd_str_t *vsp) { mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } vsp->vns_state = VNS_S_EXCLUSIVE_SENT; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_exclusive(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_exclusive"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (cprim != DL_EXCLUSIVE_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_exclusive: got ack/nack for wrong primitive"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (prim == DL_ERROR_ACK) vsp->vns_errno = VND_E_DLEXCL; freemsg(mp); return (prim == DL_ERROR_ACK); } /* * Send down a DLPI_ATTACH_REQ. 
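 *
 * Note that this is only done for DL_STYLE2 datalinks; style 1 links skip
 * straight from vnd_st_exclusive() to vnd_st_sbind() in
 * vnd_str_state_transition().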
*/ static int vnd_st_sattach(vnd_str_t *vsp) { mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0; vsp->vns_state = VNS_S_ATTACH_SENT; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_attach(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (cprim != DL_ATTACH_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_attach: Got ack/nack for wrong primitive"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (prim == DL_ERROR_ACK) vsp->vns_errno = VND_E_ATTACHFAIL; freemsg(mp); return (prim == DL_ERROR_ACK); } static int vnd_st_sbind(vnd_str_t *vsp) { mblk_t *mp; dl_bind_req_t *dbrp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), DL_BIND_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } dbrp = (dl_bind_req_t *)(mp->b_rptr); dbrp->dl_sap = 0; dbrp->dl_service_mode = DL_CLDLS; vsp->vns_state = VNS_S_BIND_SENT; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_bind(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (prim == DL_ERROR_ACK) vsp->vns_errno = VND_E_BINDFAIL; freemsg(mp); return (prim == DL_ERROR_ACK); } static int vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next) { mblk_t *mp; dl_promiscon_req_t *dprp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } dprp = (dl_promiscon_req_t *)mp->b_rptr; dprp->dl_level = type; vsp->vns_state = next; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } static int vnd_st_promisc(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_promisc"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (cprim != DL_PROMISCON_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_promisc: Got ack/nack for wrong primitive"); vsp->vns_errno = VND_E_DLPIINVAL; return (1); } if (prim == DL_ERROR_ACK) vsp->vns_errno = VND_E_PROMISCFAIL; freemsg(mp); return (prim == DL_ERROR_ACK); } static int vnd_st_scapabq(vnd_str_t *vsp) { mblk_t *mp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); if (mp == NULL) { vsp->vns_errno = VND_E_NOMEM; return (1); } vsp->vns_state = VNS_S_CAPAB_Q_SENT; cv_broadcast(&vsp->vns_stcv); putnext(vsp->vns_wq, mp); return (0); } /* ARGSUSED */ static void vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, mac_header_info_t *mhip) { int signal = 0; mblk_t *mp; vnd_pnsd_t 
*nsp = vsp->vns_nsd; ASSERT(vsp != NULL); ASSERT(mp_chain != NULL); for (mp = mp_chain; mp != NULL; mp = mp_chain) { uint16_t vid; mp_chain = mp->b_next; mp->b_next = NULL; /* * If we were operating in a traditional dlpi context then we * would have enabled DLIOCRAW and rather than the fast path, we * would come through dld_str_rx_raw. That function does two * things that we have to consider doing ourselves. The first is * that it adjusts the b_rptr back to account for dld bumping us * past the mac header. It also tries to account for cases where * mac provides an illusion of the mac header. Fortunately, dld * only allows the fastpath when the media type is the same as * the native type. Therefore all we have to do here is adjust * the b_rptr. */ ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize); mp->b_rptr -= mhip->mhi_hdrsize; vid = VLAN_ID(mhip->mhi_tci); if (mhip->mhi_istagged && vid != VLAN_ID_NONE) { /* * This is an overlapping copy. Do not use bcopy(9F). */ (void) memmove(mp->b_rptr + 4, mp->b_rptr, 12); mp->b_rptr += 4; } if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4, nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6, nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0) continue; VND_STAT_INC(vsp, vks_rpackets, 1); VND_STAT_INC(vsp, vks_rbytes, msgsize(mp)); DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL, vnd_str_t *, vsp, mblk_t *, mp); mutex_enter(&vsp->vns_dq_read.vdq_lock); signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE, vnd_drop_in); mutex_exit(&vsp->vns_dq_read.vdq_lock); } if (signal != 0) { cv_broadcast(&vsp->vns_dq_read.vdq_ready); pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM); } } static void vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff) { VND_STAT_INC(vsp, vks_nmacflow, 1); VND_STAT_INC(vsp, vks_tmacflow, diff); if (diff >= VND_LATENCY_1MS) VND_STAT_INC(vsp, vks_mac_flow_1ms, 1); if (diff >= VND_LATENCY_10MS) VND_STAT_INC(vsp, vks_mac_flow_10ms, 1); if (diff >= VND_LATENCY_100MS) VND_STAT_INC(vsp, vks_mac_flow_100ms, 1); if (diff >= VND_LATENCY_1S) VND_STAT_INC(vsp, vks_mac_flow_1s, 1); if (diff >= VND_LATENCY_10S) VND_STAT_INC(vsp, vks_mac_flow_10s, 1); } /* * This is a callback from MAC that indicates that we are allowed to send * packets again. */ static void vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie) { vnd_str_t *vsp = arg; hrtime_t now; mutex_enter(&vsp->vns_lock); now = gethrtime(); /* * Check for the case that we beat vnd_squeue_tx_one to the punch. * There's also an additional case here that we got notified because * we're sharing a device that ran out of tx descriptors, even though it * wasn't because of us. */ if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) { vsp->vns_fcupdate = now; mutex_exit(&vsp->vns_lock); return; } ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED); ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie); vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED; vsp->vns_caps.vsc_fc_cookie = (vnd_mac_cookie_t)NULL; vsp->vns_fclatch = 0; DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, cookie); /* * If someone has asked to flush the squeue and thus inserted a barrier, * then we shouldn't schedule a drain.
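 *
 * That is safe because once vnd_strbarrier() completes, any further
 * transmit traffic re-enters the squeue via vnd_squeue_tx_append(), which
 * performs its own drain.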
*/ if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) { vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk, vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, VND_SQUEUE_TAG_MAC_FLOW_CONTROL); } mutex_exit(&vsp->vns_lock); } static void vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp) { ASSERT(MUTEX_HELD(&vsp->vns_lock)); VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0); } static void vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph) { ASSERT(MUTEX_HELD(&vsp->vns_lock)); VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0); } static int vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc) { int ret; dld_capab_direct_t d; mac_perim_handle_t mph; vnd_str_capab_t *c = &vsp->vns_caps; bzero(&d, sizeof (d)); d.di_rx_cf = (uintptr_t)rxfunc; d.di_rx_ch = vsp; d.di_flags = DI_DIRECT_RAW; vnd_mac_enter(vsp, &mph); /* * If we're coming in here for a second pass, we need to make sure that * we remove an existing flow control notification callback, otherwise * we'll create a duplicate that will remain with garbage data. */ if (c->vsc_tx_fc_hdl != NULL) { ASSERT(c->vsc_set_fcb_hdl != NULL); (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL, c->vsc_tx_fc_hdl); c->vsc_tx_fc_hdl = NULL; } if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl, DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) { c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df; c->vsc_tx_hdl = d.di_tx_dh; c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df; c->vsc_set_fcb_hdl = d.di_tx_cb_dh; c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df; c->vsc_is_fc_hdl = d.di_tx_fctl_dh; c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, vnd_mac_flow_control, vsp); c->vsc_flags |= VNS_C_DIRECT; ret = 0; } else { vsp->vns_errno = VND_E_DIRECTFAIL; ret = 1; } vnd_mac_exit(vsp, mph); return (ret); } static int vnd_st_capabq(vnd_str_t *vsp) { mblk_t *mp; dl_capability_ack_t *cap; dl_capability_sub_t *subp; dl_capab_hcksum_t *hck; dl_capab_dld_t *dld; unsigned char *rp; int ret = 0; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_inc_pop(vsp); rp = mp->b_rptr; cap = (dl_capability_ack_t *)rp; if (cap->dl_sub_length == 0) goto done; /* Don't try to process something too big */ if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) { VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); vsp->vns_errno = VND_E_CAPACKINVAL; ret = 1; goto done; } rp += cap->dl_sub_offset; while (cap->dl_sub_length > 0) { subp = (dl_capability_sub_t *)rp; /* Sanity check something crazy from down below */ if (subp->dl_length + sizeof (dl_capability_sub_t) > cap->dl_sub_length) { VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); vsp->vns_errno = VND_E_SUBCAPINVAL; ret = 1; goto done; } switch (subp->dl_cap) { case DL_CAPAB_HCKSUM: hck = (dl_capab_hcksum_t *)(rp + sizeof (dl_capability_sub_t)); if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) { vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS; break; } if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) != B_TRUE) { vsp->vns_errno = VND_E_CAPABPASS; ret = 1; goto done; } vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM; vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags; break; case DL_CAPAB_DLD: dld = (dl_capab_dld_t *)(rp + sizeof (dl_capability_sub_t)); if (dld->dld_version != DLD_CURRENT_VERSION) { vsp->vns_errno = VND_E_DLDBADVERS; ret = 1; goto done; } if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) != B_TRUE) { vsp->vns_errno = 
VND_E_CAPABPASS; ret = 1; goto done; } vsp->vns_caps.vsc_flags |= VNS_C_DLD; vsp->vns_caps.vsc_capab_f = (vnd_dld_cap_t)dld->dld_capab; vsp->vns_caps.vsc_capab_hdl = (void *)dld->dld_capab_handle; /* * At this point in time, we have to set up a direct * function that drops all input. This validates that * we'll be able to set up direct input and that we can * easily switch it over to the real data function * when we've plumbed everything up. */ if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) { /* vns_errno set by vnd_dld_cap_enable */ ret = 1; goto done; } break; default: /* Ignore unsupported cap */ break; } rp += sizeof (dl_capability_sub_t) + subp->dl_length; cap->dl_sub_length -= sizeof (dl_capability_sub_t) + subp->dl_length; } done: /* Make sure we enabled direct callbacks */ if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) { vsp->vns_errno = VND_E_DIRECTNOTSUP; ret = 1; } freemsg(mp); return (ret); } static void vnd_st_sonline(vnd_str_t *vsp) { VERIFY(MUTEX_HELD(&vsp->vns_lock)); vsp->vns_state = VNS_S_ONLINE; cv_broadcast(&vsp->vns_stcv); } static void vnd_st_shutdown(vnd_str_t *vsp) { mac_perim_handle_t mph; vnd_str_capab_t *vsc = &vsp->vns_caps; VERIFY(MUTEX_HELD(&vsp->vns_lock)); /* * At this point in time we know that there is no one transmitting as * our final reference has been torn down and that vnd_s_close inserted * a barrier to validate that everything is flushed. */ if (vsc->vsc_flags & VNS_C_DIRECT) { vnd_mac_enter(vsp, &mph); vsc->vsc_flags &= ~VNS_C_DIRECT; (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL, vsc->vsc_tx_fc_hdl); vsc->vsc_tx_fc_hdl = NULL; (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT, NULL, DLD_DISABLE); vnd_mac_exit(vsp, mph); } } static boolean_t vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next) { boolean_t ret = B_TRUE; mblk_t *mp; dl_promiscoff_req_t *dprp; VERIFY(MUTEX_HELD(&vsp->vns_lock)); mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCOFF_REQ); if (mp == NULL) { cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " "promiscoff request"); ret = B_FALSE; goto next; } dprp = (dl_promiscoff_req_t *)mp->b_rptr; dprp->dl_level = type; putnext(vsp->vns_wq, mp); next: vsp->vns_state = next; cv_broadcast(&vsp->vns_stcv); return (ret); } static void vnd_st_promiscoff(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; VERIFY(MUTEX_HELD(&vsp->vns_lock)); /* * Unlike other cases where we guard against the incoming packet being * NULL, during tear down we try to keep driving and therefore we may * have gotten here due to an earlier failure, so there's nothing to do.
*/ mp = vnd_dlpi_inc_pop(vsp); if (mp == NULL) return; prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_promiscoff"); return; } if (cprim != DL_PROMISCOFF_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_promiscoff: Got ack/nack for wrong primitive"); return; } if (prim == DL_ERROR_ACK) { cmn_err(CE_WARN, "!failed to disable promiscuous mode during " "vnd teardown"); } } static boolean_t vnd_st_sunbind(vnd_str_t *vsp) { mblk_t *mp; boolean_t ret = B_TRUE; mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); if (mp == NULL) { cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " "unbind request"); ret = B_FALSE; goto next; } putnext(vsp->vns_wq, mp); next: vsp->vns_state = VNS_S_UNBIND_SENT; cv_broadcast(&vsp->vns_stcv); return (ret); } static void vnd_st_unbind(vnd_str_t *vsp) { mblk_t *mp; t_uscalar_t prim, cprim; /* * Unlike other cases where we guard against the incoming packet being * NULL, during tear down we try to keep driving and therefore we may * have gotten here due to an earlier failure, so there's nothing to do. */ mp = vnd_dlpi_inc_pop(vsp); if (mp == NULL) goto next; prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_unbind"); goto next; } if (cprim != DL_UNBIND_REQ) { vnd_drop_ctl(vsp, mp, "vnd_st_unbind: Got ack/nack for wrong primitive"); goto next; } if (prim == DL_ERROR_ACK) { cmn_err(CE_WARN, "!failed to unbind stream during vnd " "teardown"); } next: vsp->vns_state = VNS_S_ZOMBIE; cv_broadcast(&vsp->vns_stcv); } /* * Perform state transitions. This is a one way shot down the flow chart * described in the big theory statement. */ static void vnd_str_state_transition(void *arg) { boolean_t died = B_FALSE; vnd_str_t *vsp = arg; mblk_t *mp; mutex_enter(&vsp->vns_lock); if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL && vsp->vns_state != VNS_S_SHUTTING_DOWN)) { mutex_exit(&vsp->vns_lock); return; } /* * When trying to shut down, or unwinding from a failed enabling, rather * than immediately entering the ZOMBIE state, we may instead opt to try * and enter the next state in the progression. This is especially * important when trying to tear everything down.
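 *
 * Concretely, the teardown half of the switch below walks
 * VNS_S_SHUTTING_DOWN -> VNS_S_MULTICAST_PROMISCOFF_SENT ->
 * VNS_S_SAP_PROMISCOFF_SENT -> VNS_S_UNBIND_SENT -> VNS_S_ZOMBIE, and a
 * failed allocation in any of the "send" steps falls through (via the goto
 * below) to the next state rather than aborting the teardown.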
*/ loop: DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp, vnd_str_state_t, vsp->vns_state); switch (vsp->vns_state) { case VNS_S_INITIAL: VERIFY(vsp->vns_dlpi_inc == NULL); if (vnd_st_sinfo(vsp) != 0) died = B_TRUE; break; case VNS_S_INFO_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_info(vsp) == 0) { if (vnd_st_sexclusive(vsp) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_EXCLUSIVE_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_exclusive(vsp) == 0) { if (vsp->vns_dlpi_style == DL_STYLE2) { if (vnd_st_sattach(vsp) != 0) died = B_TRUE; } else { if (vnd_st_sbind(vsp) != 0) died = B_TRUE; } } else { died = B_TRUE; } break; case VNS_S_ATTACH_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_attach(vsp) == 0) { if (vnd_st_sbind(vsp) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_BIND_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_bind(vsp) == 0) { if (vnd_st_spromisc(vsp, DL_PROMISC_SAP, VNS_S_SAP_PROMISC_SENT) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_SAP_PROMISC_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_promisc(vsp) == 0) { if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI, VNS_S_MULTI_PROMISC_SENT) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_MULTI_PROMISC_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_promisc(vsp) == 0) { if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY, VNS_S_RX_ONLY_PROMISC_SENT) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_RX_ONLY_PROMISC_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_promisc(vsp) == 0) { if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS, VNS_S_FIXUP_PROMISC_SENT) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_FIXUP_PROMISC_SENT: VERIFY(vsp->vns_dlpi_inc != NULL); if (vnd_st_promisc(vsp) == 0) { if (vnd_st_scapabq(vsp) != 0) died = B_TRUE; } else { died = B_TRUE; } break; case VNS_S_CAPAB_Q_SENT: if (vnd_st_capabq(vsp) != 0) died = B_TRUE; else vnd_st_sonline(vsp); break; case VNS_S_SHUTTING_DOWN: vnd_st_shutdown(vsp); if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI, VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE) goto loop; break; case VNS_S_MULTICAST_PROMISCOFF_SENT: vnd_st_promiscoff(vsp); if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP, VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE) goto loop; break; case VNS_S_SAP_PROMISCOFF_SENT: vnd_st_promiscoff(vsp); if (vnd_st_sunbind(vsp) == B_FALSE) goto loop; break; case VNS_S_UNBIND_SENT: vnd_st_unbind(vsp); break; case VNS_S_ZOMBIE: while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) vnd_drop_ctl(vsp, mp, "vsp received data as a zombie"); break; default: panic("vnd_str_t entered an unknown state"); } if (died == B_TRUE) { ASSERT(vsp->vns_errno != VND_E_SUCCESS); vsp->vns_laststate = vsp->vns_state; vsp->vns_state = VNS_S_ZOMBIE; cv_broadcast(&vsp->vns_stcv); } mutex_exit(&vsp->vns_lock); } static void vnd_dlpi_taskq_dispatch(void *arg) { vnd_str_t *vsp = arg; int run = 1; while (run != 0) { vnd_str_state_transition(vsp); mutex_enter(&vsp->vns_lock); if (vsp->vns_flags & VNS_F_CONDEMNED || vsp->vns_dlpi_inc == NULL) { run = 0; vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED; } if (vsp->vns_flags & VNS_F_CONDEMNED) cv_signal(&vsp->vns_cancelcv); mutex_exit(&vsp->vns_lock); } } /* ARGSUSED */ static int vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len) { return (-1); } /* ARGSUSED */ static int vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) { return (-1); } /* ARGSUSED */ static int vnd_neti_getptmue(net_handle_t neti) { return (-1); } /* ARGSUSED */ 
static int vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, size_t nelem, net_ifaddr_t type[], void *storage) { return (-1); } /* ARGSUSED */ static int vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, zoneid_t *zid) { return (-1); } /* ARGSUSED */ static int vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, uint64_t *flags) { return (-1); } /* ARGSUSED */ static phy_if_t vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy) { return ((phy_if_t)-1); } /* ARGSUSED */ static phy_if_t vnd_neti_phylookup(net_handle_t neti, const char *name) { return ((phy_if_t)-1); } /* ARGSUSED */ static lif_if_t vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) { return (-1); } /* ARGSUSED */ static int vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet) { return (-1); } /* ARGSUSED */ static phy_if_t vnd_neti_route(net_handle_t neti, struct sockaddr *address, struct sockaddr *next) { return ((phy_if_t)-1); } /* ARGSUSED */ static int vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp) { return (-1); } /* ARGSUSED */ static int vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp) { return (-1); } static net_protocol_t vnd_neti_info_v4 = { NETINFO_VERSION, NHF_VND_INET, vnd_neti_getifname, vnd_neti_getmtu, vnd_neti_getptmue, vnd_neti_getlifaddr, vnd_neti_getlifzone, vnd_neti_getlifflags, vnd_neti_phygetnext, vnd_neti_phylookup, vnd_neti_lifgetnext, vnd_neti_inject, vnd_neti_route, vnd_neti_ispchksum, vnd_neti_isvchksum }; static net_protocol_t vnd_neti_info_v6 = { NETINFO_VERSION, NHF_VND_INET6, vnd_neti_getifname, vnd_neti_getmtu, vnd_neti_getptmue, vnd_neti_getlifaddr, vnd_neti_getlifzone, vnd_neti_getlifflags, vnd_neti_phygetnext, vnd_neti_phylookup, vnd_neti_lifgetnext, vnd_neti_inject, vnd_neti_route, vnd_neti_ispchksum, vnd_neti_isvchksum }; static int vnd_netinfo_init(vnd_pnsd_t *nsp) { nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid, &vnd_neti_info_v4); ASSERT(nsp->vpnd_neti_v4 != NULL); nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid, &vnd_neti_info_v6); ASSERT(nsp->vpnd_neti_v6 != NULL); nsp->vpnd_family_v4.hf_version = HOOK_VERSION; nsp->vpnd_family_v4.hf_name = "vnd_inet"; if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) { (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_family_v6.hf_version = HOOK_VERSION; nsp->vpnd_family_v6.hf_name = "vnd_inet6"; if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) { (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_event_in_v4.he_version = HOOK_VERSION; nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN; nsp->vpnd_event_in_v4.he_flags = 0; nsp->vpnd_event_in_v4.he_interested = B_FALSE; nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); if (nsp->vpnd_token_in_v4 == NULL) { (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: 
net_event_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_event_in_v6.he_version = HOOK_VERSION; nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN; nsp->vpnd_event_in_v6.he_flags = 0; nsp->vpnd_event_in_v6.he_interested = B_FALSE; nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); if (nsp->vpnd_token_in_v6 == NULL) { (void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_event_out_v4.he_version = HOOK_VERSION; nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT; nsp->vpnd_event_out_v4.he_flags = 0; nsp->vpnd_event_out_v4.he_interested = B_FALSE; nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); if (nsp->vpnd_token_out_v4 == NULL) { (void) net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); (void) net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); (void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } nsp->vpnd_event_out_v6.he_version = HOOK_VERSION; nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT; nsp->vpnd_event_out_v6.he_flags = 0; nsp->vpnd_event_out_v6.he_interested = B_FALSE; nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); if (nsp->vpnd_token_out_v6 == NULL) { (void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); (void) net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); (void) net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); (void) net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); (void) net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); (void) net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); (void) net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); (void) net_protocol_unregister(nsp->vpnd_neti_v4); (void) net_protocol_unregister(nsp->vpnd_neti_v6); cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " "failed for stack %d", nsp->vpnd_nsid); return (1); } return (0); } static void vnd_netinfo_shutdown(vnd_pnsd_t *nsp) { int ret; ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); VERIFY(ret == 0); ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); VERIFY(ret == 0); ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); VERIFY(ret == 0); ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); VERIFY(ret == 0); } static void vnd_netinfo_fini(vnd_pnsd_t *nsp) { int ret; ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); VERIFY(ret == 0); ret = net_event_unregister(nsp->vpnd_neti_v4,
&nsp->vpnd_event_out_v4); VERIFY(ret == 0); ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); VERIFY(ret == 0); ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); VERIFY(ret == 0); ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); VERIFY(ret == 0); ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); VERIFY(ret == 0); ret = net_protocol_unregister(nsp->vpnd_neti_v4); VERIFY(ret == 0); ret = net_protocol_unregister(nsp->vpnd_neti_v6); VERIFY(ret == 0); } /* ARGSUSED */ static void vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy) { vnd_str_t *vsp = arg; VERIFY(bmp == &vsp->vns_barrierblk); mutex_enter(&vsp->vns_lock); VERIFY(vsp->vns_flags & VNS_F_BARRIER); VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE)); vsp->vns_flags |= VNS_F_BARRIER_DONE; mutex_exit(&vsp->vns_lock); /* * For better or worse, we have to broadcast here as we could have a * thread that's blocked for completion as well as one that's blocked * waiting to do a barrier itself. */ cv_broadcast(&vsp->vns_barriercv); } /* * This is a data barrier for the stream while it is in fastpath mode. It blocks * and ensures that there is nothing else in the squeue. */ static void vnd_strbarrier(vnd_str_t *vsp) { mutex_enter(&vsp->vns_lock); while (vsp->vns_flags & VNS_F_BARRIER) cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); vsp->vns_flags |= VNS_F_BARRIER; mutex_exit(&vsp->vns_lock); gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk, vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER); mutex_enter(&vsp->vns_lock); while (!(vsp->vns_flags & VNS_F_BARRIER_DONE)) cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); vsp->vns_flags &= ~VNS_F_BARRIER; vsp->vns_flags &= ~VNS_F_BARRIER_DONE; mutex_exit(&vsp->vns_lock); /* * We have to broadcast in case anyone is waiting for the barrier * themselves. */ cv_broadcast(&vsp->vns_barriercv); } /* * Based on the type of message that we're dealing with we're going to want to * do one of several things. Basically if it looks like it's something we know * about, we should probably handle it in one of our transition threads. * Otherwise, we should just simply putnext. */ static int vnd_s_rput(queue_t *q, mblk_t *mp) { t_uscalar_t prim; int dispatch = 0; vnd_str_t *vsp = q->q_ptr; switch (DB_TYPE(mp)) { case M_PROTO: case M_PCPROTO: if (MBLKL(mp) < sizeof (t_uscalar_t)) { vnd_drop_ctl(vsp, mp, "PROTO message too short"); break; } prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) { vnd_drop_ctl(vsp, mp, "received an unsupported dlpi DATA req"); break; } /* * Enqueue the entry and fire off a taskq dispatch.
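 *
 * At most one taskq entry is ever outstanding per stream:
 * VNS_F_TASKQ_DISPATCHED is set here under vns_lock and is only cleared by
 * vnd_dlpi_taskq_dispatch() once vns_dlpi_inc has been drained, so
 * concurrent calls simply append to the pending list.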
*/ mutex_enter(&vsp->vns_lock); vnd_dlpi_inc_push(vsp, mp); if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { dispatch = 1; vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; } mutex_exit(&vsp->vns_lock); if (dispatch != 0) taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, &vsp->vns_tqe); break; case M_DATA: vnd_drop_in(vsp, mp, "M_DATA via put(9E)"); break; default: putnext(vsp->vns_rq, mp); } return (0); } /* ARGSUSED */ static void vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp) { int error; vnd_strioc_t *visp; if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE || iocp->ioc_count != TRANSPARENT) { error = EINVAL; goto nak; } /* * All streams ioctls that we support must use kcred as a means to * distinguish that this is a layered open by the kernel as opposed to * one by a user who has done an I_PUSH of the module. */ if (iocp->ioc_cr != kcred) { error = EPERM; goto nak; } if (mp->b_cont == NULL) { error = EAGAIN; goto nak; } visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP); ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t)); visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr; visp->vs_state = VSS_COPYIN; mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL); qreply(q, mp); return; nak: if (mp->b_cont != NULL) { freemsg(mp->b_cont); mp->b_cont = NULL; } iocp->ioc_error = error; mp->b_datap->db_type = M_IOCNAK; iocp->ioc_count = 0; qreply(q, mp); } static void vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) { vnd_str_state_t state; struct copyreq *crp; vnd_strioc_associate_t *vss; vnd_dev_t *vdp = NULL; vnd_pnsd_t *nsp = NULL; char iname[2*VND_NAMELEN]; zone_t *zone; vnd_strioc_t *visp; visp = (vnd_strioc_t *)csp->cp_private; /* If it's not ours, it's not our problem */ if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { if (q->q_next != NULL) { putnext(q, mp); } else { VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); } kmem_free(visp, sizeof (vnd_strioc_t)); return; } /* The nak is already sent for us */ if (csp->cp_rval != 0) { vnd_drop_ctl(vsp, mp, "M_COPYIN failed"); kmem_free(visp, sizeof (vnd_strioc_t)); return; } /* Data is sitting for us in b_cont */ if (mp->b_cont == NULL || MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) { kmem_free(visp, sizeof (vnd_strioc_t)); miocnak(q, mp, 0, EINVAL); return; } vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr; vdp = vnd_dev_lookup(vss->vsa_minor); if (vdp == NULL) { vss->vsa_errno = VND_E_NODEV; goto nak; } nsp = vnd_nsd_lookup(vss->vsa_nsid); if (nsp == NULL) { vss->vsa_errno = VND_E_NONETSTACK; goto nak; } mutex_enter(&vsp->vns_lock); if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) { mutex_exit(&vsp->vns_lock); vss->vsa_errno = VND_E_ASSOCIATED; goto nak; } vsp->vns_nsd = nsp; vsp->vns_flags &= ~VNS_F_NEED_ZONE; vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; mutex_exit(&vsp->vns_lock); taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, &vsp->vns_tqe); /* At this point we need to wait until we have transitioned to ONLINE */ mutex_enter(&vsp->vns_lock); while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE) cv_wait(&vsp->vns_stcv, &vsp->vns_lock); state = vsp->vns_state; mutex_exit(&vsp->vns_lock); if (state == VNS_S_ZOMBIE) { vss->vsa_errno = vsp->vns_errno; goto nak; } mutex_enter(&vdp->vdd_lock); mutex_enter(&vsp->vns_lock); VERIFY(vdp->vdd_str == NULL); /* * Now initialize the remaining kstat properties and let's go ahead and * create it.
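 *
 * The instance name encodes the zone and minor number, e.g. minor 5 in
 * zone 3 becomes "z3_5" under the "vnd" module. The kstat is created in
 * the global zone so that it remains observable system-wide.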
*/ (void) snprintf(iname, sizeof (iname), "z%d_%d", vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor); vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net", KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); if (vsp->vns_kstat == NULL) { vss->vsa_errno = VND_E_KSTATCREATE; mutex_exit(&vsp->vns_lock); mutex_exit(&vdp->vdd_lock); goto nak; } vdp->vdd_str = vsp; vsp->vns_dev = vdp; /* * Now, it's time to do the last thing that can fail, changing out the * input function. After this we know that we can receive data, so we * should make sure that we're ready. */ if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) { vss->vsa_errno = VND_E_DIRECTFAIL; vdp->vdd_str = NULL; vsp->vns_dev = NULL; mutex_exit(&vsp->vns_lock); mutex_exit(&vdp->vdd_lock); goto nak; } zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid); ASSERT(zone != NULL); vsp->vns_kstat->ks_data = &vsp->vns_ksdata; /* Account for zone name */ vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1; /* Account for eventual link name */ vsp->vns_kstat->ks_data_size += VND_NAMELEN; kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name); kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, vdp->vdd_lname); zone_rele(zone); kstat_install(vsp->vns_kstat); mutex_exit(&vsp->vns_lock); mutex_exit(&vdp->vdd_lock); /* * Note that the vnd_str_t does not keep a permanent hold on the * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what * the netstack goes through to take care of everything. */ vss->vsa_errno = VND_E_SUCCESS; nak: if (vdp != NULL) vnd_dev_rele(vdp); if (nsp != NULL) vnd_nsd_rele(nsp); /* * Change the copyin request to a copyout. Note that we can't use * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's * okay, as the copyin vs. copyout is basically the same.
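 *
 * Both requests share the struct copyreq layout, so the retyping below
 * amounts to setting DB_TYPE to M_COPYOUT, pointing cq_addr back at the
 * user address saved in vs_addr, and sizing the transfer for the
 * vnd_strioc_associate_t we are handing back.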
*/ DB_TYPE(mp) = M_COPYOUT; visp->vs_state = VSS_COPYOUT; crp = (struct copyreq *)mp->b_rptr; crp->cq_private = (void *)visp; crp->cq_addr = visp->vs_addr; crp->cq_size = sizeof (vnd_strioc_associate_t); qreply(q, mp); } static void vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) { ASSERT(csp->cp_private != NULL); kmem_free(csp->cp_private, sizeof (vnd_strioc_t)); if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { if (q->q_next != NULL) { putnext(q, mp); } else { VND_STAT_INC(vsp, vks_ndlpidrops, 1); VND_STAT_INC(vsp, vks_tdrops, 1); vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); } return; } /* The nak is already sent for us */ if (csp->cp_rval != 0) { vnd_drop_ctl(vsp, mp, "M_COPYOUT failed"); return; } /* Ack and let's be done with it all */ miocack(q, mp, 0, 0); } static int vnd_s_wput(queue_t *q, mblk_t *mp) { vnd_str_t *vsp = q->q_ptr; struct copyresp *crp; vnd_strioc_state_t vstate; vnd_strioc_t *visp; switch (DB_TYPE(mp)) { case M_IOCTL: vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr); return (0); case M_IOCDATA: crp = (struct copyresp *)mp->b_rptr; ASSERT(crp->cp_private != NULL); visp = (vnd_strioc_t *)crp->cp_private; vstate = visp->vs_state; ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT); if (vstate == VSS_COPYIN) vnd_striocdata(q, vsp, mp, (struct copyresp *)mp->b_rptr); else vnd_stroutdata(q, vsp, mp, (struct copyresp *)mp->b_rptr); return (0); default: break; } if (q->q_next != NULL) putnext(q, mp); else vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput"); return (0); } /* ARGSUSED */ static int vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp) { vnd_str_t *vsp; uint_t rand; if (q->q_ptr != NULL) return (EINVAL); if (!(sflag & MODOPEN)) return (ENXIO); if (credp != kcred) return (EPERM); vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP); bzero(vsp, sizeof (*vsp)); mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL); cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL); cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL); cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL); vsp->vns_state = VNS_S_INITIAL; mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL); mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL); mutex_enter(&vnd_dev_lock); vsp->vns_dq_read.vdq_max = vnd_vdq_default_size; vsp->vns_dq_read.vdq_vns = vsp; vsp->vns_dq_write.vdq_max = vnd_vdq_default_size; vsp->vns_dq_write.vdq_vns = vsp; mutex_exit(&vnd_dev_lock); vsp->vns_rq = q; vsp->vns_wq = WR(q); q->q_ptr = WR(q)->q_ptr = vsp; vsp->vns_flags = VNS_F_NEED_ZONE; vsp->vns_nflush = vnd_flush_nburst; vsp->vns_bsize = vnd_flush_burst_size; (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand); /* * We create our kstat and initialize all of its fields now, but we * don't install it until we actually do the zone association so we can * get everything.
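 *
 * Until kstat_install() runs in vnd_striocdata(), the kstat is not visible
 * to any consumer, so the named entries below can be initialized without
 * additional locking.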
*/ kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname", KSTAT_DATA_STRING); kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms, "flowcontrol_100ms", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s", KSTAT_DATA_UINT64); kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s", KSTAT_DATA_UINT64); qprocson(q); /* * Now that we've called qprocson, grab the lower module for making sure * that we don't have any pass through modules. */ vsp->vns_lrq = RD(vsp->vns_wq->q_next); return (0); } /* ARGSUSED */ static int vnd_s_close(queue_t *q, int flag, cred_t *credp) { vnd_str_t *vsp; mblk_t *mp; VERIFY(WR(q)->q_next != NULL); vsp = q->q_ptr; ASSERT(vsp != NULL); /* * We need to transition ourselves down. This means that we have a few * important different things to do in the process of tearing down our * input and output buffers, making sure we've drained the current * squeue, and disabling the fast path. Before we disable the fast path, * we should make sure the squeue is drained. Because we're in streams * close, we know that no packets can come into us from userland, but we * can receive more. As such, the following is the exact order of things * that we do: * * 1) flush the vns_dq_read * 2) Insert the drain mblk * 3) When it's been received, tear down the fast path by kicking * off the state machine. 
* 4) One final flush of both the vns_dq_read and vns_dq_write */ vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); vnd_strbarrier(vsp); mutex_enter(&vsp->vns_lock); vsp->vns_state = VNS_S_SHUTTING_DOWN; if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, &vsp->vns_tqe); } while (vsp->vns_state != VNS_S_ZOMBIE) cv_wait(&vsp->vns_stcv, &vsp->vns_lock); mutex_exit(&vsp->vns_lock); qprocsoff(q); mutex_enter(&vsp->vns_lock); vsp->vns_flags |= VNS_F_CONDEMNED; while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock); while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) vnd_drop_ctl(vsp, mp, "vnd_s_close"); mutex_exit(&vsp->vns_lock); q->q_ptr = NULL; vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out); mutex_destroy(&vsp->vns_dq_read.vdq_lock); mutex_destroy(&vsp->vns_dq_write.vdq_lock); if (vsp->vns_kstat != NULL) kstat_delete(vsp->vns_kstat); mutex_destroy(&vsp->vns_lock); cv_destroy(&vsp->vns_stcv); cv_destroy(&vsp->vns_barriercv); cv_destroy(&vsp->vns_cancelcv); kmem_cache_free(vnd_str_cache, vsp); return (0); } static vnd_mac_cookie_t vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp) { hrtime_t txtime; vnd_mac_cookie_t vc; VND_STAT_INC(vsp, vks_opackets, 1); VND_STAT_INC(vsp, vks_obytes, msgsize(mp)); DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL, vnd_str_t *, vsp, mblk_t *, mp); /* Actually tx now */ txtime = gethrtime(); vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, mp, 0, MAC_DROP_ON_NO_DESC); /* * We need to check two different conditions before we immediately set * the flow control flag. The first thing that we need to do is verify * that this is an instance of hard flow control, so to say. The flow * control callbacks won't always fire in cases where we still get a * cookie returned. The explicit check for flow control will guarantee * us that we'll get a subsequent notification callback. * * The second case comes about because we do not hold the * vnd_str_t`vns_lock across calls to tx; we need to determine if a flow * control notification already came across for us in a different thread * calling vnd_mac_flow_control(). To deal with this, we record a * timestamp every time that we change the flow control state. We grab * txtime here before we transmit because that guarantees that the * hrtime_t of the call to vnd_mac_flow_control() will be after txtime. * * If the flow control notification beat us to the punch, the value of * vns_fcupdate will be larger than the value of txtime, and we should * just record the statistics. However, if we didn't beat it to the * punch (txtime > vns_fcupdate), then we know that it's safe to wait * for a notification.
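 *
 * As an illustrative timeline: if we record txtime at t=100, the transmit
 * consumes the last descriptor, and vnd_mac_flow_control() fires at t=150
 * (updating vns_fcupdate) before we reacquire vns_lock below, then
 * vns_fcupdate - txtime > 0 and we only record statistics. Otherwise the
 * difference is negative and we latch VNS_F_FLOW_CONTROLLED, trusting the
 * pending notification to clear it.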
*/ if (vc != (vnd_mac_cookie_t)NULL) { hrtime_t diff; if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl, vc) == 0) return ((vnd_mac_cookie_t)NULL); mutex_enter(&vsp->vns_lock); diff = vsp->vns_fcupdate - txtime; if (diff > 0) { mutex_exit(&vsp->vns_lock); vnd_mac_flow_control_stat(vsp, diff); return ((vnd_mac_cookie_t)NULL); } vsp->vns_flags |= VNS_F_FLOW_CONTROLLED; vsp->vns_caps.vsc_fc_cookie = vc; vsp->vns_fclatch = txtime; vsp->vns_fcupdate = txtime; DTRACE_VND3(flow__blocked, vnd_str_t *, vsp, uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc); mutex_exit(&vsp->vns_lock); } return (vc); } /* ARGSUSED */ static void vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) { mblk_t *mp; int nmps; size_t mptot, nflush, bsize; boolean_t blocked, empty; vnd_data_queue_t *vqp; vnd_str_t *vsp = arg; mutex_enter(&vsp->vns_lock); /* * We either enter here via a squeue or via vnd_squeue_tx_append(). In * the former case we need to mark that there is no longer an active * user of the drain block. */ if (drain_mp != NULL) { VERIFY(drain_mp == &vsp->vns_drainblk); VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED); vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED; } /* * If we're still flow controlled or under a flush barrier, nothing to * do. */ if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) { mutex_exit(&vsp->vns_lock); return; } nflush = vsp->vns_nflush; bsize = vsp->vns_bsize; mutex_exit(&vsp->vns_lock); /* * We're potentially going deep into the networking layer; make sure the * guest can't run concurrently. */ smt_begin_unsafe(); nmps = 0; mptot = 0; blocked = B_FALSE; vqp = &vsp->vns_dq_write; while (nmps < nflush && mptot <= bsize) { mutex_enter(&vqp->vdq_lock); if (vnd_dq_pop(vqp, &mp) == 0) { mutex_exit(&vqp->vdq_lock); break; } mutex_exit(&vqp->vdq_lock); nmps++; mptot += msgsize(mp); if (vnd_squeue_tx_one(vsp, mp) != (vnd_mac_cookie_t)NULL) { blocked = B_TRUE; break; } } smt_end_unsafe(); empty = vnd_dq_is_empty(&vsp->vns_dq_write); /* * If the queue is not empty, we're not blocked, and there isn't a drain * scheduled, put it into the squeue with the drain block and * GSQUEUE_FILL. */ if (blocked == B_FALSE && empty == B_FALSE) { mutex_enter(&vsp->vns_lock); if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) { mblk_t *mp = &vsp->vns_drainblk; vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; gsqueue_enter_one(vsp->vns_squeue, mp, vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN); } mutex_exit(&vsp->vns_lock); } /* * If we drained some amount of data, we need to signal the data queue. */ if (nmps > 0) { cv_broadcast(&vsp->vns_dq_write.vdq_ready); pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT); } } /* ARGSUSED */ static void vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy) { vnd_str_t *vsp = arg; vnd_data_queue_t *vqp = &vsp->vns_dq_write; vnd_pnsd_t *nsp = vsp->vns_nsd; size_t len = msgsize(mp); /* * Before we append this packet, we should run it through the firewall * rules. */ if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6, nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out, vnd_drop_out) != 0) { /* * Because we earlier reserved space for this packet and it's * not making the cut, we need to go through and unreserve that * space. Also note that the message block will likely be freed * by the time we return from vnd_hook so we cannot rely on it.
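 *
 * For reference, the reservation protocol at work here, in sketch form:
 * the producer does
 *
 *	mutex_enter(&vqp->vdq_lock);
 *	ok = vnd_dq_reserve(vqp, msgsize(mp));
 *	mutex_exit(&vqp->vdq_lock);
 *
 * before entering the squeue, and this function then either pushes the
 * packet with reserved == B_TRUE (skipping the accounting that was already
 * done) or, as in this failure path, returns the space with
 * vnd_dq_unreserve().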
*/ mutex_enter(&vqp->vdq_lock); vnd_dq_unreserve(vqp, len); mutex_exit(&vqp->vdq_lock); return; } /* * We earlier reserved space for this packet. So for now simply append * it and call drain. We know that no other drain can be going on right * now thanks to the squeue. */ mutex_enter(&vqp->vdq_lock); (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic); mutex_exit(&vqp->vdq_lock); vnd_squeue_tx_drain(vsp, NULL, NULL, NULL); } /* * We need to see if this is a valid name of sorts for us. That means a few * things. First off, we can't assume that what we've been given has actually * been null terminated. More importantly, that it's a valid name as far as * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We * further constrain ourselves to simply alphanumeric characters and a few * additional ones, ':', '-', and '_'. */ static int vnd_validate_name(const char *buf, size_t buflen) { int i, len; /* First make sure a null terminator exists */ for (i = 0; i < buflen; i++) if (buf[i] == '\0') break; len = i; if (i == 0 || i == buflen) return (0); for (i = 0; i < len; i++) if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' && buf[i] != '_') return (0); return (1); } static int vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag) { vnd_ioc_attach_t via; vnd_strioc_associate_t vss; vnd_pnsd_t *nsp; zone_t *zonep; zoneid_t zid; char buf[2*VND_NAMELEN]; int ret, rp; if (secpolicy_net_config(credp, B_FALSE) != 0) return (EPERM); if (secpolicy_net_rawaccess(credp) != 0) return (EPERM); if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0) return (EFAULT); via.via_errno = VND_E_SUCCESS; if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) { via.via_errno = VND_E_BADNAME; ret = EIO; goto errcopyout; } /* * Only the global zone can request to create a device in a different * zone. */ zid = crgetzoneid(credp); if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 && zid != via.via_zoneid) { via.via_errno = VND_E_PERM; ret = EIO; goto errcopyout; } if (via.via_zoneid == -1) via.via_zoneid = zid; /* * Establish the name we'll use now. We want to be extra paranoid about * the device we're opening so check that now. */ if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) { zonep = zone_find_by_id(via.via_zoneid); if (zonep == NULL) { via.via_errno = VND_E_NOZONE; ret = EIO; goto errcopyout; } if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name, via.via_name) >= sizeof (buf)) { zone_rele(zonep); via.via_errno = VND_E_BADNAME; ret = EIO; goto errcopyout; } (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s", zonep->zone_name, via.via_name); zone_rele(zonep); zonep = NULL; } else { if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >= sizeof (buf)) { via.via_errno = VND_E_BADNAME; ret = EIO; goto errcopyout; } (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name); } /* * If our zone is dying then the netstack will have been removed from * this list. */ nsp = vnd_nsd_lookup_by_zid(via.via_zoneid); if (nsp == NULL) { via.via_errno = VND_E_NOZONE; ret = EIO; goto errcopyout; } /* * Note we set the attached handle even though we haven't actually * finished the process of attaching the ldi handle. 
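 *
 * The VND_D_ATTACH_INFLIGHT flag set below provides that exclusion: a
 * competing attach of this vnd_dev_t will fail with VND_E_ATTACHED until
 * we either complete and set VND_D_ATTACHED or unwind in the error path.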
*/ mutex_enter(&vdp->vdd_lock); if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) { mutex_exit(&vdp->vdd_lock); vnd_nsd_rele(nsp); via.via_errno = VND_E_ATTACHED; ret = EIO; goto errcopyout; } vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT; ASSERT(vdp->vdd_cr == NULL); crhold(credp); vdp->vdd_cr = credp; ASSERT(vdp->vdd_nsd == NULL); vdp->vdd_nsd = nsp; mutex_exit(&vdp->vdd_lock); /* * Place an additional hold on the vnd_pnsd_t as we go through and do * all of the rest of our work. This will be the hold that we keep for * as long as this thing is attached. */ vnd_nsd_ref(nsp); ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr, &vdp->vdd_ldih, vdp->vdd_ldiid); if (ret != 0) { if (ret == ENODEV) via.via_errno = VND_E_NODATALINK; goto err; } /* * Unfortunately the I_PUSH interface doesn't allow us a way to detect * whether or not we're coming in from a layered device. We really want * to make sure that a normal user can't push on our streams module. * Currently the only idea I have for this is to make sure that the * credp is kcred which is really terrible. */ ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL, kcred, &rp); if (ret != 0) { rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); VERIFY(rp == 0); via.via_errno = VND_E_STRINIT; ret = EIO; goto err; } vss.vsa_minor = vdp->vdd_minor; vss.vsa_nsid = nsp->vpnd_nsid; ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss, FKIOCTL, kcred, &rp); if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) { rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); VERIFY(rp == 0); if (ret == 0) { via.via_errno = vss.vsa_errno; ret = EIO; } goto err; } mutex_enter(&vdp->vdd_nsd->vpnd_lock); /* * There's a chance that our netstack was condemned while we've had a * hold on it. As such we need to check and if so, error out. */ if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) { mutex_exit(&vdp->vdd_nsd->vpnd_lock); rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); VERIFY(rp == 0); ret = EIO; via.via_errno = VND_E_NOZONE; goto err; } mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_str != NULL); vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; vdp->vdd_flags |= VND_D_ATTACHED; (void) strlcpy(vdp->vdd_datalink, via.via_name, sizeof (vdp->vdd_datalink)); list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp); mutex_exit(&vdp->vdd_lock); mutex_exit(&vdp->vdd_nsd->vpnd_lock); vnd_nsd_rele(nsp); return (0); err: mutex_enter(&vdp->vdd_lock); vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; crfree(vdp->vdd_cr); vdp->vdd_cr = NULL; vdp->vdd_nsd = NULL; mutex_exit(&vdp->vdd_lock); /* * We have two holds to drop here. One for our original reference and * one for the hold this operation would have represented. 
vnd_nsd_rele(nsp); vnd_nsd_rele(nsp); errcopyout: if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0) ret = EFAULT; return (ret); } static int vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) { int ret = 0; vnd_ioc_link_t vil; char mname[2*VND_NAMELEN]; char **c; vnd_dev_t *v; zoneid_t zid; /* Not anyone can link something */ if (secpolicy_net_config(credp, B_FALSE) != 0) return (EPERM); if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0) return (EFAULT); if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) { ret = EIO; vil.vil_errno = VND_E_BADNAME; goto errcopyout; } c = vnd_reserved_names; while (*c != NULL) { if (strcmp(vil.vil_name, *c) == 0) { ret = EIO; vil.vil_errno = VND_E_BADNAME; goto errcopyout; } c++; } mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); ret = EIO; vil.vil_errno = VND_E_NOTATTACHED; goto errcopyout; } if (vdp->vdd_flags & VND_D_ZONE_DYING) { mutex_exit(&vdp->vdd_lock); ret = EIO; vil.vil_errno = VND_E_NOZONE; goto errcopyout; } if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) { mutex_exit(&vdp->vdd_lock); ret = EIO; vil.vil_errno = VND_E_LINKED; goto errcopyout; } vdp->vdd_flags |= VND_D_LINK_INFLIGHT; zid = vdp->vdd_nsd->vpnd_zid; mutex_exit(&vdp->vdd_lock); if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >= sizeof (mname)) { ret = EIO; vil.vil_errno = VND_E_BADNAME; goto errcopyout; } mutex_enter(&vnd_dev_lock); for (v = list_head(&vnd_dev_list); v != NULL; v = list_next(&vnd_dev_list, v)) { if (!(v->vdd_flags & VND_D_LINKED)) continue; if (v->vdd_nsd->vpnd_zid == zid && strcmp(v->vdd_lname, vil.vil_name) == 0) { mutex_exit(&vnd_dev_lock); ret = EIO; vil.vil_errno = VND_E_LINKEXISTS; goto error; } } /* * We set the name and mark ourselves attached while holding the list * lock to ensure that no other user can mistakenly find our name. */ (void) snprintf(mname, sizeof (mname), "z%d:%s", zid, vil.vil_name); mutex_enter(&vdp->vdd_lock); /* * Because we dropped our lock, we need to double check whether or not * the zone was marked as dying while we were here. If it hasn't, then * it's safe for us to link it in. */ if (vdp->vdd_flags & VND_D_ZONE_DYING) { mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); ret = EIO; vil.vil_errno = VND_E_NOZONE; goto error; } (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname)); if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor, DDI_PSEUDO, 0) != DDI_SUCCESS) { ret = EIO; vil.vil_errno = VND_E_MINORNODE; } else { vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; vdp->vdd_flags |= VND_D_LINKED; kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, vdp->vdd_lname); ret = 0; } mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); if (ret == 0) { /* * Add a reference to represent that this device is linked into * the file system name space to ensure that it doesn't * disappear. */ vnd_dev_ref(vdp); return (0); } error: mutex_enter(&vdp->vdd_lock); vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; vdp->vdd_lname[0] = '\0'; mutex_exit(&vdp->vdd_lock); errcopyout: if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0) return (EFAULT); return (ret); } /* * Common unlink function. This is used both from the ioctl path and from the * netstack shutdown path. The caller is required to hold the mutex on the * vnd_dev_t, but they basically will have it relinquished for them. The only * thing the caller is allowed to do afterward is to potentially rele the * vnd_dev_t if they have their own hold.
Note that only the ioctl path has its * own hold. */ static void vnd_dev_unlink(vnd_dev_t *vdp) { char mname[2*VND_NAMELEN]; ASSERT(MUTEX_HELD(&vdp->vdd_lock)); (void) snprintf(mname, sizeof (mname), "z%d:%s", vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname); ddi_remove_minor_node(vnd_dip, mname); vdp->vdd_lname[0] = '\0'; vdp->vdd_flags &= ~VND_D_LINKED; kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, vdp->vdd_lname); mutex_exit(&vdp->vdd_lock); /* * This rele corresponds to the reference that we took in * vnd_ioctl_link. */ vnd_dev_rele(vdp); } static int vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) { int ret; zoneid_t zid; vnd_ioc_unlink_t viu; /* Not anyone can unlink something */ if (secpolicy_net_config(credp, B_FALSE) != 0) return (EPERM); zid = crgetzoneid(credp); if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0) return (EFAULT); viu.viu_errno = VND_E_SUCCESS; mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_LINKED)) { mutex_exit(&vdp->vdd_lock); ret = EIO; viu.viu_errno = VND_E_NOTLINKED; goto err; } VERIFY(vdp->vdd_flags & VND_D_ATTACHED); if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { mutex_exit(&vdp->vdd_lock); ret = EIO; viu.viu_errno = VND_E_PERM; goto err; } /* vnd_dev_unlink releases the vdp mutex for us */ vnd_dev_unlink(vdp); ret = 0; err: if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0) return (EFAULT); return (ret); } static int vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { int ret; vnd_ioc_buf_t vib; if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) return (EFAULT); mutex_enter(&vnd_dev_lock); if (vib.vib_size > vnd_vdq_hard_max) { mutex_exit(&vnd_dev_lock); vib.vib_errno = VND_E_BUFTOOBIG; ret = EIO; goto err; } mutex_exit(&vnd_dev_lock); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_NOTATTACHED; ret = EIO; goto err; } mutex_enter(&vdp->vdd_str->vns_lock); if (vib.vib_size < vdp->vdd_str->vns_minwrite) { mutex_exit(&vdp->vdd_str->vns_lock); mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_BUFTOOSMALL; ret = EIO; goto err; } mutex_exit(&vdp->vdd_str->vns_lock); mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); vdp->vdd_str->vns_dq_read.vdq_max = (size_t)vib.vib_size; mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); mutex_exit(&vdp->vdd_lock); ret = 0; err: if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (ret); } static int vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { int ret; vnd_ioc_buf_t vib; mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_NOTATTACHED; ret = EIO; goto err; } mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max; mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); mutex_exit(&vdp->vdd_lock); ret = 0; err: if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (ret); } /* ARGSUSED */ static int vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { vnd_ioc_buf_t vib; mutex_enter(&vnd_dev_lock); vib.vib_size = vnd_vdq_hard_max; mutex_exit(&vnd_dev_lock); if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (0); } static int vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { int ret; vnd_ioc_buf_t vib; mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vib.vib_errno = 
VND_E_NOTATTACHED; ret = EIO; goto err; } mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max; mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); mutex_exit(&vdp->vdd_lock); ret = 0; err: if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (ret); } static int vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) { int ret; vnd_ioc_buf_t vib; if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) return (EFAULT); mutex_enter(&vnd_dev_lock); if (vib.vib_size > vnd_vdq_hard_max) { mutex_exit(&vnd_dev_lock); vib.vib_errno = VND_E_BUFTOOBIG; ret = EIO; goto err; } mutex_exit(&vnd_dev_lock); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_NOTATTACHED; ret = EIO; goto err; } mutex_enter(&vdp->vdd_str->vns_lock); if (vib.vib_size < vdp->vdd_str->vns_minwrite) { mutex_exit(&vdp->vdd_str->vns_lock); mutex_exit(&vdp->vdd_lock); vib.vib_errno = VND_E_BUFTOOSMALL; ret = EIO; goto err; } mutex_exit(&vdp->vdd_str->vns_lock); mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); vdp->vdd_str->vns_dq_write.vdq_max = (size_t)vib.vib_size; mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); mutex_exit(&vdp->vdd_lock); ret = 0; err: if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) return (EFAULT); return (ret); } static int vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min) { vnd_ioc_buf_t vib; vib.vib_errno = 0; mutex_enter(&vdp->vdd_lock); if (vdp->vdd_flags & VND_D_ATTACHED) { mutex_enter(&vdp->vdd_str->vns_lock); if (min == B_TRUE) vib.vib_size = vdp->vdd_str->vns_minwrite; else vib.vib_size = vdp->vdd_str->vns_maxwrite; mutex_exit(&vdp->vdd_str->vns_lock); } else { vib.vib_errno = VND_E_NOTATTACHED; } mutex_exit(&vdp->vdd_lock); if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0) return (EFAULT); return (0); } static int vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode) { int ret, nonblock, nwrite; frameio_t *fio; vnd_data_queue_t *vqp; mblk_t *mp; fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); if (fio == NULL) return (EAGAIN); ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr, mode); if (ret != 0) { frameio_free(fio); return (ret); } mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); frameio_free(fio); return (ENXIO); } mutex_exit(&vdp->vdd_lock); nonblock = mode & (FNONBLOCK | FNDELAY); vqp = &vdp->vdd_str->vns_dq_read; mutex_enter(&vqp->vdq_lock); /* Check empty case */ if (vqp->vdq_cur == 0) { if (nonblock != 0) { mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (EWOULDBLOCK); } while (vqp->vdq_cur == 0) { if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (EINTR); } } } ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head, &nwrite, mode & FKIOCTL); if (ret != 0) { mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (ret); } ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode); if (ret != 0) { mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (ret); } while (nwrite > 0) { (void) vnd_dq_pop(vqp, &mp); freemsg(mp); nwrite--; } mutex_exit(&vqp->vdq_lock); frameio_free(fio); return (0); } static int vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode) { frameio_t *fio; int ret, nonblock, nframes, i, nread; size_t maxwrite, minwrite, total, flen; mblk_t *mp_chain, *mp, *nmp; vnd_data_queue_t *vqp; fio = 
frameio_alloc(KM_NOSLEEP | KM_NORMALPRI);
	if (fio == NULL)
		return (EAGAIN);

	ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode);
	if (ret != 0) {
		frameio_free(fio);
		return (ret);
	}

	mutex_enter(&vdp->vdd_lock);
	if (!(vdp->vdd_flags & VND_D_ATTACHED)) {
		mutex_exit(&vdp->vdd_lock);
		frameio_free(fio);
		return (ENXIO);
	}
	mutex_exit(&vdp->vdd_lock);

	nonblock = mode & (FNONBLOCK | FNDELAY);

	/*
	 * Make sure no single frame is larger than we can accept.
	 */
	mutex_enter(&vdp->vdd_str->vns_lock);
	minwrite = vdp->vdd_str->vns_minwrite;
	maxwrite = vdp->vdd_str->vns_maxwrite;
	mutex_exit(&vdp->vdd_str->vns_lock);

	/* The total vector count divided by vectors-per-frame gives frames. */
	nframes = fio->fio_nvecs / fio->fio_nvpf;
	total = 0;
	for (i = 0; i < nframes; i++) {
		flen = frameio_frame_length(fio,
		    &fio->fio_vecs[i*fio->fio_nvpf]);
		if (flen < minwrite || flen > maxwrite) {
			frameio_free(fio);
			return (ERANGE);
		}
		total += flen;
	}

	vqp = &vdp->vdd_str->vns_dq_write;
	mutex_enter(&vqp->vdq_lock);
	while (vnd_dq_reserve(vqp, total) == 0) {
		if (nonblock != 0) {
			mutex_exit(&vqp->vdq_lock);
			frameio_free(fio);
			return (EAGAIN);
		}
		if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) {
			mutex_exit(&vqp->vdq_lock);
			frameio_free(fio);
			return (EINTR);
		}
	}
	mutex_exit(&vqp->vdq_lock);

	/*
	 * We've reserved our space, let's copyin and go from here.
	 */
	ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL);
	if (ret != 0) {
		frameio_free(fio);
		vnd_dq_unreserve(vqp, total);
		cv_broadcast(&vqp->vdq_ready);
		pollwakeup(&vdp->vdd_ph, POLLOUT);
		return (ret);
	}

	for (mp = mp_chain; mp != NULL; mp = nmp) {
		nmp = mp->b_next;
		mp->b_next = NULL;
		gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp,
		    vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS,
		    VND_SQUEUE_TAG_VND_WRITE);
	}

	/*
	 * Update the frameio structure to indicate that we wrote those frames.
	 */
	frameio_mark_consumed(fio, nread);
	ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode);
	frameio_free(fio);
	return (ret);
}

static int
vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode)
{
	const char *link;
	uint32_t vers = 1;

	ASSERT(MUTEX_HELD(&vdp->vdd_lock));

	/*
	 * Copy all of the members out to userland.
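	 *
	 * Note that this is done member by member with ddi_copyout() rather
	 * than as one bulk structure copy: arg points directly at the
	 * consumer's vnd_ioc_info_t, so we only ever store to the fields we
	 * fill in, and passing mode & FKIOCTL lets the same path serve both
	 * user and kernel callers.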
*/ if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t), mode & FKIOCTL) != 0) return (EFAULT); if (vdp->vdd_flags & VND_D_LINKED) link = vdp->vdd_lname; else link = ""; if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name), mode & FKIOCTL) != 0) return (EFAULT); if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink, sizeof (arg->vii_datalink), mode & FKIOCTL) != 0) return (EFAULT); if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone, sizeof (zoneid_t), mode & FKIOCTL) != 0) return (EFAULT); return (0); } static int vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode) { vnd_ioc_list_t vl; vnd_ioc_list32_t vl32; zoneid_t zid; vnd_dev_t *vdp; vnd_ioc_info_t *vip; int found, cancopy, ret; if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t), mode & FKIOCTL) != 0) return (EFAULT); vl.vl_nents = vl32.vl_nents; vl.vl_actents = vl32.vl_actents; vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents; } else { if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t), mode & FKIOCTL) != 0) return (EFAULT); } cancopy = vl.vl_nents; vip = vl.vl_ents; found = 0; zid = crgetzoneid(credp); mutex_enter(&vnd_dev_lock); for (vdp = list_head(&vnd_dev_list); vdp != NULL; vdp = list_next(&vnd_dev_list, vdp)) { mutex_enter(&vdp->vdd_lock); if (vdp->vdd_flags & VND_D_ATTACHED && !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) && (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) { found++; if (cancopy > 0) { ret = vnd_ioctl_list_copy_info(vdp, vip, mode); if (ret != 0) { mutex_exit(&vdp->vdd_lock); mutex_exit(&vnd_dev_lock); return (ret); } cancopy--; vip++; } } mutex_exit(&vdp->vdd_lock); } mutex_exit(&vnd_dev_lock); if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents, sizeof (uint_t), mode & FKIOCTL) != 0) return (EFAULT); return (0); } /* ARGSUSED */ static int vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { int ret; minor_t m; vnd_dev_t *vdp; m = getminor(dev); ASSERT(m != 0); /* * Make sure no one has come in on an ioctl from the strioc case. */ if ((cmd & VND_STRIOC) == VND_STRIOC) return (ENOTTY); /* * Like close, seems like if this minor isn't found, it's a programmer * error somehow. 
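	 *
	 * Each command below is additionally gated on the FREAD/FWRITE mode
	 * bits of the descriptor: ioctls that only observe state require
	 * FREAD, ioctls that change state require FWRITE, and anything else
	 * fails with EBADF.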
 */
	vdp = vnd_dev_lookup(m);
	if (vdp == NULL)
		return (ENXIO);

	switch (cmd) {
	case VND_IOC_ATTACH:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_attach(vdp, arg, credp, mode);
		break;
	case VND_IOC_LINK:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_link(vdp, arg, credp, mode);
		break;
	case VND_IOC_UNLINK:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_unlink(vdp, arg, credp, mode);
		break;
	case VND_IOC_GETRXBUF:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_getrxbuf(vdp, arg, mode);
		break;
	case VND_IOC_SETRXBUF:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_setrxbuf(vdp, arg, mode);
		break;
	case VND_IOC_GETTXBUF:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_gettxbuf(vdp, arg, mode);
		break;
	case VND_IOC_SETTXBUF:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_settxbuf(vdp, arg, mode);
		break;
	case VND_IOC_GETMAXBUF:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		if (crgetzoneid(credp) != GLOBAL_ZONEID) {
			ret = EPERM;
			break;
		}
		ret = vnd_ioctl_getmaxbuf(vdp, arg, mode);
		break;
	case VND_IOC_GETMINTU:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE);
		break;
	case VND_IOC_GETMAXTU:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE);
		break;
	case VND_IOC_FRAMEIO_READ:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_frameio_read(vdp, arg, mode);
		break;
	case VND_IOC_FRAMEIO_WRITE:
		if (!(mode & FWRITE)) {
			ret = EBADF;
			break;
		}
		ret = vnd_frameio_write(vdp, arg, mode);
		break;
	case VND_IOC_LIST:
		if (!(mode & FREAD)) {
			ret = EBADF;
			break;
		}
		ret = vnd_ioctl_list(arg, credp, mode);
		break;
	default:
		ret = ENOTTY;
		break;
	}

	vnd_dev_rele(vdp);
	return (ret);
}

static int
vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	vnd_dev_t *vdp;
	minor_t m;
	zoneid_t zid;

	if (flag & (FEXCL | FNDELAY))
		return (ENOTSUP);

	if (otyp & OTYP_BLK)
		return (ENOTSUP);

	zid = crgetzoneid(credp);
	m = getminor(*devp);

	/*
	 * If we have an open of a non-zero instance then we need to look that
	 * up in our list of entries.
	 */
	if (m != 0) {
		/*
		 * We don't check for rawaccess globally as a user could be
		 * doing a list ioctl on the control node which doesn't require
		 * this privilege.
		 */
		if (secpolicy_net_rawaccess(credp) != 0)
			return (EPERM);

		vdp = vnd_dev_lookup(m);
		if (vdp == NULL)
			return (ENOENT);

		/*
		 * We need to check to make sure that the user is allowed to
		 * open this node. At this point it should be an attached
		 * handle as that's all we're allowed to access.
		 */
		mutex_enter(&vdp->vdd_lock);
		if (!(vdp->vdd_flags & VND_D_LINKED)) {
			mutex_exit(&vdp->vdd_lock);
			vnd_dev_rele(vdp);
			return (ENOENT);
		}

		if (vdp->vdd_flags & VND_D_ZONE_DYING) {
			mutex_exit(&vdp->vdd_lock);
			vnd_dev_rele(vdp);
			return (ENOENT);
		}

		if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) {
			mutex_exit(&vdp->vdd_lock);
			vnd_dev_rele(vdp);
			return (ENOENT);
		}

		if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) {
			mutex_exit(&vdp->vdd_lock);
			vnd_dev_rele(vdp);
			return (EBUSY);
		}

		if (!(vdp->vdd_flags & VND_D_OPENED)) {
			vdp->vdd_flags |= VND_D_OPENED;
			vdp->vdd_ref++;
			DTRACE_VND_REFINC(vdp);
		}
		mutex_exit(&vdp->vdd_lock);
		vnd_dev_rele(vdp);

		return (0);
	}

	if (flag & FEXCL)
		return (ENOTSUP);

	/*
	 * We need to clone ourselves and set up a new state.
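	 *
	 * From a consumer's perspective, this is the traditional self-cloning
	 * pattern (a rough illustration, not part of the driver):
	 *
	 *	fd = open("/dev/vnd/ctl", O_RDWR);
	 *
	 * lands here with a minor number of zero. We allocate a fresh
	 * vnd_dev_t, assign it a new minor from the vnd_minors id space, and
	 * rewrite *devp so that the consumer's descriptor refers to the new
	 * instance.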
*/ vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP); bzero(vdp, sizeof (vnd_dev_t)); if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) { kmem_cache_free(vnd_dev_cache, vdp); return (EINVAL); } vdp->vdd_minor = id_alloc(vnd_minors); mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL); list_link_init(&vdp->vdd_link); vdp->vdd_ref = 1; *devp = makedevice(getmajor(*devp), vdp->vdd_minor); vdp->vdd_devid = *devp; DTRACE_VND_REFINC(vdp); vdp->vdd_flags |= VND_D_OPENED; mutex_enter(&vnd_dev_lock); list_insert_head(&vnd_dev_list, vdp); mutex_exit(&vnd_dev_lock); return (0); } /* ARGSUSED */ static int vnd_close(dev_t dev, int flag, int otyp, cred_t *credp) { minor_t m; vnd_dev_t *vdp; m = getminor(dev); if (m == 0) return (ENXIO); vdp = vnd_dev_lookup(m); if (vdp == NULL) return (ENXIO); mutex_enter(&vdp->vdd_lock); VERIFY(vdp->vdd_flags & VND_D_OPENED); vdp->vdd_flags &= ~VND_D_OPENED; mutex_exit(&vdp->vdd_lock); /* Remove the hold from the previous open. */ vnd_dev_rele(vdp); /* And now from lookup */ vnd_dev_rele(vdp); return (0); } /* ARGSUSED */ static int vnd_read(dev_t dev, struct uio *uiop, cred_t *credp) { int nonblock, error = 0; size_t mpsize; vnd_dev_t *vdp; vnd_data_queue_t *vqp; mblk_t *mp = NULL; offset_t u_loffset; /* * If we have more than one uio we refuse to do anything. That's for * frameio. */ if (uiop->uio_iovcnt > 1) return (EINVAL); vdp = vnd_dev_lookup(getminor(dev)); if (vdp == NULL) return (ENXIO); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vnd_dev_rele(vdp); return (ENXIO); } mutex_exit(&vdp->vdd_lock); nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); vqp = &vdp->vdd_str->vns_dq_read; mutex_enter(&vqp->vdq_lock); /* Check empty case */ if (vqp->vdq_cur == 0) { if (nonblock != 0) { error = EWOULDBLOCK; goto err; } while (vqp->vdq_cur == 0) { if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { error = EINTR; goto err; } } } /* Ensure our buffer is big enough */ mp = vqp->vdq_head; ASSERT(mp != NULL); mpsize = msgsize(mp); if (mpsize > uiop->uio_resid) { error = EOVERFLOW; goto err; } u_loffset = uiop->uio_loffset; while (mp != NULL) { if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) { error = EFAULT; uiop->uio_loffset = u_loffset; mp = NULL; goto err; } mpsize -= MBLKL(mp); mp = mp->b_cont; } ASSERT(mpsize == 0); (void) vnd_dq_pop(vqp, &mp); freemsg(mp); err: mutex_exit(&vqp->vdq_lock); vnd_dev_rele(vdp); return (error); } /* ARGSUSED */ static int vnd_write(dev_t dev, struct uio *uiop, cred_t *credp) { int nonblock, error; vnd_dev_t *vdp; mblk_t *mp; ssize_t iosize, origsize; vnd_data_queue_t *vqp; if (uiop->uio_iovcnt > 1) return (EINVAL); vdp = vnd_dev_lookup(getminor(dev)); if (vdp == NULL) return (ENXIO); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vnd_dev_rele(vdp); return (ENXIO); } mutex_exit(&vdp->vdd_lock); nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); mutex_enter(&vdp->vdd_str->vns_lock); if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite || uiop->uio_resid < vdp->vdd_str->vns_minwrite) { mutex_exit(&vdp->vdd_str->vns_lock); vnd_dev_rele(vdp); return (ERANGE); } mutex_exit(&vdp->vdd_str->vns_lock); VERIFY(vdp->vdd_str != NULL); /* * Reserve space in the data queue if we can. If we can't, block or * return EAGAIN. If we can, go and squeue_enter. 
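	 *
	 * In outline (a sketch of the flow below, not additional logic):
	 *
	 *	vnd_dq_reserve() fails
	 *	    -> non-blocking: return EAGAIN
	 *	    -> blocking: cv_wait_sig() and retry
	 *	vnd_dq_reserve() succeeds
	 *	    -> strmakedata() copies the data into an mblk chain
	 *	    -> gsqueue_enter_one() hands it to the transmit path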
*/ vqp = &vdp->vdd_str->vns_dq_write; mutex_enter(&vqp->vdq_lock); while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) { if (nonblock != 0) { mutex_exit(&vqp->vdq_lock); vnd_dev_rele(vdp); return (EAGAIN); } if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { mutex_exit(&vqp->vdq_lock); vnd_dev_rele(vdp); return (EINTR); } } mutex_exit(&vqp->vdq_lock); /* * Now that we've reserved the space, try to allocate kernel space for * and copy in the block. To take care of all this we use the * strmakedata subroutine for now. */ origsize = iosize = uiop->uio_resid; error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0, &mp); /* * strmakedata() will return an error or it may only consume a portion * of the data. */ if (error != 0 || uiop->uio_resid != 0) { vnd_dq_unreserve(vqp, origsize); cv_broadcast(&vqp->vdq_ready); pollwakeup(&vdp->vdd_ph, POLLOUT); vnd_dev_rele(vdp); return (ENOSR); } gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, VND_SQUEUE_TAG_VND_WRITE); vnd_dev_rele(vdp); return (0); } static int vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { short ready = 0; vnd_dev_t *vdp; vnd_data_queue_t *vqp; vdp = vnd_dev_lookup(getminor(dev)); if (vdp == NULL) return (ENXIO); mutex_enter(&vdp->vdd_lock); if (!(vdp->vdd_flags & VND_D_ATTACHED)) { mutex_exit(&vdp->vdd_lock); vnd_dev_rele(vdp); return (ENXIO); } mutex_exit(&vdp->vdd_lock); if ((events & POLLIN) || (events & POLLRDNORM)) { vqp = &vdp->vdd_str->vns_dq_read; mutex_enter(&vqp->vdq_lock); if (vqp->vdq_head != NULL) ready |= events & (POLLIN | POLLRDNORM); mutex_exit(&vqp->vdq_lock); } if (events & POLLOUT) { vqp = &vdp->vdd_str->vns_dq_write; mutex_enter(&vqp->vdq_lock); if (vqp->vdq_cur != vqp->vdq_max) ready |= POLLOUT; mutex_exit(&vqp->vdq_lock); } if ((ready == 0 && !anyyet) || (events & POLLET)) { *phpp = &vdp->vdd_ph; } *reventsp = ready; vnd_dev_rele(vdp); return (0); } /* ARGSUSED */ static void * vnd_stack_init(netstackid_t stackid, netstack_t *ns) { vnd_pnsd_t *nsp; nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP); bzero(nsp, sizeof (*nsp)); nsp->vpnd_nsid = stackid; nsp->vpnd_zid = netstackid_to_zoneid(stackid); nsp->vpnd_flags = 0; mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL); list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t), offsetof(vnd_dev_t, vdd_nslink)); if (vnd_netinfo_init(nsp) == 0) nsp->vpnd_hooked = B_TRUE; mutex_enter(&vnd_dev_lock); list_insert_tail(&vnd_nsd_list, nsp); mutex_exit(&vnd_dev_lock); return (nsp); } /* ARGSUSED */ static void vnd_stack_shutdown(netstackid_t stackid, void *arg) { vnd_pnsd_t *nsp = arg; vnd_dev_t *vdp; ASSERT(nsp != NULL); /* * After shut down no one should be able to find their way to this * netstack again. */ mutex_enter(&vnd_dev_lock); list_remove(&vnd_nsd_list, nsp); mutex_exit(&vnd_dev_lock); /* * Make sure hooks know that they're going away. */ if (nsp->vpnd_hooked == B_TRUE) vnd_netinfo_shutdown(nsp); /* * Now we need to go through and notify each zone that they are in * teardown phase. See the big theory statement section on vnd, zones, * netstacks, and sdev for more information about this. 
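	 *
	 * In rough outline, the teardown work is split across the two
	 * netstack callbacks:
	 *
	 *	shutdown (here): condemn the netstack, mark each device
	 *			 VND_D_ZONE_DYING, and forcibly unlink it
	 *	destroy:	 wait for vpnd_ref to drain to zero, then
	 *			 free the per-netstack state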
 */
	mutex_enter(&nsp->vpnd_lock);
	nsp->vpnd_flags |= VND_NS_CONDEMNED;
	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
		mutex_enter(&vdp->vdd_lock);
		if (!(vdp->vdd_flags & VND_D_CONDEMNED))
			vdp->vdd_flags |= VND_D_ZONE_DYING;
		mutex_exit(&vdp->vdd_lock);
	}
	mutex_exit(&nsp->vpnd_lock);

	/*
	 * Next we remove all the links as we know nothing new can be added to
	 * the list and that none of the extant devices can obtain additional
	 * links.
	 */
restart:
	mutex_enter(&nsp->vpnd_lock);
	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
		mutex_enter(&vdp->vdd_lock);
		if ((vdp->vdd_flags & VND_D_CONDEMNED) ||
		    !(vdp->vdd_flags & VND_D_LINKED)) {
			mutex_exit(&vdp->vdd_lock);
			continue;
		}

		/*
		 * We drop our lock here and restart afterwards. Note that as
		 * part of unlinking we end up doing a rele of the vnd_dev_t.
		 * If this is the final hold on the vnd_dev_t then it might try
		 * and remove itself. Our locking rules require that we not
		 * hold any locks when we call any of the rele functions.
		 *
		 * Note that the unlink function requires holders to call into
		 * it with the vnd_dev_t->vdd_lock held and will take care of
		 * it for us. Because we don't have a hold on it, we're done at
		 * this point.
		 */
		mutex_exit(&nsp->vpnd_lock);
		/* Forcibly unlink */
		vnd_dev_unlink(vdp);
		goto restart;
	}
	mutex_exit(&nsp->vpnd_lock);
}

/* ARGSUSED */
static void
vnd_stack_destroy(netstackid_t stackid, void *arg)
{
	vnd_pnsd_t *nsp = arg;

	ASSERT(nsp != NULL);

	/*
	 * Now that we've unlinked everything, we just have to wait for things
	 * to finish exiting. Since it's no longer the kernel itself holding
	 * things open, we only need to wait for our reference count to reach
	 * zero, and then we're free. If the global zone is holding open a
	 * reference to a vnd device for another zone, that's bad, but there's
	 * nothing much we can do. See the section on 'vnd, zones, netstacks'
	 * in the big theory statement for more information.
	 */
	mutex_enter(&nsp->vpnd_lock);
	while (nsp->vpnd_ref != 0)
		cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock);
	mutex_exit(&nsp->vpnd_lock);

	/*
	 * We removed ourselves from the list during shutdown, and now that we
	 * have no more references we can safely destroy everything that we
	 * had sitting around.
	 */
	if (nsp->vpnd_hooked == B_TRUE)
		vnd_netinfo_fini(nsp);

	mutex_destroy(&nsp->vpnd_lock);
	list_destroy(&nsp->vpnd_dev_list);
	kmem_cache_free(vnd_pnsd_cache, nsp);
}

/*
 * Convert a node with a name of the form /dev/vnd/zone/%zonename or
 * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack.
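 *
 * For example (with a hypothetical zone "testzone" and link "net0"):
 *
 *	/dev/vnd/zone/testzone		(VDIR) -> look up "testzone"
 *	/dev/vnd/zone/testzone/net0	(VCHR) -> strip "/net0" first, then
 *					look up "testzone"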
 */
static vnd_pnsd_t *
vnd_sdev_ctx_to_ns(sdev_ctx_t ctx)
{
	enum vtype vt;
	const char *path = sdev_ctx_path(ctx);
	char *zstart, *dup;
	size_t duplen;
	vnd_pnsd_t *nsp;

	vt = sdev_ctx_vtype(ctx);
	ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0);

	if (vt == VDIR) {
		zstart = strrchr(path, '/');
		ASSERT(zstart != NULL);
		zstart++;
		return (vnd_nsd_lookup_by_zonename(zstart));
	}

	ASSERT(vt == VCHR);

	dup = strdup(path);
	duplen = strlen(dup) + 1;
	zstart = strrchr(dup, '/');
	*zstart = '\0';
	zstart--;
	zstart = strrchr(dup, '/');
	zstart++;
	nsp = vnd_nsd_lookup_by_zonename(zstart);
	kmem_free(dup, duplen);

	return (nsp);
}

static sdev_plugin_validate_t
vnd_sdev_validate_dir(sdev_ctx_t ctx)
{
	vnd_pnsd_t *nsp;

	if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0)
		return (SDEV_VTOR_VALID);

	if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) {
		ASSERT(getzoneid() == GLOBAL_ZONEID);
		ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL);
		return (SDEV_VTOR_VALID);
	}

	nsp = vnd_sdev_ctx_to_ns(ctx);
	if (nsp == NULL)
		return (SDEV_VTOR_INVALID);
	vnd_nsd_rele(nsp);

	return (SDEV_VTOR_VALID);
}

static sdev_plugin_validate_t
vnd_sdev_validate(sdev_ctx_t ctx)
{
	enum vtype vt;
	vnd_dev_t *vdp;
	minor_t minor;

	vt = sdev_ctx_vtype(ctx);
	if (vt == VDIR)
		return (vnd_sdev_validate_dir(ctx));
	ASSERT(vt == VCHR);

	if (strcmp("ctl", sdev_ctx_name(ctx)) == 0)
		return (SDEV_VTOR_VALID);

	if (sdev_ctx_minor(ctx, &minor) != 0)
		return (SDEV_VTOR_STALE);

	vdp = vnd_dev_lookup(minor);
	if (vdp == NULL)
		return (SDEV_VTOR_STALE);

	mutex_enter(&vdp->vdd_lock);
	if (!(vdp->vdd_flags & VND_D_LINKED) ||
	    (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
		mutex_exit(&vdp->vdd_lock);
		vnd_dev_rele(vdp);
		return (SDEV_VTOR_STALE);
	}

	if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) {
		mutex_exit(&vdp->vdd_lock);
		vnd_dev_rele(vdp);
		return (SDEV_VTOR_STALE);
	}

	mutex_exit(&vdp->vdd_lock);
	vnd_dev_rele(vdp);
	return (SDEV_VTOR_VALID);
}

/*
 * This function is a no-op. sdev never has holds on our devices as they can go
 * away at any time and specfs has to deal with that fact.
 */
/* ARGSUSED */
static void
vnd_sdev_inactive(sdev_ctx_t ctx)
{
}

static int
vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx)
{
	int ret;
	vnd_dev_t *vdp;

	mutex_enter(&nsp->vpnd_lock);
	for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL;
	    vdp = list_next(&nsp->vpnd_dev_list, vdp)) {
		mutex_enter(&vdp->vdd_lock);
		if ((vdp->vdd_flags & VND_D_LINKED) &&
		    !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) {
			ret = sdev_plugin_mknod(ctx, vdp->vdd_lname,
			    VND_SDEV_MODE, vdp->vdd_devid);
			if (ret != 0 && ret != EEXIST) {
				mutex_exit(&vdp->vdd_lock);
				mutex_exit(&nsp->vpnd_lock);
				vnd_nsd_rele(nsp);
				return (ret);
			}
		}
		mutex_exit(&vdp->vdd_lock);
	}
	mutex_exit(&nsp->vpnd_lock);

	return (0);
}

static int
vnd_sdev_filldir_root(sdev_ctx_t ctx)
{
	zoneid_t zid;
	vnd_pnsd_t *nsp;
	int ret;

	zid = getzoneid();
	nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid));
	ASSERT(nsp != NULL);
	ret = vnd_sdev_fillzone(nsp, ctx);
	vnd_nsd_rele(nsp);
	if (ret != 0)
		return (ret);

	/*
	 * Checking the zone id is not sufficient as the global zone could be
	 * reaching down into a non-global zone's mounted /dev.
	 */
	if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) {
		ret = sdev_plugin_mkdir(ctx, "zone");
		if (ret != 0 && ret != EEXIST)
			return (ret);
	}

	/*
	 * Always add an entry for the control node. There's no need to take a
	 * hold on it since it always exists and is always what we clone from.
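	 *
	 * Minor number 0 is reserved for the control node itself, which is
	 * why it is created with makedevice(..., 0) below and why vnd_open()
	 * treats an open of minor zero as a request to clone a new instance.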
*/ ret = sdev_plugin_mknod(ctx, "ctl", VND_SDEV_MODE, makedevice(ddi_driver_major(vnd_dip), 0)); if (ret != 0 && ret != EEXIST) return (ret); return (0); } static int vnd_sdev_filldir_zroot(sdev_ctx_t ctx) { int ret; vnd_pnsd_t *nsp; zone_t *zonep; ASSERT(getzoneid() == GLOBAL_ZONEID); ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); mutex_enter(&vnd_dev_lock); for (nsp = list_head(&vnd_nsd_list); nsp != NULL; nsp = list_next(&vnd_nsd_list, nsp)) { mutex_enter(&nsp->vpnd_lock); if (list_is_empty(&nsp->vpnd_dev_list)) { mutex_exit(&nsp->vpnd_lock); continue; } mutex_exit(&nsp->vpnd_lock); zonep = zone_find_by_id(nsp->vpnd_zid); /* * This zone must be being torn down, so skip it. */ if (zonep == NULL) continue; ret = sdev_plugin_mkdir(ctx, zonep->zone_name); zone_rele(zonep); if (ret != 0 && ret != EEXIST) { mutex_exit(&vnd_dev_lock); return (ret); } } mutex_exit(&vnd_dev_lock); return (0); } static int vnd_sdev_filldir(sdev_ctx_t ctx) { int ret; vnd_pnsd_t *nsp; ASSERT(sdev_ctx_vtype(ctx) == VDIR); if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0) return (vnd_sdev_filldir_root(ctx)); if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0) return (vnd_sdev_filldir_zroot(ctx)); ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx), strlen(VND_SDEV_ZROOT)) == 0); nsp = vnd_sdev_ctx_to_ns(ctx); if (nsp == NULL) return (0); ret = vnd_sdev_fillzone(nsp, ctx); vnd_nsd_rele(nsp); return (ret); } static sdev_plugin_ops_t vnd_sdev_ops = { SDEV_PLUGIN_VERSION, SDEV_PLUGIN_SUBDIR, vnd_sdev_validate, vnd_sdev_filldir, vnd_sdev_inactive }; static int vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { int errp = 0; if (cmd != DDI_ATTACH) return (DDI_FAILURE); /* * Only allow one instance. */ if (vnd_dip != NULL) return (DDI_FAILURE); vnd_dip = dip; if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { vnd_dip = NULL; return (DDI_FAILURE); } if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { ddi_remove_minor_node(vnd_dip, NULL); vnd_dip = NULL; return (DDI_FAILURE); } vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops, &errp); if (vnd_sdev_hdl == (sdev_plugin_hdl_t)NULL) { ddi_remove_minor_node(vnd_dip, NULL); ddi_prop_remove_all(vnd_dip); vnd_dip = NULL; return (DDI_FAILURE); } vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY); return (DDI_SUCCESS); } /* ARGSUSED */ static int vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { if (cmd != DDI_DETACH) return (DDI_FAILURE); mutex_enter(&vnd_dev_lock); if (!list_is_empty(&vnd_dev_list)) { mutex_exit(&vnd_dev_lock); return (DDI_FAILURE); } mutex_exit(&vnd_dev_lock); return (DDI_FAILURE); } /* ARGSUSED */ static int vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) { int error; switch (cmd) { case DDI_INFO_DEVT2DEVINFO: *result = (void *)vnd_dip; error = DDI_SUCCESS; break; case DDI_INFO_DEVT2INSTANCE: *result = (void *)0; error = DDI_SUCCESS; break; default: error = DDI_FAILURE; break; } return (error); } static void vnd_ddi_fini(void) { netstack_unregister(NS_VND); if (vnd_taskq != NULL) taskq_destroy(vnd_taskq); if (vnd_str_cache != NULL) kmem_cache_destroy(vnd_str_cache); if (vnd_dev_cache != NULL) kmem_cache_destroy(vnd_dev_cache); if (vnd_pnsd_cache != NULL) kmem_cache_destroy(vnd_pnsd_cache); if (vnd_minors != NULL) id_space_destroy(vnd_minors); if (vnd_list_init != 0) { list_destroy(&vnd_nsd_list); list_destroy(&vnd_dev_list); mutex_destroy(&vnd_dev_lock); vnd_list_init = 0; } frameio_fini(); } static int vnd_ddi_init(void) { 
	if (frameio_init() != 0)
		return (DDI_FAILURE);

	vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	if (vnd_str_cache == NULL) {
		frameio_fini();
		return (DDI_FAILURE);
	}

	vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	if (vnd_dev_cache == NULL) {
		kmem_cache_destroy(vnd_str_cache);
		frameio_fini();
		return (DDI_FAILURE);
	}

	vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache",
	    sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	if (vnd_pnsd_cache == NULL) {
		kmem_cache_destroy(vnd_dev_cache);
		kmem_cache_destroy(vnd_str_cache);
		frameio_fini();
		return (DDI_FAILURE);
	}

	vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0);
	if (vnd_taskq == NULL) {
		kmem_cache_destroy(vnd_pnsd_cache);
		kmem_cache_destroy(vnd_dev_cache);
		kmem_cache_destroy(vnd_str_cache);
		frameio_fini();
		return (DDI_FAILURE);
	}

	vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX);
	if (vnd_minors == NULL) {
		taskq_destroy(vnd_taskq);
		kmem_cache_destroy(vnd_pnsd_cache);
		kmem_cache_destroy(vnd_dev_cache);
		kmem_cache_destroy(vnd_str_cache);
		frameio_fini();
		return (DDI_FAILURE);
	}

	mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL);
	list_create(&vnd_dev_list, sizeof (vnd_dev_t),
	    offsetof(vnd_dev_t, vdd_link));
	list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t),
	    offsetof(vnd_pnsd_t, vpnd_link));
	vnd_list_init = 1;

	netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown,
	    vnd_stack_destroy);

	return (DDI_SUCCESS);
}

static struct module_info vnd_minfo = {
	0,		/* module id */
	"vnd",		/* module name */
	1,		/* smallest packet size */
	INFPSZ,		/* largest packet size (infinite) */
	1,		/* high watermark */
	0		/* low watermark */
};

static struct qinit vnd_r_qinit = {
	vnd_s_rput,
	NULL,
	vnd_s_open,
	vnd_s_close,
	NULL,
	&vnd_minfo,
	NULL
};

static struct qinit vnd_w_qinit = {
	vnd_s_wput,
	NULL,
	NULL,
	NULL,
	NULL,
	&vnd_minfo,
	NULL
};

static struct streamtab vnd_strtab = {
	&vnd_r_qinit,
	&vnd_w_qinit,
	NULL,
	NULL
};

static struct cb_ops vnd_cb_ops = {
	vnd_open,		/* open */
	vnd_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	vnd_read,		/* read */
	vnd_write,		/* write */
	vnd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	vnd_chpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	NULL,			/* streamtab */
	D_MP			/* Driver compatibility flag */
};

static struct dev_ops vnd_dev_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	vnd_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	vnd_attach,		/* attach */
	vnd_detach,		/* detach */
	nodev,			/* reset */
	&vnd_cb_ops,		/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed	/* quiesce */
};

static struct modldrv vnd_modldrv = {
	&mod_driverops,
	"Virtual Networking Datapath Driver",
	&vnd_dev_ops
};

static struct fmodsw vnd_fmodfsw = {
	"vnd",
	&vnd_strtab,
	D_NEW | D_MP
};

static struct modlstrmod vnd_modlstrmod = {
	&mod_strmodops,
	"Virtual Networking Datapath Driver",
	&vnd_fmodfsw
};

static struct modlinkage vnd_modlinkage = {
	MODREV_1,
	&vnd_modldrv,
	&vnd_modlstrmod,
	NULL
};

int
_init(void)
{
	int error;

	/*
	 * We need to do all of our global initialization in _init as opposed
	 * to attach and detach. The problem here is that because vnd can be
	 * used from a stream context while being detached, we cannot rely on
	 * having run attach to create everything, alas. So it goes in _init,
	 * just like our friend ip.
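	 *
	 * The resulting pairing is simply:
	 *
	 *	_init:	vnd_ddi_init(), then mod_install(), undoing the
	 *		former if the latter fails
	 *	_fini:	mod_remove(), then vnd_ddi_fini() only once the
	 *		module is known to be unreferenced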
*/ if ((error = vnd_ddi_init()) != DDI_SUCCESS) return (error); error = mod_install((&vnd_modlinkage)); if (error != 0) vnd_ddi_fini(); return (error); } int _info(struct modinfo *modinfop) { return (mod_info(&vnd_modlinkage, modinfop)); } int _fini(void) { int error; error = mod_remove(&vnd_modlinkage); if (error == 0) vnd_ddi_fini(); return (error); }